Skip to main content

spg_engine/
lib.rs

//! SPG execution engine — wires the SQL front-end to the in-memory storage
//! layer. The first wired release (v0.3) implemented only `CREATE TABLE`,
//! single-row `INSERT VALUES`, and `SELECT * FROM <table>`, with WHERE planned
//! for v0.4 alongside expression evaluation against rows. Capabilities added
//! since then are tagged with their `vN.M` release in the item docs below.
5#![no_std]
6
7extern crate alloc;
8
9pub mod aggregate;
10pub mod describe;
11pub mod eval;
12pub mod json;
13pub mod memoize;
14pub mod plan_cache;
15pub mod publications;
16pub mod query_stats;
17pub mod reorder;
18pub mod selectivity;
19pub mod statistics;
20pub mod subscriptions;
21pub mod users;
22
23pub use crate::users::{Role, ScramSecrets, UserError, UserStore};
24
25use alloc::borrow::Cow;
26use alloc::boxed::Box;
27use alloc::collections::BTreeMap;
28use alloc::string::{String, ToString};
29use alloc::vec::Vec;
30use core::fmt;
31
32use spg_sql::ast::{
33    BinOp, ColumnDef, ColumnName, ColumnTypeName, CreateIndexStatement,
34    CreatePublicationStatement, CreateSubscriptionStatement, CreateTableStatement,
35    CreateUserStatement, Expr, FrameBound, FrameKind, FromClause, IndexMethod, InsertStatement,
36    JoinKind, Literal, OrderBy, SelectItem, SelectStatement, Statement, UnOp, UnionKind,
37    VecEncoding as SqlVecEncoding, WindowFrame,
38};
39use spg_sql::parser::{self, ParseError};
40use spg_storage::{
41    Catalog, ColumnSchema, CompactReport, DataType, IndexKey, IndexKind, Row, StorageError, Table,
42    TableSchema, Value, VecEncoding,
43};
44
45use crate::eval::{EvalContext, EvalError};
46
47/// Result of executing one statement.
48#[derive(Debug, Clone, PartialEq)]
49#[non_exhaustive]
50pub enum QueryResult {
51    /// DDL or DML succeeded.
52    ///
53    /// `affected` is the row count for `INSERT` and 0 elsewhere.
54    /// `modified_catalog` tells the server whether this statement
55    /// caused the *committed* catalog to change — it's the signal to
56    /// snapshot/audit. False for `BEGIN`/`ROLLBACK`, false for writeful
57    /// statements executed inside a transaction (those only touch the
58    /// shadow), and true for `COMMIT` and for writes outside a TX.
59    CommandOk {
60        affected: usize,
61        modified_catalog: bool,
62    },
63    /// `SELECT` returned a (possibly empty) row set.
64    Rows {
65        columns: Vec<ColumnSchema>,
66        rows: Vec<Row>,
67    },
68}
69
70/// All errors the engine can return.
71///
72/// Marked `#[non_exhaustive]` from v7.5.0 onward: external `match`
73/// must include a `_` arm so new variants in subsequent v7.x releases
74/// are not breaking changes.
75#[derive(Debug, Clone, PartialEq)]
76#[non_exhaustive]
77pub enum EngineError {
78    Parse(ParseError),
79    Storage(StorageError),
80    Eval(EvalError),
81    /// Front-end accepted a construct that the v0.x executor doesn't support.
82    Unsupported(String),
83    /// `BEGIN` while another transaction is already open.
84    TransactionAlreadyOpen,
85    /// `COMMIT` / `ROLLBACK` with no active transaction.
86    NoActiveTransaction,
87    /// v4.0 sentinel: `execute_readonly` got a statement that
88    /// mutates engine state (INSERT / CREATE / BEGIN / COMMIT / …).
89    /// The caller should retake the write lock and dispatch through
90    /// `execute(&mut self)` instead.
91    WriteRequired,
92    /// v4.2: a SELECT would have returned more rows than the
93    /// configured `max_query_rows` cap. Carries the cap.
94    RowLimitExceeded(usize),
95    /// v4.5: cooperative cancellation — the host (server's
96    /// per-query watchdog) set the cancel flag while a long-running
97    /// SELECT / UPDATE / DELETE was scanning rows. The partial work
98    /// is discarded; the caller should surface this as a timeout
99    /// to the client.
100    Cancelled,
101}
102
103impl fmt::Display for EngineError {
104    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105        match self {
106            Self::Parse(e) => write!(f, "parse: {e}"),
107            Self::Storage(e) => write!(f, "storage: {e}"),
108            Self::Eval(e) => write!(f, "eval: {e}"),
109            Self::Unsupported(s) => write!(f, "unsupported: {s}"),
110            Self::TransactionAlreadyOpen => f.write_str("a transaction is already open"),
111            Self::NoActiveTransaction => f.write_str("no active transaction"),
112            Self::WriteRequired => {
113                f.write_str("statement requires a write lock (use execute, not execute_readonly)")
114            }
115            Self::RowLimitExceeded(n) => {
116                write!(f, "query exceeded max_query_rows={n}")
117            }
118            Self::Cancelled => f.write_str("query cancelled (timeout or client request)"),
119        }
120    }
121}
122
123impl From<ParseError> for EngineError {
124    fn from(e: ParseError) -> Self {
125        Self::Parse(e)
126    }
127}
128impl From<StorageError> for EngineError {
129    fn from(e: StorageError) -> Self {
130        Self::Storage(e)
131    }
132}
133impl From<EvalError> for EngineError {
134    fn from(e: EvalError) -> Self {
135        Self::Eval(e)
136    }
137}
138
// NOTE: the paragraph below describes `Engine` (declared further down in this
// file). It is kept as a plain comment so rustdoc does not merge it into the
// documentation of `ClockFn`.
//
// The execution engine. Holds the catalog and (later) other server-scope
// state. `Engine::new()` is intentionally cheap so callers can construct one
// per database, per test.

/// Function pointer that returns "now" as microseconds since Unix
/// epoch. The engine is `no_std`, so it can't reach for `std::time`
/// itself — callers (`spg-server`, the sqllogictest runner) inject a
/// concrete implementation. `None` means `NOW()` / `CURRENT_*` raise
/// `Unsupported`.
pub type ClockFn = fn() -> i64;
148
/// Host-supplied source of 16 cryptographically random bytes.
///
/// As with `ClockFn`, the `no_std` engine cannot read /dev/urandom itself,
/// so the host (`spg-server`) injects an OS-backed implementation. When no
/// function is installed (`None`), SQL-driven `CREATE USER` falls back to a
/// deterministic salt derived from the username — acceptable in tests; the
/// server always installs a real RNG so production paths never hit the
/// fallback.
pub type SaltFn = fn() -> [u8; 16];
156
/// v4.5 cooperative cancellation token.
///
/// Long-running SELECT / UPDATE / DELETE loops poll `is_cancelled` at
/// row-loop checkpoints and bail out with `EngineError::Cancelled`. The
/// host (`spg-server`) creates one `AtomicBool` per query, spawns a
/// watchdog thread that sets it after `SPG_QUERY_TIMEOUT_MS`, and hands it
/// to `execute_with_cancel` / `execute_readonly_with_cancel`.
///
/// `CancelToken::none()` never fires; the legacy `execute` and
/// `execute_readonly` entry points use it so existing callers don't
/// change.
#[derive(Clone, Copy, Debug)]
pub struct CancelToken<'a> {
    /// Borrowed cancel flag; `None` means the token can never be tripped.
    flag: Option<&'a core::sync::atomic::AtomicBool>,
}
171
172impl<'a> CancelToken<'a> {
173    #[must_use]
174    pub const fn none() -> Self {
175        Self { flag: None }
176    }
177
178    #[must_use]
179    pub const fn from_flag(f: &'a core::sync::atomic::AtomicBool) -> Self {
180        Self { flag: Some(f) }
181    }
182
183    #[must_use]
184    pub fn is_cancelled(self) -> bool {
185        self.flag
186            .is_some_and(|f| f.load(core::sync::atomic::Ordering::Relaxed))
187    }
188
189    /// Returns `Err(Cancelled)` if the token has been tripped.
190    /// Used at row-loop checkpoints to bail cooperatively without
191    /// scattering raw `is_cancelled` checks across the executor.
192    #[inline]
193    pub fn check(self) -> Result<(), EngineError> {
194        if self.is_cancelled() {
195            Err(EngineError::Cancelled)
196        } else {
197            Ok(())
198        }
199    }
200}
201
// ---- snapshot envelope (v4.1, extended with CRC32 in v4.37,        ----
// ----   publications in v6.1.2 v3, subscriptions in v6.1.4 v4,      ----
// ----   statistics in v6.2.0 v5)                                    ----
204//
205// Wraps a catalog blob + a user blob behind a small header so the
206// server can persist both atomically without inventing a new file.
207// Bare catalog blobs (v3.x) still load via `restore_envelope` since
208// the magic check fails fast and the function falls back to
209// `Catalog::deserialize`.
210//
211// Layout — v1 (v4.1, no CRC):
212//   [8 bytes magic "SPGENV01"]
213//   [u8 version = 1]
214//   [u32 catalog_len][catalog bytes]
215//   [u32 users_len][users bytes]
216//
217// Layout — v2 (v4.37, CRC32 of body):
218//   [8 bytes magic "SPGENV01"]
219//   [u8 version = 2]
220//   [u32 catalog_len][catalog bytes]
221//   [u32 users_len][users bytes]
222//   [u32 crc32]                      ← CRC32 of every byte before it.
223//
224// Layout — v3 (v6.1.2, publications trailer):
225//   [8 bytes magic "SPGENV01"]
226//   [u8 version = 3]
227//   [u32 catalog_len][catalog bytes]
228//   [u32 users_len][users bytes]
229//   [u32 pubs_len][publications bytes]
230//   [u32 crc32]
231//
232// Layout — v4 (v6.1.4, subscriptions trailer):
233//   [8 bytes magic "SPGENV01"]
234//   [u8 version = 4]
235//   [u32 catalog_len][catalog bytes]
236//   [u32 users_len][users bytes]
237//   [u32 pubs_len][publications bytes]
238//   [u32 subs_len][subscriptions bytes]
239//   [u32 crc32]
240//
241// Layout — v5 (v6.2.0, statistics trailer):
242//   [8 bytes magic "SPGENV01"]
243//   [u8 version = 5]
244//   [u32 catalog_len][catalog bytes]
245//   [u32 users_len][users bytes]
246//   [u32 pubs_len][publications bytes]
247//   [u32 subs_len][subscriptions bytes]
248//   [u32 stats_len][statistics bytes]      ← NEW
249//   [u32 crc32]
250//
251// Writers emit v5 from v6.2.0 on. Readers accept all of {v1, v2,
252// v3, v4, v5}: v1/v2 load with empty publications / subscriptions /
253// statistics; v3 loads with empty subscriptions + statistics; v4
254// loads with empty statistics; v5 deserialises all three. Older
255// SPG versions reading a v5 envelope fall through the version
256// match to `EnvelopeParse::Bare` — pre-v6.2.0 binaries cannot
257// open v6.2.0+ snapshots (matches the v6.1.2 / v6.1.4 breaks).
258
/// Eight-byte tag that opens every snapshot envelope.
const ENVELOPE_MAGIC: &[u8; 8] = b"SPGENV01";
/// v1 layout: catalog + users, no CRC.
const ENVELOPE_VERSION_V1: u8 = 1;
/// v2 layout: v1 plus a trailing CRC32 of the body.
const ENVELOPE_VERSION_V2: u8 = 2;
/// v3 layout: v2 plus the publications section.
const ENVELOPE_VERSION_V3: u8 = 3;
/// v4 layout: v3 plus the subscriptions section.
const ENVELOPE_VERSION_V4: u8 = 4;
/// v5 layout: v4 plus the statistics section.
const ENVELOPE_VERSION_V5: u8 = 5;
265
266fn build_envelope(
267    catalog: &[u8],
268    users: &[u8],
269    pubs: &[u8],
270    subs: &[u8],
271    stats: &[u8],
272) -> Vec<u8> {
273    let mut out = Vec::with_capacity(
274        8 + 1
275            + 4
276            + catalog.len()
277            + 4
278            + users.len()
279            + 4
280            + pubs.len()
281            + 4
282            + subs.len()
283            + 4
284            + stats.len()
285            + 4,
286    );
287    out.extend_from_slice(ENVELOPE_MAGIC);
288    out.push(ENVELOPE_VERSION_V5);
289    out.extend_from_slice(
290        &u32::try_from(catalog.len())
291            .expect("≤ 4G catalog")
292            .to_le_bytes(),
293    );
294    out.extend_from_slice(catalog);
295    out.extend_from_slice(
296        &u32::try_from(users.len())
297            .expect("≤ 4G users")
298            .to_le_bytes(),
299    );
300    out.extend_from_slice(users);
301    out.extend_from_slice(
302        &u32::try_from(pubs.len())
303            .expect("≤ 4G publications")
304            .to_le_bytes(),
305    );
306    out.extend_from_slice(pubs);
307    out.extend_from_slice(
308        &u32::try_from(subs.len())
309            .expect("≤ 4G subscriptions")
310            .to_le_bytes(),
311    );
312    out.extend_from_slice(subs);
313    out.extend_from_slice(
314        &u32::try_from(stats.len())
315            .expect("≤ 4G statistics")
316            .to_le_bytes(),
317    );
318    out.extend_from_slice(stats);
319    let crc = spg_crypto::crc32::crc32(&out);
320    out.extend_from_slice(&crc.to_le_bytes());
321    out
322}
323
/// Outcome of envelope parsing.
///
/// - `Bare`: the buffer is not a recognisable envelope; the caller falls
///   back to treating it as a bare catalog blob, which preserves v3.x
///   readability.
/// - `Pair`: the sections split out of a valid v1–v5 envelope. The name is
///   historical — it originally carried only catalog + users.
/// - `CrcMismatch`: a v2–v5 envelope whose trailing CRC32 does not match
///   its body, i.e. explicit corruption.
enum EnvelopeParse<'a> {
    Bare,
    Pair {
        /// Serialised catalog bytes (present in every envelope version).
        catalog: &'a [u8],
        /// Serialised user-table bytes (present in every envelope version).
        users: &'a [u8],
        /// Publications section; `None` for v1/v2, `Some` for v3 and later.
        publications: Option<&'a [u8]>,
        /// Subscriptions section; `None` for v1–v3, `Some` for v4 and later.
        subscriptions: Option<&'a [u8]>,
        /// Statistics section; `None` for v1–v4, `Some` for v5.
        statistics: Option<&'a [u8]>,
    },
    CrcMismatch {
        /// CRC32 stored in the envelope trailer.
        expected: u32,
        /// CRC32 recomputed over the bytes preceding the trailer.
        computed: u32,
    },
}
344
345/// Returns `EnvelopeParse::Pair` for a valid v1 / v2 / v3 envelope,
346/// `Bare` for a buffer that doesn't look like an envelope (v3.x
347/// bare catalog fallback), and `CrcMismatch` for a v2/v3 envelope
348/// whose trailing CRC32 doesn't match the body.
349fn split_envelope(buf: &[u8]) -> EnvelopeParse<'_> {
350    if buf.len() < 8 + 1 + 4 || &buf[..8] != ENVELOPE_MAGIC {
351        return EnvelopeParse::Bare;
352    }
353    let version = buf[8];
354    if !matches!(
355        version,
356        ENVELOPE_VERSION_V1
357            | ENVELOPE_VERSION_V2
358            | ENVELOPE_VERSION_V3
359            | ENVELOPE_VERSION_V4
360            | ENVELOPE_VERSION_V5
361    ) {
362        return EnvelopeParse::Bare;
363    }
364    let mut p = 9usize;
365    let Some(cat_len_bytes) = buf.get(p..p + 4) else {
366        return EnvelopeParse::Bare;
367    };
368    let Ok(cat_len_arr) = cat_len_bytes.try_into() else {
369        return EnvelopeParse::Bare;
370    };
371    let cat_len = u32::from_le_bytes(cat_len_arr) as usize;
372    p += 4;
373    if p + cat_len + 4 > buf.len() {
374        return EnvelopeParse::Bare;
375    }
376    let catalog = &buf[p..p + cat_len];
377    p += cat_len;
378    let Some(user_len_bytes) = buf.get(p..p + 4) else {
379        return EnvelopeParse::Bare;
380    };
381    let Ok(user_len_arr) = user_len_bytes.try_into() else {
382        return EnvelopeParse::Bare;
383    };
384    let user_len = u32::from_le_bytes(user_len_arr) as usize;
385    p += 4;
386    if p + user_len > buf.len() {
387        return EnvelopeParse::Bare;
388    }
389    let users = &buf[p..p + user_len];
390    p += user_len;
391    let publications = if matches!(
392        version,
393        ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
394    ) {
395        // [u32 pubs_len][publications bytes]
396        let Some(pubs_len_bytes) = buf.get(p..p + 4) else {
397            return EnvelopeParse::Bare;
398        };
399        let Ok(pubs_len_arr) = pubs_len_bytes.try_into() else {
400            return EnvelopeParse::Bare;
401        };
402        let pubs_len = u32::from_le_bytes(pubs_len_arr) as usize;
403        p += 4;
404        if p + pubs_len > buf.len() {
405            return EnvelopeParse::Bare;
406        }
407        let pubs_slice = &buf[p..p + pubs_len];
408        p += pubs_len;
409        Some(pubs_slice)
410    } else {
411        None
412    };
413    let subscriptions = if matches!(version, ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5) {
414        // [u32 subs_len][subscriptions bytes]
415        let Some(subs_len_bytes) = buf.get(p..p + 4) else {
416            return EnvelopeParse::Bare;
417        };
418        let Ok(subs_len_arr) = subs_len_bytes.try_into() else {
419            return EnvelopeParse::Bare;
420        };
421        let subs_len = u32::from_le_bytes(subs_len_arr) as usize;
422        p += 4;
423        if p + subs_len > buf.len() {
424            return EnvelopeParse::Bare;
425        }
426        let subs_slice = &buf[p..p + subs_len];
427        p += subs_len;
428        Some(subs_slice)
429    } else {
430        None
431    };
432    let statistics = if version == ENVELOPE_VERSION_V5 {
433        // [u32 stats_len][statistics bytes]
434        let Some(stats_len_bytes) = buf.get(p..p + 4) else {
435            return EnvelopeParse::Bare;
436        };
437        let Ok(stats_len_arr) = stats_len_bytes.try_into() else {
438            return EnvelopeParse::Bare;
439        };
440        let stats_len = u32::from_le_bytes(stats_len_arr) as usize;
441        p += 4;
442        if p + stats_len > buf.len() {
443            return EnvelopeParse::Bare;
444        }
445        let stats_slice = &buf[p..p + stats_len];
446        p += stats_len;
447        Some(stats_slice)
448    } else {
449        None
450    };
451    if matches!(
452        version,
453        ENVELOPE_VERSION_V2 | ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
454    ) {
455        if p + 4 != buf.len() {
456            return EnvelopeParse::Bare;
457        }
458        let Ok(crc_arr) = buf[p..p + 4].try_into() else {
459            return EnvelopeParse::Bare;
460        };
461        let expected = u32::from_le_bytes(crc_arr);
462        let computed = spg_crypto::crc32::crc32(&buf[..p]);
463        if expected != computed {
464            return EnvelopeParse::CrcMismatch { expected, computed };
465        }
466    } else if p != buf.len() {
467        // v1: must end exactly at the users section.
468        return EnvelopeParse::Bare;
469    }
470    EnvelopeParse::Pair {
471        catalog,
472        users,
473        publications,
474        subscriptions,
475        statistics,
476    }
477}
478
/// v4.41.1 opaque transaction handle.
///
/// Handed out by `Engine::alloc_tx_id` and threaded through
/// `Engine::execute_in` so dispatch can tell which in-flight TX a statement
/// belongs to. `IMPLICIT_TX` is the reserved slot that every legacy caller —
/// engine self-tests, spg-cli, spg-embedded, startup replay — uses
/// implicitly through the unchanged `Engine::execute(sql)` API.
///
/// v4.41.1 keeps at most one active slot at runtime (dispatch holds
/// `engine.write()` across the wrap, same as v4.34); the map shape exists so
/// v4.42 can turn on N in-flight implicit TXs without reshuffling the engine
/// internals.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct TxId(pub u64);
490
491/// Reserved slot used by `Engine::execute(sql)` — the legacy single-
492/// global-shadow path. New `alloc_tx_id` handles start at 1.
493pub const IMPLICIT_TX: TxId = TxId(0);
494
/// v6.7.3 — default segment-size threshold (4 MiB) for `COMPACT COLD
/// SEGMENTS` when the statement supplies no explicit target. A segment is
/// eligible to merge when its `OwnedSegment::bytes().len()` is **strictly**
/// below this value. spg-server overrides it from
/// `SPG_COMPACTION_TARGET_SEGMENT_BYTES`.
pub const COMPACTION_TARGET_DEFAULT_BYTES: u64 = 4 << 20;
501
502/// Per-slot transaction state. Held inside `tx_catalogs[tx_id]` for the
503/// lifetime of a BEGIN..COMMIT (or BEGIN..ROLLBACK) window. Drops when
504/// the TX commits (its `catalog` is moved over `Engine.catalog`) or
505/// rolls back (slot removed, catalog discarded).
506#[derive(Debug, Default, Clone)]
507struct TxState {
508    /// The TX's shadow copy of the catalog. Started as a clone of
509    /// `Engine.catalog` at BEGIN time; writes flow into it; COMMIT
510    /// installs it over `Engine.catalog`. `Catalog::clone()` is O(1)
511    /// since v4.40 (`PersistentVec` rows + `PersistentBTreeMap` indices).
512    catalog: Catalog,
513    /// Per-TX savepoint stack. Each entry pairs the savepoint name with
514    /// a clone of `catalog` at the moment `SAVEPOINT <name>` fired.
515    /// `ROLLBACK TO <name>` restores from the entry and pops everything
516    /// after it; `RELEASE <name>` discards the entry and everything
517    /// after; COMMIT/ROLLBACK clears the whole stack.
518    savepoints: Vec<(String, Catalog)>,
519}
520
521#[derive(Debug, Default)]
522pub struct Engine {
523    /// Committed catalog — what survives `Engine::snapshot()` and what
524    /// outside-TX `SELECT`s read.
525    catalog: Catalog,
526    /// Active TX slots, keyed by `TxId`. Empty when no TX is in flight.
527    /// v4.41.1 runtime invariant: at most one entry (single-writer
528    /// model unchanged). v4.42 will let dispatch hold multiple entries
529    /// concurrently for group commit + engine MVCC.
530    tx_catalogs: BTreeMap<TxId, TxState>,
531    /// Which slot the next exec_* call should mutate. Set by
532    /// `execute_in(sql, tx_id)` at the entry point; legacy `execute(sql)`
533    /// sets it to `IMPLICIT_TX`. None when no TX is in flight (read /
534    /// write goes straight against `catalog`).
535    current_tx: Option<TxId>,
536    /// Monotonic counter for `alloc_tx_id`. Starts at 1 — slot 0 is
537    /// reserved for `IMPLICIT_TX`.
538    next_tx_id: u64,
539    /// Optional wall clock used to satisfy `NOW()` / `CURRENT_TIMESTAMP`
540    /// / `CURRENT_DATE`. Set by the host environment.
541    clock: Option<ClockFn>,
542    /// v4.1 cryptographic RNG for per-user password salt. Set by the
543    /// host. `None` means SQL-driven `CREATE USER` uses a
544    /// deterministic fallback — see `SaltFn`.
545    salt_fn: Option<SaltFn>,
546    /// v4.2 per-query row cap. `None` = unlimited. When set, a
547    /// SELECT that materialises more than `n` rows returns
548    /// `EngineError::RowLimitExceeded`. Enforced before the result
549    /// is shaped into wire frames so a runaway scan can't blow the
550    /// server's heap.
551    max_query_rows: Option<usize>,
552    /// v4.1 RBAC user table. Empty means "no RBAC configured yet" —
553    /// the server decides what that means at the auth boundary
554    /// (open mode vs legacy single-password mode). User CRUD goes
555    /// through `create_user`/`drop_user`/`verify_user`; persistence
556    /// rides the snapshot envelope alongside the catalog.
557    users: UserStore,
558    /// v6.1.2 logical-replication publication catalog. Empty until
559    /// `CREATE PUBLICATION` runs. Persistence rides the v3 envelope
560    /// trailer (see `build_envelope`).
561    publications: publications::Publications,
562    /// v6.1.4 logical-replication subscription catalog. Empty until
563    /// `CREATE SUBSCRIPTION` runs. Persistence rides the v4 envelope
564    /// trailer.
565    subscriptions: subscriptions::Subscriptions,
566    /// v6.2.0 — per-column statistics for the cost-based optimizer.
567    /// Populated by `ANALYZE`; queried via `spg_statistic` virtual
568    /// table. Persistence rides the v5 envelope trailer.
569    statistics: statistics::Statistics,
570    /// v6.3.0 — engine-level plan cache. Caches the post-`prepare()`
571    /// `Statement` keyed on SQL text. In-memory only — does NOT ride
572    /// the snapshot envelope (rebuilt on demand after restart).
573    plan_cache: plan_cache::PlanCache,
574    /// v6.5.1 — per-distinct-SQL execution stats. In-memory only,
575    /// surfaced via `spg_stat_query` virtual table. Updated by the
576    /// `execute_*` paths after a successful execute.
577    query_stats: query_stats::QueryStats,
578    /// v6.5.2 — connection-state provider callback. spg-server
579    /// registers a function at startup that snapshots its
580    /// per-pgwire-connection registry into `ActivityRow`s; engine
581    /// reads through it on every `SELECT * FROM spg_stat_activity`.
582    /// `None` ⇒ no-data (returns empty rows; matches the no_std
583    /// embedded callers that don't run pgwire).
584    activity_provider: Option<ActivityProvider>,
585    /// v6.5.3 — audit-chain provider + verifier. Same pattern as
586    /// activity_provider: spg-server registers both at startup;
587    /// engine reads through on `SELECT * FROM spg_audit_chain` and
588    /// `SELECT * FROM spg_audit_verify`. `None` ⇒ no-data.
589    audit_chain_provider: Option<AuditChainProvider>,
590    audit_verifier: Option<AuditVerifier>,
591    /// v6.5.6 — slow-query log threshold in microseconds. When set,
592    /// every successful execute whose elapsed exceeds the threshold
593    /// gets fed to the registered slow-query log callback (so
594    /// spg-server can emit a structured log line). Default `None`
595    /// = no slow-query logging.
596    slow_query_threshold_us: Option<u64>,
597    slow_query_logger: Option<SlowQueryLogger>,
598}
599
/// v6.5.6 — slow-query log callback. Invoked as `(sql, elapsed_us)` once
/// for each successful execute whose elapsed time crosses the configured
/// threshold.
pub type SlowQueryLogger = fn(&str, u64);
604
605/// v6.5.4 — synthesise a `CREATE TABLE` statement from catalog
606/// state. Round-trips through `Engine::execute` to recreate the
607/// same schema (sans data + indexes — indexes are emitted as a
608/// separate `CREATE INDEX` chain in `spg_database_ddl`).
609fn render_create_table(name: &str, columns: &[ColumnSchema]) -> String {
610    let mut out = alloc::format!("CREATE TABLE {name} (");
611    for (i, col) in columns.iter().enumerate() {
612        if i > 0 {
613            out.push_str(", ");
614        }
615        out.push_str(&col.name);
616        out.push(' ');
617        out.push_str(&render_data_type(col.ty));
618        if !col.nullable {
619            out.push_str(" NOT NULL");
620        }
621        if col.auto_increment {
622            out.push_str(" AUTO_INCREMENT");
623        }
624    }
625    out.push(')');
626    out
627}
628
629fn render_data_type(ty: DataType) -> String {
630    match ty {
631        DataType::SmallInt => "SMALLINT".into(),
632        DataType::Int => "INT".into(),
633        DataType::BigInt => "BIGINT".into(),
634        DataType::Float => "FLOAT".into(),
635        DataType::Text => "TEXT".into(),
636        DataType::Varchar(n) => alloc::format!("VARCHAR({n})"),
637        DataType::Char(n) => alloc::format!("CHAR({n})"),
638        DataType::Bool => "BOOL".into(),
639        DataType::Vector { dim, encoding } => match encoding {
640            spg_storage::VecEncoding::F32 => alloc::format!("VECTOR({dim})"),
641            spg_storage::VecEncoding::Sq8 => alloc::format!("VECTOR({dim}) USING SQ8"),
642            spg_storage::VecEncoding::F16 => alloc::format!("VECTOR({dim}) USING HALF"),
643        },
644        DataType::Numeric { precision, scale } => {
645            alloc::format!("NUMERIC({precision},{scale})")
646        }
647        DataType::Date => "DATE".into(),
648        DataType::Timestamp => "TIMESTAMP".into(),
649        DataType::Interval => "INTERVAL".into(),
650        DataType::Json => "JSON".into(),
651        DataType::Jsonb => "JSONB".into(),
652        DataType::Timestamptz => "TIMESTAMPTZ".into(),
653    }
654}
655
/// v6.5.2 — a single row of the `spg_stat_activity` virtual table.
/// Public at the engine level so spg-server can build rows without
/// re-exporting internal dispatch types.
#[derive(Clone, Debug)]
pub struct ActivityRow {
    pub pid: u32,
    pub user: String,
    pub started_at_us: i64,
    pub current_sql: String,
    pub wait_event: String,
    pub elapsed_us: i64,
    pub in_transaction: bool,
}
669
670/// v6.5.2 — provider callback type. Fresh snapshot returned each
671/// call; engine doesn't cache the slice.
672pub type ActivityProvider = fn() -> Vec<ActivityRow>;
673
/// v6.5.3 — a single row of the `spg_audit_chain` virtual table.
/// Public at the engine level so spg-server can build rows straight from
/// its `AuditEntry`.
#[derive(Clone, Debug)]
pub struct AuditRow {
    pub seq: i64,
    pub ts_ms: i64,
    pub prev_hash_hex: String,
    pub entry_hash_hex: String,
    pub sql: String,
}
684
685/// v6.5.3 — chain-table provider + verifier. spg-server registers
686/// fn pointers that snapshot / verify the audit log. `verify`
687/// returns `(verified_count, broken_at_seq)` — `broken_at_seq` is
688/// `-1` on a clean chain.
689pub type AuditChainProvider = fn() -> Vec<AuditRow>;
690pub type AuditVerifier = fn() -> (i64, i64);
691
692impl Engine {
693    pub fn new() -> Self {
694        Self {
695            catalog: Catalog::new(),
696            tx_catalogs: BTreeMap::new(),
697            current_tx: None,
698            next_tx_id: 1,
699            clock: None,
700            salt_fn: None,
701            max_query_rows: None,
702            users: UserStore::new(),
703            publications: publications::Publications::new(),
704            subscriptions: subscriptions::Subscriptions::new(),
705            statistics: statistics::Statistics::new(),
706            plan_cache: plan_cache::PlanCache::new(),
707            query_stats: query_stats::QueryStats::new(),
708            activity_provider: None,
709            audit_chain_provider: None,
710            audit_verifier: None,
711            slow_query_threshold_us: None,
712            slow_query_logger: None,
713        }
714    }
715
716    /// Construct an engine restored from a previously-snapshotted catalog
717    /// (see `snapshot()`).
718    pub fn restore(catalog: Catalog) -> Self {
719        Self {
720            catalog,
721            tx_catalogs: BTreeMap::new(),
722            current_tx: None,
723            next_tx_id: 1,
724            clock: None,
725            salt_fn: None,
726            max_query_rows: None,
727            users: UserStore::new(),
728            publications: publications::Publications::new(),
729            subscriptions: subscriptions::Subscriptions::new(),
730            statistics: statistics::Statistics::new(),
731            plan_cache: plan_cache::PlanCache::new(),
732            query_stats: query_stats::QueryStats::new(),
733            activity_provider: None,
734            audit_chain_provider: None,
735            audit_verifier: None,
736            slow_query_threshold_us: None,
737            slow_query_logger: None,
738        }
739    }
740
741    /// Restore an engine + user table from a v4.1 envelope produced
742    /// by `snapshot_with_users()`. Falls back to plain catalog-only
743    /// restore if the envelope magic isn't present (so v3.x snapshot
744    /// files still load). v6.1.2 adds the optional publications
745    /// trailer (envelope v3); a v1/v2 envelope deserialises to an
746    /// empty publication table.
747    pub fn restore_envelope(buf: &[u8]) -> Result<Self, EngineError> {
748        match split_envelope(buf) {
749            EnvelopeParse::Pair {
750                catalog: catalog_bytes,
751                users: user_bytes,
752                publications: pub_bytes,
753                subscriptions: sub_bytes,
754                statistics: stats_bytes,
755            } => {
756                let catalog = Catalog::deserialize(catalog_bytes).map_err(EngineError::Storage)?;
757                let users = users::deserialize_users(user_bytes)
758                    .map_err(|e| EngineError::Unsupported(alloc::format!("users restore: {e}")))?;
759                let publications = match pub_bytes {
760                    Some(b) => publications::Publications::deserialize(b).map_err(|e| {
761                        EngineError::Unsupported(alloc::format!("publications restore: {e:?}"))
762                    })?,
763                    None => publications::Publications::new(),
764                };
765                let subscriptions = match sub_bytes {
766                    Some(b) => subscriptions::Subscriptions::deserialize(b).map_err(|e| {
767                        EngineError::Unsupported(alloc::format!("subscriptions restore: {e:?}"))
768                    })?,
769                    None => subscriptions::Subscriptions::new(),
770                };
771                let statistics = match stats_bytes {
772                    Some(b) => statistics::Statistics::deserialize(b).map_err(|e| {
773                        EngineError::Unsupported(alloc::format!("statistics restore: {e:?}"))
774                    })?,
775                    None => statistics::Statistics::new(),
776                };
777                Ok(Self {
778                    catalog,
779                    tx_catalogs: BTreeMap::new(),
780                    current_tx: None,
781                    next_tx_id: 1,
782                    clock: None,
783                    salt_fn: None,
784                    max_query_rows: None,
785                    users,
786                    publications,
787                    subscriptions,
788                    statistics,
789                    plan_cache: plan_cache::PlanCache::new(),
790                    query_stats: query_stats::QueryStats::new(),
791                    activity_provider: None,
792                    audit_chain_provider: None,
793                    audit_verifier: None,
794                    slow_query_threshold_us: None,
795                    slow_query_logger: None,
796                })
797            }
798            EnvelopeParse::CrcMismatch { expected, computed } => {
799                Err(EngineError::Storage(StorageError::Corrupt(alloc::format!(
800                    "snapshot envelope CRC32 mismatch (expected={expected:#010x}, computed={computed:#010x})"
801                ))))
802            }
803            EnvelopeParse::Bare => {
804                let catalog = Catalog::deserialize(buf).map_err(EngineError::Storage)?;
805                Ok(Self::restore(catalog))
806            }
807        }
808    }
809
810    pub const fn users(&self) -> &UserStore {
811        &self.users
812    }
813
814    /// `salt` is supplied by the caller (the host has a random
815    /// source; the engine is `no_std`). Caller should pass a fresh
816    /// 16-byte random value per user.
817    pub fn create_user(
818        &mut self,
819        name: &str,
820        password: &str,
821        role: Role,
822        salt: [u8; 16],
823    ) -> Result<(), UserError> {
824        self.users.create(name, password, role, salt)?;
825        // v4.8: also derive SCRAM-SHA-256 secrets so PG-wire SASL
826        // auth can verify without re-running PBKDF2 per attempt.
827        // Uses a fresh salt from the host RNG (falls back to a
828        // deterministic per-username salt when no RNG is wired, same
829        // as the legacy hash path).
830        let scram_salt = self.salt_fn.map_or_else(
831            || {
832                let mut s = [0u8; users::SCRAM_SALT_LEN];
833                let digest = spg_crypto::hash(name.as_bytes());
834                // Use bytes 16..32 of BLAKE3 so we don't reuse the
835                // exact same fallback salt as the BLAKE3 hash path.
836                s.copy_from_slice(&digest[16..32]);
837                s
838            },
839            |f| f(),
840        );
841        self.users
842            .enable_scram(name, password, scram_salt, users::SCRAM_DEFAULT_ITERS)?;
843        Ok(())
844    }
845
846    pub fn drop_user(&mut self, name: &str) -> Result<(), UserError> {
847        self.users.drop(name)
848    }
849
850    pub fn verify_user(&self, name: &str, password: &str) -> Option<Role> {
851        self.users.verify(name, password)
852    }
853
854    /// Builder: attach a wall clock so `NOW()` / `CURRENT_TIMESTAMP` /
855    /// `CURRENT_DATE` evaluate to a real value instead of erroring out.
856    #[must_use]
857    pub const fn with_clock(mut self, clock: ClockFn) -> Self {
858        self.clock = Some(clock);
859        self
860    }
861
862    /// Builder: attach an OS-backed RNG for per-user password salts.
863    /// The host (`spg-server`) typically wires this to `/dev/urandom`.
864    #[must_use]
865    pub const fn with_salt_fn(mut self, f: SaltFn) -> Self {
866        self.salt_fn = Some(f);
867        self
868    }
869
870    /// Builder: cap the number of rows a single SELECT may return.
871    /// Exceeding the cap raises `EngineError::RowLimitExceeded` —
872    /// the bound is checked inside the executor so a runaway
873    /// catalog scan can't allocate millions of rows before the
874    /// server gets a chance to reject the result.
875    #[must_use]
876    pub const fn with_max_query_rows(mut self, n: usize) -> Self {
877        self.max_query_rows = Some(n);
878        self
879    }
880
881    /// The *committed* catalog. Note: during a transaction this returns the
882    /// pre-TX state — `SELECT` inside a TX goes through `execute()` and reads
883    /// the shadow. Tests that inspect outside-TX state should use this.
884    pub const fn catalog(&self) -> &Catalog {
885        &self.catalog
886    }
887
888    /// Serialize the *committed* catalog to bytes. v0.6 was full-snapshot; v0.9
889    /// adds the rule that an open TX's shadow is never snapshotted — only the
890    /// post-COMMIT state is persisted. v4.1 wraps the catalog in an envelope
891    /// when there are users to persist; an empty user table snapshots as the
892    /// bare catalog format (backwards-compat with v3.x readers). v6.1.2
893    /// adds publications to the envelope condition: either non-empty
894    /// users OR non-empty publications now triggers the envelope path.
895    pub fn snapshot(&self) -> Vec<u8> {
896        if self.users.is_empty()
897            && self.publications.is_empty()
898            && self.subscriptions.is_empty()
899            && self.statistics.is_empty()
900        {
901            self.catalog.serialize()
902        } else {
903            build_envelope(
904                &self.catalog.serialize(),
905                &users::serialize_users(&self.users),
906                &self.publications.serialize(),
907                &self.subscriptions.serialize(),
908                &self.statistics.serialize(),
909            )
910        }
911    }
912
913    /// True when at least one TX slot is in flight. v4.41.1 runtime
914    /// invariant: at most one slot active at a time (dispatch holds
915    /// `engine.write()` across the entire wrap). v4.42 will let this
916    /// return true with multiple slots concurrently.
917    pub fn in_transaction(&self) -> bool {
918        !self.tx_catalogs.is_empty()
919    }
920
921    /// v4.41.1 allocate a fresh TX handle. Used by spg-server dispatch
922    /// to scope each implicit-wrap BEGIN..stmt..COMMIT to its own slot
923    /// in `tx_catalogs`. v4.42 — the commit-barrier leader allocates
924    /// one of these per task in its group, runs `BEGIN`+sql+`COMMIT`
925    /// sequentially under a single `engine.write()` so each task's
926    /// mutations accumulate into shared state, then either keeps the
927    /// accumulated state (fsync OK) or restores the pre-image via
928    /// `replace_catalog` (fsync err).
929    pub fn alloc_tx_id(&mut self) -> TxId {
930        let id = TxId(self.next_tx_id);
931        self.next_tx_id = self.next_tx_id.saturating_add(1);
932        id
933    }
934
935    /// v4.42 — atomically replace the live catalog. Used by the
936    /// commit-barrier leader to roll back a group whose batched
937    /// fsync failed: the leader snapshots `engine.catalog().clone()`
938    /// (O(1) Arc bump after the v4.39/v4.40 persistent migration)
939    /// at group start, sequentially applies each task's BEGIN+sql+
940    /// COMMIT under the same write lock to accumulate mutations
941    /// into shared state, batches the WAL bytes, fsyncs once, and
942    /// on failure calls this with the pre-image to undo every
943    /// task in the group at once.
944    ///
945    /// **Does NOT touch `tx_catalogs` / `current_tx`.** Any
946    /// explicit-TX slot from a concurrent client (created via the
947    /// legacy `IMPLICIT_TX`-less dispatch path or via the future
948    /// MVCC-readers v5+ work) has its own snapshot baked into the
949    /// slot — restoring `self.catalog` to the pre-image leaves
950    /// those slots untouched, exactly as they were when the leader
951    /// took the lock. The leader's own implicit-TX slots are all
952    /// already discarded (`exec_commit` removed them as each
953    /// task's COMMIT ran) by the time this is reached.
954    pub fn replace_catalog(&mut self, catalog: Catalog) {
955        self.catalog = catalog;
956    }
957
958    /// v6.7.0 — public shim around `Catalog::freeze_oldest_to_cold`
959    /// so tests + the spg-server freezer can drive a freeze without
960    /// reaching into the private `active_catalog_mut`. v6.7.4
961    /// parallel freezer will build on this surface.
962    ///
963    /// Marks the table's cached `cold_row_count` stale because the
964    /// freeze added cold locators that ANALYZE hasn't yet refreshed.
965    pub fn freeze_oldest_to_cold(
966        &mut self,
967        table_name: &str,
968        index_name: &str,
969        max_rows: usize,
970    ) -> Result<spg_storage::FreezeReport, EngineError> {
971        let report = self
972            .active_catalog_mut()
973            .freeze_oldest_to_cold(table_name, index_name, max_rows)
974            .map_err(EngineError::Storage)?;
975        if let Some(t) = self.active_catalog_mut().get_mut(table_name) {
976            t.mark_cold_row_count_stale();
977        }
978        Ok(report)
979    }
980
981    /// v6.7.5 — public shim used by the spg-server follower's
982    /// segment-forwarding receiver. Registers a cold-tier segment
983    /// at a specific id (the master's id, as transmitted on the
984    /// wire) so the follower's BTree-Cold locators stay byte-
985    /// identical with the master's. Wraps
986    /// `Catalog::load_segment_bytes_at` under the standard
987    /// clone-mutate-replace pattern.
988    ///
989    /// Returns `Ok(())` on success **and** on the "slot already
990    /// occupied" case — a follower mid-reconnect may receive a
991    /// segment chunk for a segment_id it already has on disk
992    /// (forwarded last session); the caller should treat that
993    /// path as a no-op rather than a fatal error.
994    pub fn receive_cold_segment(
995        &mut self,
996        segment_id: u32,
997        bytes: Vec<u8>,
998    ) -> Result<(), EngineError> {
999        let mut new_cat = self.catalog.clone();
1000        match new_cat.load_segment_bytes_at(segment_id, bytes) {
1001            Ok(()) => {
1002                self.replace_catalog(new_cat);
1003                Ok(())
1004            }
1005            Err(StorageError::Corrupt(msg)) if msg.contains("already occupied") => Ok(()),
1006            Err(e) => Err(EngineError::Storage(e)),
1007        }
1008    }
1009
1010    /// v6.7.3 — public shim around `Catalog::compact_cold_segments`
1011    /// driving every BTree index on every user table. Returns one
1012    /// `(table, index, report)` triple for each merge that
1013    /// actually happened (no-op (table, index) pairs are filtered
1014    /// out so callers can size persist-side work to the live
1015    /// merges). Caller is responsible for persisting each
1016    /// `report.merged_segment_bytes` and updating the on-disk
1017    /// segment registry; engine layer is no_std and never
1018    /// touches disk.
1019    ///
1020    /// Marks every touched table's cached `cold_row_count` stale
1021    /// — compaction GC'd some shadowed rows, so the count must be
1022    /// re-derived on the next ANALYZE.
1023    pub fn compact_cold_segments_with_target(
1024        &mut self,
1025        target_segment_bytes: u64,
1026    ) -> Result<Vec<(String, String, CompactReport)>, EngineError> {
1027        let table_names = self.active_catalog().table_names();
1028        let mut reports: Vec<(String, String, CompactReport)> = Vec::new();
1029        for tname in table_names {
1030            if is_internal_table_name(&tname) {
1031                continue;
1032            }
1033            let idx_names: Vec<String> = {
1034                let Some(t) = self.active_catalog().get(&tname) else {
1035                    continue;
1036                };
1037                t.indices()
1038                    .iter()
1039                    .filter(|i| matches!(i.kind, IndexKind::BTree(_)))
1040                    .map(|i| i.name.clone())
1041                    .collect()
1042            };
1043            for iname in idx_names {
1044                let report = self
1045                    .active_catalog_mut()
1046                    .compact_cold_segments(&tname, &iname, target_segment_bytes)
1047                    .map_err(EngineError::Storage)?;
1048                if report.merged_segment_id.is_some() {
1049                    if let Some(t) = self.active_catalog_mut().get_mut(&tname) {
1050                        t.mark_cold_row_count_stale();
1051                    }
1052                    reports.push((tname.clone(), iname, report));
1053                }
1054            }
1055        }
1056        Ok(reports)
1057    }
1058
1059    fn active_catalog(&self) -> &Catalog {
1060        match self.current_tx {
1061            Some(t) => self
1062                .tx_catalogs
1063                .get(&t)
1064                .map_or(&self.catalog, |s| &s.catalog),
1065            None => &self.catalog,
1066        }
1067    }
1068
1069    fn active_catalog_mut(&mut self) -> &mut Catalog {
1070        let tx = self.current_tx;
1071        match tx {
1072            Some(t) => match self.tx_catalogs.get_mut(&t) {
1073                Some(s) => &mut s.catalog,
1074                None => &mut self.catalog,
1075            },
1076            None => &mut self.catalog,
1077        }
1078    }
1079
1080    /// Read-only execute path. Succeeds for `SELECT` / `SHOW TABLES`
1081    /// / `SHOW COLUMNS`; returns `EngineError::WriteRequired` for
1082    /// every other statement, so the caller can fall through to the
1083    /// `&mut self` `execute` path under a write lock. Engine state is
1084    /// not mutated even on the success path (`rewrite_clock_calls`
1085    /// and `resolve_order_by_position` both mutate the locally-owned
1086    /// AST, not `self`).
1087    ///
1088    /// **v4.0 concurrency**: this is the entry point the server takes
1089    /// under an `RwLock::read()` so multiple `SELECT` clients run in
1090    /// parallel without serialising on a single mutex.
1091    pub fn execute_readonly(&self, sql: &str) -> Result<QueryResult, EngineError> {
1092        self.execute_readonly_with_cancel(sql, CancelToken::none())
1093    }
1094
1095    /// v4.5 — read path with cooperative cancellation. Token's
1096    /// `is_cancelled` is checked at the start (so a watchdog that
1097    /// already fired returns Cancelled immediately) and at row-loop
1098    /// checkpoints inside `exec_select`. SHOW paths are O(small) and
1099    /// don't bother checking.
1100    pub fn execute_readonly_with_cancel(
1101        &self,
1102        sql: &str,
1103        cancel: CancelToken<'_>,
1104    ) -> Result<QueryResult, EngineError> {
1105        cancel.check()?;
1106        let mut stmt = parser::parse_statement(sql)?;
1107        let now_micros = self.clock.map(|f| f());
1108        rewrite_clock_calls(&mut stmt, now_micros);
1109        if let Statement::Select(s) = &mut stmt {
1110            resolve_order_by_position(s);
1111            // v6.2.3 — cost-based JOIN reorder (read path).
1112            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1113        }
1114        let result = match stmt {
1115            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1116            Statement::ShowTables => Ok(self.exec_show_tables()),
1117            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1118            Statement::ShowUsers => Ok(self.exec_show_users()),
1119            Statement::ShowPublications => Ok(self.exec_show_publications()),
1120            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1121            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1122                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1123            )),
1124            Statement::Explain(e) => self.exec_explain(&e, cancel),
1125            _ => Err(EngineError::WriteRequired),
1126        };
1127        self.enforce_row_limit(result)
1128    }
1129
1130    /// v4.2: cap result-set size. Applied after the executor
1131    /// materialises rows but before they leave the engine — wrapping
1132    /// every Rows-returning exec_* function would scatter the check.
1133    fn enforce_row_limit(
1134        &self,
1135        result: Result<QueryResult, EngineError>,
1136    ) -> Result<QueryResult, EngineError> {
1137        if let (Ok(QueryResult::Rows { rows, .. }), Some(cap)) = (&result, self.max_query_rows)
1138            && rows.len() > cap
1139        {
1140            return Err(EngineError::RowLimitExceeded(cap));
1141        }
1142        result
1143    }
1144
1145    pub fn execute(&mut self, sql: &str) -> Result<QueryResult, EngineError> {
1146        self.execute_in_with_cancel(sql, IMPLICIT_TX, CancelToken::none())
1147    }
1148
1149    /// v4.5 — write path with cooperative cancellation. Same dispatch
1150    /// as `execute_in_with_cancel(sql, IMPLICIT_TX, cancel)`. Kept as
1151    /// a separate entry point for backward-compat with the v4.5
1152    /// public API.
1153    pub fn execute_with_cancel(
1154        &mut self,
1155        sql: &str,
1156        cancel: CancelToken<'_>,
1157    ) -> Result<QueryResult, EngineError> {
1158        self.execute_in_with_cancel(sql, IMPLICIT_TX, cancel)
1159    }
1160
1161    /// v4.41.1 multi-slot write entry. Routes `sql` through the TX
1162    /// slot identified by `tx_id` so spg-server dispatch can scope
1163    /// each implicit-wrap BEGIN..stmt..COMMIT to its own slot in
1164    /// `tx_catalogs`. `IMPLICIT_TX` is the legacy single-slot path
1165    /// every other caller (engine self-tests, replay, spg-embedded)
1166    /// implicitly takes via `execute()` / `execute_with_cancel()`.
1167    pub fn execute_in(&mut self, sql: &str, tx_id: TxId) -> Result<QueryResult, EngineError> {
1168        self.execute_in_with_cancel(sql, tx_id, CancelToken::none())
1169    }
1170
1171    /// v4.41.1 write path with cooperative cancellation + explicit TX
1172    /// scope. Sets `self.current_tx` for the duration of the call so
1173    /// every `exec_*` helper transparently sees its TX's shadow
1174    /// catalog and savepoint stack; restores on exit so the field is
1175    /// only valid mid-call (no leakage across calls).
1176    pub fn execute_in_with_cancel(
1177        &mut self,
1178        sql: &str,
1179        tx_id: TxId,
1180        cancel: CancelToken<'_>,
1181    ) -> Result<QueryResult, EngineError> {
1182        let saved = self.current_tx;
1183        self.current_tx = Some(tx_id);
1184        let result = self.execute_inner_with_cancel(sql, cancel);
1185        self.current_tx = saved;
1186        result
1187    }
1188
1189    /// v6.1.1 — parse and pre-process a SQL string ONCE so the
1190    /// resulting [`Statement`] can be cached and re-executed via
1191    /// [`Engine::execute_prepared`]. Returns the same `Statement`
1192    /// the simple-query path would synthesise internally (clock
1193    /// rewrites + ORDER BY position-ref resolution applied at
1194    /// prepare time, since both are session-independent). The
1195    /// `$N` placeholders in the SQL stay as `Expr::Placeholder(n)`
1196    /// nodes; they're resolved to concrete values per-call by
1197    /// `execute_prepared`'s substitution walk.
1198    ///
1199    /// Pgwire's `Parse` (P) message lands here.
1200    pub fn prepare(&self, sql: &str) -> Result<Statement, ParseError> {
1201        let mut stmt = parser::parse_statement(sql)?;
1202        let now_micros = self.clock.map(|f| f());
1203        rewrite_clock_calls(&mut stmt, now_micros);
1204        if let Statement::Select(s) = &mut stmt {
1205            // v6.4.1 — expand `GROUP BY ALL` to every non-aggregate
1206            // SELECT-list item BEFORE position / alias resolution so
1207            // downstream passes see the explicit list.
1208            expand_group_by_all(s);
1209            resolve_order_by_position(s);
1210            // v6.2.3 — cost-based JOIN reorder. No-op for
1211            // single-table FROMs or any non-INNER join shape.
1212            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1213        }
1214        Ok(stmt)
1215    }
1216
1217    /// v6.3.0 — cached prepare. Returns a cloned `Statement` from
1218    /// the plan cache on hit, runs the full `prepare()` path on miss
1219    /// and inserts the resulting plan before returning. Skipping the
1220    /// parse + JOIN-reorder pipeline on hit is the dominant win for
1221    /// JDBC / sqlx / pgx clients that reuse the same SQL string.
1222    ///
1223    /// Returns a cloned `Statement` (not a borrow) because the
1224    /// pgwire layer owns its `PreparedStmt` map per-session and the
1225    /// engine-level cache must stay available for other sessions.
1226    /// Clone cost on a 5-table JOIN AST is well under the parse cost
1227    /// it replaces.
1228    pub fn prepare_cached(&mut self, sql: &str) -> Result<Statement, ParseError> {
1229        // v6.3.1 — version-aware lookup. If the cached plan was
1230        // prepared before the most recent ANALYZE, evict and replan.
1231        let current_version = self.statistics.version();
1232        if let Some(plan) = self.plan_cache.get(sql) {
1233            if plan.statistics_version == current_version {
1234                return Ok(plan.stmt.clone());
1235            }
1236            // Stale entry — fall through to evict + re-prepare.
1237        }
1238        self.plan_cache.evict(sql);
1239        let stmt = self.prepare(sql)?;
1240        let source_tables = plan_cache::collect_source_tables(&stmt);
1241        let plan = plan_cache::PreparedPlan {
1242            stmt: stmt.clone(),
1243            statistics_version: current_version,
1244            source_tables,
1245            describe_columns: alloc::vec::Vec::new(),
1246        };
1247        self.plan_cache.insert(String::from(sql), plan);
1248        Ok(stmt)
1249    }
1250
1251    /// v6.3.0 — read-only accessor for tests and v6.3.1 invalidation.
1252    pub fn plan_cache(&self) -> &plan_cache::PlanCache {
1253        &self.plan_cache
1254    }
1255
1256    /// v6.3.0 — mutable accessor for v6.3.1 invalidation hooks.
1257    pub fn plan_cache_mut(&mut self) -> &mut plan_cache::PlanCache {
1258        &mut self.plan_cache
1259    }
1260
1261    /// v6.3.3 — Describe a prepared `Statement` without executing.
1262    /// Returns `(parameter_oids, output_columns)`. Empty
1263    /// `output_columns` means the statement has no row-producing
1264    /// shape we could resolve here (JOIN, subquery, non-SELECT, …)
1265    /// — pgwire layer maps that to a `NoData` reply.
1266    pub fn describe_prepared(
1267        &self,
1268        stmt: &Statement,
1269    ) -> (Vec<u32>, Vec<ColumnSchema>) {
1270        describe::describe_prepared(stmt, self.active_catalog())
1271    }
1272
1273    /// v6.1.1 — execute a [`Statement`] previously returned by
1274    /// [`Engine::prepare`], substituting `Expr::Placeholder(n)`
1275    /// nodes for the corresponding [`Value`] in `params` (1-based
1276    /// per PG: `$1` → `params[0]`). Bind-time string parameters
1277    /// are decoded into typed `Value`s by the pgwire layer before
1278    /// this call so the resulting AST hits the same execution
1279    /// path as a simple query — no SQL re-parse.
1280    ///
1281    /// Pgwire's `Execute` (E) message after a `Bind` (B) lands here.
1282    pub fn execute_prepared(
1283        &mut self,
1284        mut stmt: Statement,
1285        params: &[Value],
1286    ) -> Result<QueryResult, EngineError> {
1287        substitute_placeholders(&mut stmt, params)?;
1288        self.execute_stmt_with_cancel(stmt, CancelToken::none())
1289    }
1290
1291    fn execute_inner_with_cancel(
1292        &mut self,
1293        sql: &str,
1294        cancel: CancelToken<'_>,
1295    ) -> Result<QueryResult, EngineError> {
1296        cancel.check()?;
1297        let stmt = self.prepare(sql)?;
1298        // v6.5.1 — wrap the executor with a wall-clock window so we
1299        // can record into spg_stat_query. Skip when the engine has
1300        // no clock attached (no_std embedded callers).
1301        let start_us = self.clock.map(|f| f());
1302        let result = self.execute_stmt_with_cancel(stmt, cancel);
1303        if let (Some(t0), Ok(_)) = (start_us, &result) {
1304            let now = self.clock.map_or(t0, |f| f());
1305            let elapsed = now.saturating_sub(t0).max(0) as u64;
1306            self.query_stats.record(sql, elapsed, now as u64);
1307            // v6.5.6 — slow-query log: fire callback when elapsed
1308            // exceeds the configured floor.
1309            if let (Some(threshold), Some(logger)) =
1310                (self.slow_query_threshold_us, self.slow_query_logger)
1311                && elapsed >= threshold
1312            {
1313                logger(sql, elapsed);
1314            }
1315        }
1316        result
1317    }
1318
1319    fn execute_stmt_with_cancel(
1320        &mut self,
1321        stmt: Statement,
1322        cancel: CancelToken<'_>,
1323    ) -> Result<QueryResult, EngineError> {
1324        cancel.check()?;
1325        let result = match stmt {
1326            Statement::CreateTable(s) => self.exec_create_table(s),
1327            Statement::CreateIndex(s) => self.exec_create_index(s),
1328            Statement::Insert(s) => self.exec_insert(s),
1329            Statement::Update(s) => self.exec_update_cancel(&s, cancel),
1330            Statement::Delete(s) => self.exec_delete_cancel(&s, cancel),
1331            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1332            Statement::Begin => self.exec_begin(),
1333            Statement::Commit => self.exec_commit(),
1334            Statement::Rollback => self.exec_rollback(),
1335            Statement::Savepoint(name) => self.exec_savepoint(name),
1336            Statement::RollbackToSavepoint(name) => self.exec_rollback_to_savepoint(&name),
1337            Statement::ReleaseSavepoint(name) => self.exec_release_savepoint(&name),
1338            Statement::ShowTables => Ok(self.exec_show_tables()),
1339            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1340            Statement::ShowUsers => Ok(self.exec_show_users()),
1341            Statement::ShowPublications => Ok(self.exec_show_publications()),
1342            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1343            Statement::CreateUser(s) => self.exec_create_user(&s),
1344            Statement::DropUser(name) => self.exec_drop_user(&name),
1345            Statement::Explain(e) => self.exec_explain(&e, cancel),
1346            Statement::AlterIndex(s) => self.exec_alter_index(s),
1347            Statement::AlterTable(s) => self.exec_alter_table(s),
1348            Statement::CreatePublication(s) => self.exec_create_publication(s),
1349            Statement::DropPublication(name) => self.exec_drop_publication(&name),
1350            Statement::CreateSubscription(s) => self.exec_create_subscription(s),
1351            Statement::DropSubscription(name) => self.exec_drop_subscription(&name),
1352            // v6.1.7 — WAIT FOR WAL POSITION needs `lag_state`,
1353            // which lives in spg-server's ServerState. The engine
1354            // surfaces a clear error; the server-layer dispatch
1355            // intercepts the SQL before it reaches the engine on
1356            // a server build, so this arm only fires for
1357            // engine-only callers (spg-embedded, lib tests).
1358            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1359                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1360            )),
1361            // v6.2.0 — ANALYZE recomputes per-column histograms.
1362            Statement::Analyze(target) => self.exec_analyze(target.as_deref()),
1363            // v6.7.3 — COMPACT COLD SEGMENTS.
1364            Statement::CompactColdSegments => self.exec_compact_cold_segments(),
1365        };
1366        self.enforce_row_limit(result)
1367    }
1368
1369    /// v6.1.2 — `CREATE PUBLICATION` runtime path. Duplicate names
1370    /// surface as `EngineError::Unsupported` so the existing PG-wire
1371    /// error mapping stays uniform; the message carries the name so
1372    /// operators can grep replication-log noise. Inside-transaction
1373    /// invocation is rejected (matches `CREATE USER` / `DROP USER`
1374    /// stance) — replication-catalog mutation is a connection-level
1375    /// administrative op, not a transactional one.
1376    fn exec_create_publication(
1377        &mut self,
1378        s: CreatePublicationStatement,
1379    ) -> Result<QueryResult, EngineError> {
1380        // v6.1.4 — the v6.1.2 "no DDL inside a transaction" guard
1381        // was over-cautious: it also blocked the auto-commit wrap
1382        // path (which begins an internal TX around every WAL-
1383        // logged statement). PG itself allows CREATE PUBLICATION
1384        // inside a transaction (it rolls back with the TX).
1385        self.publications
1386            .create(s.name, s.scope)
1387            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE PUBLICATION: {e:?}")))?;
1388        Ok(QueryResult::CommandOk {
1389            affected: 1,
1390            modified_catalog: true,
1391        })
1392    }
1393
1394    /// v6.1.2 — `DROP PUBLICATION` runtime path. PG-compatible silent
1395    /// no-op when the publication doesn't exist (returns `affected=0`
1396    /// in that case so the wire-level command tag distinguishes
1397    /// "dropped" from "no-op", though both succeed).
1398    fn exec_drop_publication(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1399        let removed = self.publications.drop(name);
1400        Ok(QueryResult::CommandOk {
1401            affected: usize::from(removed),
1402            modified_catalog: removed,
1403        })
1404    }
1405
1406    /// v6.1.2 — read access to the publication catalog. Used by
1407    /// the v6.1.5 publisher-side WAL filter, by `SHOW PUBLICATIONS`
1408    /// (v6.1.3+), and by e2e tests that need to assert state without
1409    /// going through the wire.
1410    pub const fn publications(&self) -> &publications::Publications {
1411        &self.publications
1412    }
1413
1414    /// v6.1.4 — `CREATE SUBSCRIPTION` runtime path. Defaults
1415    /// `enabled = true` and `last_received_pos = 0` for a freshly-
1416    /// created subscription. The actual worker thread is spawned
1417    /// by spg-server once the engine returns success.
1418    fn exec_create_subscription(
1419        &mut self,
1420        s: CreateSubscriptionStatement,
1421    ) -> Result<QueryResult, EngineError> {
1422        // See exec_create_publication — the in_transaction gate
1423        // was over-cautious; the auto-commit wrap path holds an
1424        // internal TX that this check was incorrectly blocking.
1425        let sub = subscriptions::Subscription {
1426            conn_str: s.conn_str,
1427            publications: s.publications,
1428            enabled: true,
1429            last_received_pos: 0,
1430        };
1431        self.subscriptions
1432            .create(s.name, sub)
1433            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE SUBSCRIPTION: {e:?}")))?;
1434        Ok(QueryResult::CommandOk {
1435            affected: 1,
1436            modified_catalog: true,
1437        })
1438    }
1439
1440    /// v6.1.4 — `DROP SUBSCRIPTION`. Silent no-op when the name
1441    /// doesn't exist (PG-compatible). The associated worker is
1442    /// torn down by spg-server when it observes the catalog
1443    /// change at the next snapshot or via the engine's
1444    /// subscriptions accessor (the worker polls the catalog on
1445    /// reconnect; v6.1.5's filter-side will tighten this to an
1446    /// explicit signal).
1447    fn exec_drop_subscription(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1448        let removed = self.subscriptions.drop(name);
1449        Ok(QueryResult::CommandOk {
1450            affected: usize::from(removed),
1451            modified_catalog: removed,
1452        })
1453    }
1454
1455    /// v6.1.4 — read access to the subscription catalog. Used by
1456    /// the subscription worker (read its own row to find its
1457    /// publications + last applied position), by SHOW SUBSCRIPTIONS,
1458    /// and by e2e tests asserting state directly.
1459    pub const fn subscriptions(&self) -> &subscriptions::Subscriptions {
1460        &self.subscriptions
1461    }
1462
1463    /// v6.1.4 — write access to `last_received_pos`. Worker
1464    /// calls this after each apply batch (under the engine's
1465    /// write-lock). Returns `false` when the subscription was
1466    /// dropped between when the worker received the record and
1467    /// when this call landed.
1468    pub fn subscription_advance(&mut self, name: &str, pos: u64) -> bool {
1469        self.subscriptions.update_last_received_pos(name, pos)
1470    }
1471
1472    /// v6.1.4 — `SHOW SUBSCRIPTIONS` row materialisation. Returns
1473    /// `(name, conn_str, publications, enabled, last_received_pos)`
1474    /// ordered by subscription name. The `publications` column is
1475    /// the comma-joined list ("p1, p2") for ergonomic SHOW output;
1476    /// callers wanting structured access read `Engine::subscriptions`.
1477    fn exec_show_subscriptions(&self) -> QueryResult {
1478        let columns = alloc::vec![
1479            ColumnSchema::new("name", DataType::Text, false),
1480            ColumnSchema::new("conn_str", DataType::Text, false),
1481            ColumnSchema::new("publications", DataType::Text, false),
1482            ColumnSchema::new("enabled", DataType::Bool, false),
1483            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1484        ];
1485        let rows: Vec<Row> = self
1486            .subscriptions
1487            .iter()
1488            .map(|(name, sub)| {
1489                Row::new(alloc::vec![
1490                    Value::Text(name.clone()),
1491                    Value::Text(sub.conn_str.clone()),
1492                    Value::Text(sub.publications.join(", ")),
1493                    Value::Bool(sub.enabled),
1494                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1495                ])
1496            })
1497            .collect();
1498        QueryResult::Rows { columns, rows }
1499    }
1500
1501    /// v6.2.0 — materialise `spg_statistic` rows. One row per
1502    /// `(table, column)` pair tracked in `Statistics`, with
1503    /// `histogram_bounds` rendered as a `[v0, v1, ...]` string —
1504    /// the same canonical form vector literals use for round-trip.
1505    fn exec_spg_statistic(&self) -> QueryResult {
1506        let columns = alloc::vec![
1507            ColumnSchema::new("table_name", DataType::Text, false),
1508            ColumnSchema::new("column_name", DataType::Text, false),
1509            ColumnSchema::new("null_frac", DataType::Float, false),
1510            ColumnSchema::new("n_distinct", DataType::BigInt, false),
1511            ColumnSchema::new("histogram_bounds", DataType::Text, false),
1512            // v6.7.0 — appended column (v6.2.0 stability contract
1513            // allows APPEND to spg_statistic, not reorder/rename).
1514            // Reports the cached per-table cold-row count; same
1515            // value across every column row of the same table.
1516            ColumnSchema::new("cold_row_count", DataType::BigInt, false),
1517        ];
1518        let rows: Vec<Row> = self
1519            .statistics
1520            .iter()
1521            .map(|((t, c), s)| {
1522                let cold = self
1523                    .catalog
1524                    .get(t)
1525                    .map_or(0, |table| table.cold_row_count());
1526                Row::new(alloc::vec![
1527                    Value::Text(t.clone()),
1528                    Value::Text(c.clone()),
1529                    Value::Float(f64::from(s.null_frac)),
1530                    Value::BigInt(i64::try_from(s.n_distinct).unwrap_or(i64::MAX)),
1531                    Value::Text(render_histogram_bounds(&s.histogram_bounds)),
1532                    Value::BigInt(i64::try_from(cold).unwrap_or(i64::MAX)),
1533                ])
1534            })
1535            .collect();
1536        QueryResult::Rows { columns, rows }
1537    }
1538
1539    /// v6.5.0 — materialise `spg_stat_replication` rows. One row
1540    /// per subscription with `(name, conn_str, publications,
1541    /// last_received_pos, enabled)`. Surface mirrors
1542    /// `SHOW SUBSCRIPTIONS` but follows the virtual-table dispatch
1543    /// shape so it composes with SELECT clauses (WHERE, projection
1544    /// onto specific columns, etc).
1545    fn exec_spg_stat_replication(&self) -> QueryResult {
1546        let columns = alloc::vec![
1547            ColumnSchema::new("name", DataType::Text, false),
1548            ColumnSchema::new("conn_str", DataType::Text, false),
1549            ColumnSchema::new("publications", DataType::Text, false),
1550            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1551            ColumnSchema::new("enabled", DataType::Bool, false),
1552        ];
1553        let rows: Vec<Row> = self
1554            .subscriptions
1555            .iter()
1556            .map(|(name, sub)| {
1557                Row::new(alloc::vec![
1558                    Value::Text(name.clone()),
1559                    Value::Text(sub.conn_str.clone()),
1560                    Value::Text(sub.publications.join(",")),
1561                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1562                    Value::Bool(sub.enabled),
1563                ])
1564            })
1565            .collect();
1566        QueryResult::Rows { columns, rows }
1567    }
1568
1569    /// v6.5.0 — materialise `spg_stat_segment` rows. One row per
1570    /// cold-tier segment with `(segment_id, num_rows, num_pages,
1571    /// total_bytes)`.
1572    ///
1573    /// v6.7.0 — appended `table_name` column resolves the v6.5.0
1574    /// carve-out. Walks every user table's BTree indices to find
1575    /// which table's Cold locators point at each segment. Empty
1576    /// string for orphan segments (loaded via SPG_PRELOAD_COLD_SEGMENT
1577    /// before any index registered a locator). The walk is
1578    /// O(tables × indices × keys); cached per call, not across
1579    /// calls — re-walked on every `SELECT * FROM spg_stat_segment`.
1580    fn exec_spg_stat_segment(&self) -> QueryResult {
1581        let columns = alloc::vec![
1582            ColumnSchema::new("segment_id", DataType::BigInt, false),
1583            ColumnSchema::new("table_name", DataType::Text, false),
1584            ColumnSchema::new("num_rows", DataType::BigInt, false),
1585            ColumnSchema::new("num_pages", DataType::BigInt, false),
1586            ColumnSchema::new("total_bytes", DataType::BigInt, false),
1587        ];
1588        // v6.7.0 — build a segment_id → table_name map by walking
1589        // every user table's BTree indices once. O(tables × indices
1590        // × keys) for the v6.5.0 carve-out resolution; acceptable
1591        // because spg_stat_segment is operator-facing (not on a
1592        // hot-loop path).
1593        let mut segment_owners: alloc::collections::BTreeMap<u32, String> = BTreeMap::new();
1594        for tname in self.catalog.table_names() {
1595            if is_internal_table_name(&tname) {
1596                continue;
1597            }
1598            let Some(t) = self.catalog.get(&tname) else {
1599                continue;
1600            };
1601            for idx in t.indices() {
1602                if let spg_storage::IndexKind::BTree(map) = &idx.kind {
1603                    for (_, locs) in map.iter() {
1604                        for loc in locs {
1605                            if let spg_storage::RowLocator::Cold { segment_id, .. } = loc {
1606                                segment_owners.entry(*segment_id).or_insert_with(|| tname.clone());
1607                            }
1608                        }
1609                    }
1610                }
1611            }
1612        }
1613        let rows: Vec<Row> = self
1614            .catalog
1615            .cold_segment_ids_global()
1616            .iter()
1617            .filter_map(|&id| {
1618                let seg = self.catalog.cold_segment(id)?;
1619                let meta = seg.meta();
1620                let owner = segment_owners
1621                    .get(&id)
1622                    .cloned()
1623                    .unwrap_or_default();
1624                Some(Row::new(alloc::vec![
1625                    Value::BigInt(i64::from(id)),
1626                    Value::Text(owner),
1627                    Value::BigInt(i64::try_from(meta.num_rows).unwrap_or(i64::MAX)),
1628                    Value::BigInt(i64::from(meta.num_pages)),
1629                    Value::BigInt(i64::try_from(meta.total_bytes).unwrap_or(i64::MAX)),
1630                ]))
1631            })
1632            .collect();
1633        QueryResult::Rows { columns, rows }
1634    }
1635
1636    /// v6.5.1 — materialise `spg_stat_query` rows. One row per
1637    /// distinct SQL text recorded since the engine booted, capped
1638    /// at `QUERY_STATS_MAX` (1024). Columns:
1639    ///   sql, exec_count, total_us, mean_us, max_us, last_seen_us
1640    /// mean_us = total_us / exec_count (saturating).
1641    fn exec_spg_stat_query(&self) -> QueryResult {
1642        let columns = alloc::vec![
1643            ColumnSchema::new("sql", DataType::Text, false),
1644            ColumnSchema::new("exec_count", DataType::BigInt, false),
1645            ColumnSchema::new("total_us", DataType::BigInt, false),
1646            ColumnSchema::new("mean_us", DataType::BigInt, false),
1647            ColumnSchema::new("max_us", DataType::BigInt, false),
1648            ColumnSchema::new("last_seen_us", DataType::BigInt, false),
1649        ];
1650        let rows: Vec<Row> = self
1651            .query_stats
1652            .snapshot()
1653            .into_iter()
1654            .map(|(sql, s)| {
1655                let mean = if s.exec_count == 0 {
1656                    0
1657                } else {
1658                    s.total_us / s.exec_count
1659                };
1660                Row::new(alloc::vec![
1661                    Value::Text(sql),
1662                    Value::BigInt(i64::try_from(s.exec_count).unwrap_or(i64::MAX)),
1663                    Value::BigInt(i64::try_from(s.total_us).unwrap_or(i64::MAX)),
1664                    Value::BigInt(i64::try_from(mean).unwrap_or(i64::MAX)),
1665                    Value::BigInt(i64::try_from(s.max_us).unwrap_or(i64::MAX)),
1666                    Value::BigInt(i64::try_from(s.last_seen_us).unwrap_or(i64::MAX)),
1667                ])
1668            })
1669            .collect();
1670        QueryResult::Rows { columns, rows }
1671    }
1672
1673    /// v6.5.2 — register a connection-state provider. spg-server
1674    /// calls this at startup with a function that snapshots its
1675    /// per-pgwire-connection registry. Engine reads through the
1676    /// callback on `SELECT * FROM spg_stat_activity`.
1677    #[must_use]
1678    pub const fn with_activity_provider(mut self, f: ActivityProvider) -> Self {
1679        self.activity_provider = Some(f);
1680        self
1681    }
1682
1683    /// v6.5.3 — register audit chain provider + verifier.
1684    #[must_use]
1685    pub const fn with_audit_providers(
1686        mut self,
1687        chain: AuditChainProvider,
1688        verify: AuditVerifier,
1689    ) -> Self {
1690        self.audit_chain_provider = Some(chain);
1691        self.audit_verifier = Some(verify);
1692        self
1693    }
1694
1695    /// v6.5.6 — register a slow-query log callback. `threshold_us`
1696    /// is the floor (in microseconds); only executes above the floor
1697    /// fire the callback. spg-server wires this from
1698    /// `SPG_SLOW_QUERY_THRESHOLD_MS` (default 100 ms).
1699    #[must_use]
1700    pub const fn with_slow_query_log(
1701        mut self,
1702        threshold_us: u64,
1703        logger: SlowQueryLogger,
1704    ) -> Self {
1705        self.slow_query_threshold_us = Some(threshold_us);
1706        self.slow_query_logger = Some(logger);
1707        self
1708    }
1709
1710    /// v6.5.6 — operator knob for plan cache cap. spg-server reads
1711    /// `SPG_PLAN_CACHE_MAX` env at startup; uses this to override
1712    /// the compile-time default of 256.
1713    pub fn set_plan_cache_max(&mut self, n: usize) {
1714        self.plan_cache.set_max_entries(n);
1715    }
1716
1717    /// v6.5.2 — materialise `spg_stat_activity` rows. Pulls a fresh
1718    /// snapshot from the registered `ActivityProvider`. Returns an
1719    /// empty result set when no provider is registered (the no_std
1720    /// embedded path with no pgwire layer).
1721    fn exec_spg_stat_activity(&self) -> QueryResult {
1722        let columns = alloc::vec![
1723            ColumnSchema::new("pid", DataType::Int, false),
1724            ColumnSchema::new("user", DataType::Text, false),
1725            ColumnSchema::new("started_at_us", DataType::BigInt, false),
1726            ColumnSchema::new("current_sql", DataType::Text, false),
1727            ColumnSchema::new("wait_event", DataType::Text, false),
1728            ColumnSchema::new("elapsed_us", DataType::BigInt, false),
1729            ColumnSchema::new("in_transaction", DataType::Bool, false),
1730        ];
1731        let rows: Vec<Row> = self
1732            .activity_provider
1733            .map(|f| f())
1734            .unwrap_or_default()
1735            .into_iter()
1736            .map(|r| {
1737                Row::new(alloc::vec![
1738                    Value::Int(i32::try_from(r.pid).unwrap_or(i32::MAX)),
1739                    Value::Text(r.user),
1740                    Value::BigInt(r.started_at_us),
1741                    Value::Text(r.current_sql),
1742                    Value::Text(r.wait_event),
1743                    Value::BigInt(r.elapsed_us),
1744                    Value::Bool(r.in_transaction),
1745                ])
1746            })
1747            .collect();
1748        QueryResult::Rows { columns, rows }
1749    }
1750
1751    /// v6.5.4 — materialise `spg_table_ddl` rows. One row per user
1752    /// table with `(table_name, ddl)`. Reconstructed from catalog
1753    /// state on demand.
1754    fn exec_spg_table_ddl(&self) -> QueryResult {
1755        let columns = alloc::vec![
1756            ColumnSchema::new("table_name", DataType::Text, false),
1757            ColumnSchema::new("ddl", DataType::Text, false),
1758        ];
1759        let rows: Vec<Row> = self
1760            .catalog
1761            .table_names()
1762            .into_iter()
1763            .filter(|n| !is_internal_table_name(n))
1764            .filter_map(|name| {
1765                let table = self.catalog.get(&name)?;
1766                let ddl = render_create_table(&name, &table.schema().columns);
1767                Some(Row::new(alloc::vec![
1768                    Value::Text(name),
1769                    Value::Text(ddl),
1770                ]))
1771            })
1772            .collect();
1773        QueryResult::Rows { columns, rows }
1774    }
1775
1776    /// v6.5.4 — materialise `spg_role_ddl` rows. One row per user
1777    /// with `(role_name, ddl)`. Password is redacted (matches the
1778    /// `Statement::CreateUser` Display which prints `'<redacted>'`).
1779    fn exec_spg_role_ddl(&self) -> QueryResult {
1780        let columns = alloc::vec![
1781            ColumnSchema::new("role_name", DataType::Text, false),
1782            ColumnSchema::new("ddl", DataType::Text, false),
1783        ];
1784        let rows: Vec<Row> = self
1785            .users
1786            .iter()
1787            .map(|(name, rec)| {
1788                let ddl = alloc::format!(
1789                    "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}'",
1790                    rec.role.as_str(),
1791                );
1792                Row::new(alloc::vec![Value::Text(String::from(name)), Value::Text(ddl)])
1793            })
1794            .collect();
1795        QueryResult::Rows { columns, rows }
1796    }
1797
1798    /// v6.5.4 — materialise `spg_database_ddl`: single row whose
1799    /// `ddl` column concatenates every user table's CREATE +
1800    /// every role's CREATE in deterministic catalog order. Suitable
1801    /// for piping back through `Engine::execute` to recreate a
1802    /// schema-equivalent database.
1803    fn exec_spg_database_ddl(&self) -> QueryResult {
1804        let columns = alloc::vec![ColumnSchema::new("ddl", DataType::Text, false)];
1805        let mut out = String::new();
1806        for (name, rec) in self.users.iter() {
1807            out.push_str(&alloc::format!(
1808                "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}';\n",
1809                rec.role.as_str(),
1810            ));
1811        }
1812        for name in self.catalog.table_names() {
1813            if is_internal_table_name(&name) {
1814                continue;
1815            }
1816            if let Some(table) = self.catalog.get(&name) {
1817                out.push_str(&render_create_table(&name, &table.schema().columns));
1818                out.push_str(";\n");
1819            }
1820        }
1821        QueryResult::Rows {
1822            columns,
1823            rows: alloc::vec![Row::new(alloc::vec![Value::Text(out)])],
1824        }
1825    }
1826
1827    /// v6.5.3 — materialise `spg_audit_chain` rows. Pulls a fresh
1828    /// snapshot from the registered provider; empty when no
1829    /// provider is set.
1830    fn exec_spg_audit_chain(&self) -> QueryResult {
1831        let columns = alloc::vec![
1832            ColumnSchema::new("seq", DataType::BigInt, false),
1833            ColumnSchema::new("ts_ms", DataType::BigInt, false),
1834            ColumnSchema::new("prev_hash", DataType::Text, false),
1835            ColumnSchema::new("entry_hash", DataType::Text, false),
1836            ColumnSchema::new("sql", DataType::Text, false),
1837        ];
1838        let rows: Vec<Row> = self
1839            .audit_chain_provider
1840            .map(|f| f())
1841            .unwrap_or_default()
1842            .into_iter()
1843            .map(|r| {
1844                Row::new(alloc::vec![
1845                    Value::BigInt(r.seq),
1846                    Value::BigInt(r.ts_ms),
1847                    Value::Text(r.prev_hash_hex),
1848                    Value::Text(r.entry_hash_hex),
1849                    Value::Text(r.sql),
1850                ])
1851            })
1852            .collect();
1853        QueryResult::Rows { columns, rows }
1854    }
1855
1856    /// v6.5.3 — materialise `spg_audit_verify` single-row result.
1857    /// `(verified_count, broken_at_seq)` — broken_at_seq is `-1`
1858    /// on a clean chain. Returns one row with both values 0 when
1859    /// no verifier is registered (no-data fallback for embedded
1860    /// callers).
1861    fn exec_spg_audit_verify(&self) -> QueryResult {
1862        let columns = alloc::vec![
1863            ColumnSchema::new("verified_count", DataType::BigInt, false),
1864            ColumnSchema::new("broken_at_seq", DataType::BigInt, false),
1865        ];
1866        let (verified, broken) = self.audit_verifier.map(|f| f()).unwrap_or((0, -1));
1867        let row = Row::new(alloc::vec![
1868            Value::BigInt(verified),
1869            Value::BigInt(broken),
1870        ]);
1871        QueryResult::Rows {
1872            columns,
1873            rows: alloc::vec![row],
1874        }
1875    }
1876
1877    /// v6.5.1 — read-only accessor for tests + v6.5.6 ops resets.
1878    pub fn query_stats(&self) -> &query_stats::QueryStats {
1879        &self.query_stats
1880    }
1881
1882    /// v6.5.1 — mutable accessor (clear, etc).
1883    pub fn query_stats_mut(&mut self) -> &mut query_stats::QueryStats {
1884        &mut self.query_stats
1885    }
1886
1887    /// v6.2.0 — read access to the per-column statistics table.
1888    /// Used by the planner (v6.2.2 selectivity functions read this),
1889    /// by `SELECT * FROM spg_statistic`, and by e2e tests.
1890    pub const fn statistics(&self) -> &statistics::Statistics {
1891        &self.statistics
1892    }
1893
1894    /// v6.2.1 — return tables whose modified-row count crossed the
1895    /// auto-analyze threshold since the last ANALYZE on that table.
1896    /// The threshold is `0.1 × max(row_count, MIN_ROWS_FOR_AUTO_
1897    /// ANALYZE)` — combines PG-style fractional + absolute lower
1898    /// bound so a fresh / tiny table doesn't get hammered on every
1899    /// INSERT.
1900    ///
1901    /// Designed to be cheap: walks every user table's
1902    /// `Catalog::table_names()` + reads `statistics::modified_
1903    /// since_last_analyze()` (BTreeMap lookup). The background
1904    /// worker calls this under `engine.read()` then drops the lock
1905    /// before re-acquiring `engine.write()` for the actual ANALYZE.
1906    pub fn tables_needing_analyze(&self) -> Vec<String> {
1907        const MIN_ROWS: u64 = 100;
1908        let mut out = Vec::new();
1909        for name in self.catalog.table_names() {
1910            if is_internal_table_name(&name) {
1911                continue;
1912            }
1913            let Some(table) = self.catalog.get(&name) else {
1914                continue;
1915            };
1916            let row_count = table.rows().len() as u64;
1917            let modified = self.statistics.modified_since_last_analyze(&name);
1918            // Threshold: ceil(0.1 × max(row_count, MIN_ROWS)),
1919            // computed in integer arithmetic so spg-engine stays
1920            // no_std without pulling in libm. `(n + 9) / 10` is
1921            // `ceil(n / 10)` for non-negative `n`.
1922            let base = row_count.max(MIN_ROWS);
1923            let threshold = base.saturating_add(9) / 10;
1924            if modified >= threshold {
1925                out.push(name);
1926            }
1927        }
1928        out
1929    }
1930
1931    /// v6.2.0 — `ANALYZE [<table>]` runtime. Bare `ANALYZE` walks
1932    /// every user table; `ANALYZE <name>` re-stats one. For each
1933    /// target table, single-pass scan + per-column histogram +
1934    /// `null_frac` + `n_distinct`. Replaces the table's prior
1935    /// stats; resets the modified-row counter.
1936    ///
1937    /// v6.2.0 doesn't sample — it scans the full table. v6.2.x
1938    /// can add reservoir sampling at the > 100 K-row mark; not a
1939    /// scope blocker for the current commit since rows ≤ 100 K
1940    /// analyse in milliseconds.
1941    fn exec_analyze(&mut self, target: Option<&str>) -> Result<QueryResult, EngineError> {
1942        let names: Vec<String> = if let Some(name) = target {
1943            // Verify the table exists; surface a clear error if not.
1944            if self.catalog.get(name).is_none() {
1945                return Err(EngineError::Storage(StorageError::TableNotFound {
1946                    name: name.to_string(),
1947                }));
1948            }
1949            alloc::vec![name.to_string()]
1950        } else {
1951            self.catalog
1952                .table_names()
1953                .into_iter()
1954                .filter(|n| !is_internal_table_name(n))
1955                .collect()
1956        };
1957        let mut analysed = 0usize;
1958        for table_name in &names {
1959            self.analyze_one_table(table_name)?;
1960            analysed += 1;
1961        }
1962        // v6.3.1 — plan cache invalidation. Bump stats version so
1963        // future lookups see the new generation, and selectively
1964        // evict every plan whose `source_tables` overlap with the
1965        // ANALYZE target set. Bare ANALYZE (all tables) clears the
1966        // whole cache.
1967        if analysed > 0 {
1968            self.statistics.bump_version();
1969            if target.is_some() {
1970                for t in &names {
1971                    self.plan_cache.evict_referencing(t);
1972                }
1973            } else {
1974                self.plan_cache.clear();
1975            }
1976        }
1977        Ok(QueryResult::CommandOk {
1978            affected: analysed,
1979            modified_catalog: true,
1980        })
1981    }
1982
1983    /// v6.7.3 — `COMPACT COLD SEGMENTS` runtime path. Drives the
1984    /// engine-layer compaction shim with the default
1985    /// 4 MiB segment-size threshold. spg-server intercepts the
1986    /// SQL before it reaches the engine on a server build —
1987    /// it reads `SPG_COMPACTION_TARGET_SEGMENT_BYTES`, calls
1988    /// `Engine::compact_cold_segments_with_target` directly with
1989    /// the env value, and persists every merged segment to
1990    /// `<db>.spg/segments/`. This arm only fires for engine-only
1991    /// callers (spg-embedded, lib tests); in that mode merged
1992    /// segments live in memory and are dropped at process exit.
1993    fn exec_compact_cold_segments(&mut self) -> Result<QueryResult, EngineError> {
1994        let target = COMPACTION_TARGET_DEFAULT_BYTES;
1995        let reports = self.compact_cold_segments_with_target(target)?;
1996        let columns = alloc::vec![
1997            ColumnSchema::new("table_name", DataType::Text, false),
1998            ColumnSchema::new("index_name", DataType::Text, false),
1999            ColumnSchema::new("sources_merged", DataType::BigInt, false),
2000            ColumnSchema::new("merged_segment_id", DataType::BigInt, false),
2001            ColumnSchema::new("merged_rows", DataType::BigInt, false),
2002            ColumnSchema::new("deleted_rows_pruned", DataType::BigInt, false),
2003            ColumnSchema::new("bytes_reclaimed_estimate", DataType::BigInt, false),
2004        ];
2005        let rows: Vec<Row> = reports
2006            .into_iter()
2007            .map(|(tname, iname, report)| {
2008                Row::new(alloc::vec![
2009                    Value::Text(tname),
2010                    Value::Text(iname),
2011                    Value::BigInt(i64::try_from(report.sources.len()).unwrap_or(i64::MAX)),
2012                    Value::BigInt(i64::from(report.merged_segment_id.unwrap_or(0))),
2013                    Value::BigInt(i64::try_from(report.merged_rows).unwrap_or(i64::MAX)),
2014                    Value::BigInt(
2015                        i64::try_from(report.deleted_rows_pruned).unwrap_or(i64::MAX),
2016                    ),
2017                    Value::BigInt(
2018                        i64::try_from(report.bytes_reclaimed_estimate).unwrap_or(i64::MAX),
2019                    ),
2020                ])
2021            })
2022            .collect();
2023        Ok(QueryResult::Rows { columns, rows })
2024    }
2025
2026    /// Walk a single table's rows once and (re-)populate per-column
2027    /// stats. Drops the existing stats for `table` first so columns
2028    /// that have been DROP-ed between ANALYZEs don't leave stale
2029    /// rows.
2030    fn analyze_one_table(&mut self, table_name: &str) -> Result<(), EngineError> {
2031        let table = self.catalog.get(table_name).ok_or_else(|| {
2032            EngineError::Storage(StorageError::TableNotFound {
2033                name: table_name.to_string(),
2034            })
2035        })?;
2036        let schema = table.schema().clone();
2037        let row_count = table.rows().len();
2038        // For each column, collect (sorted) non-NULL textual values
2039        // + count NULLs; then ask `statistics::build_histogram` to
2040        // produce the 101 bounds and `estimate_n_distinct` the
2041        // distinct count.
2042        self.statistics.clear_table(table_name);
2043        for (col_pos, col_schema) in schema.columns.iter().enumerate() {
2044            // v6.2.0 skip: vector columns have their own stats
2045            // shape (HNSW graph topology). v6.2 deliberation #1.
2046            if matches!(col_schema.ty, DataType::Vector { .. }) {
2047                continue;
2048            }
2049            let mut non_null_values: Vec<Value> = Vec::with_capacity(row_count);
2050            let mut nulls: u64 = 0;
2051            for row in table.rows() {
2052                match row.values.get(col_pos) {
2053                    Some(Value::Null) | None => nulls += 1,
2054                    Some(v) => non_null_values.push(v.clone()),
2055                }
2056            }
2057            // Sort by type-aware ordering (Int as int, Text as
2058            // lex, etc.) so histogram bounds reflect the column's
2059            // natural order — not lexicographic on the string
2060            // representation, which would put "9" after "49".
2061            non_null_values.sort_by(|a, b| sort_values_for_histogram(a, b));
2062            let non_null: Vec<String> = non_null_values
2063                .iter()
2064                .map(canonical_value_repr)
2065                .collect();
2066            let null_frac = if row_count == 0 {
2067                0.0
2068            } else {
2069                #[allow(clippy::cast_precision_loss)]
2070                let f = nulls as f32 / row_count as f32;
2071                f
2072            };
2073            let n_distinct = statistics::estimate_n_distinct(&non_null);
2074            let histogram_bounds = statistics::build_histogram(&non_null);
2075            self.statistics.set(
2076                table_name.to_string(),
2077                col_schema.name.clone(),
2078                statistics::ColumnStats {
2079                    null_frac,
2080                    n_distinct,
2081                    histogram_bounds,
2082                },
2083            );
2084        }
2085        self.statistics.reset_modified(table_name);
2086        // v6.7.0 — refresh the per-table cold_rows cache. Walk the
2087        // BTree indices and count Cold locators (MAX across
2088        // indices); store the result on the table. Surfaced via
2089        // `spg_statistic.cold_row_count` (new column) and
2090        // `spg_stat_segment.table_name` (new column).
2091        let cold_count = {
2092            let table = self
2093                .active_catalog()
2094                .get(table_name)
2095                .expect("table still present");
2096            table.count_cold_locators()
2097        };
2098        let table_mut = self
2099            .active_catalog_mut()
2100            .get_mut(table_name)
2101            .expect("table still present");
2102        table_mut.set_cold_row_count(cold_count);
2103        Ok(())
2104    }
2105
2106    /// v6.1.3 — `SHOW PUBLICATIONS` row materialisation. Returns
2107    /// `(name, scope, table_count)` ordered by publication name.
2108    ///   - `scope` is the human-readable string:
2109    ///       `"FOR ALL TABLES"` /
2110    ///       `"FOR TABLE t1, t2"` /
2111    ///       `"FOR ALL TABLES EXCEPT t1, t2"`.
2112    ///   - `table_count` is NULL for `AllTables`, the list length
2113    ///     otherwise. NULLability lets clients distinguish "publish
2114    ///     everything" from "publish exactly 0 tables" (the v6.1.3
2115    ///     parser forbids the empty list, but the column shape is
2116    ///     ready for the v6.1.5 publisher-side semantics).
2117    fn exec_show_publications(&self) -> QueryResult {
2118        let columns = alloc::vec![
2119            ColumnSchema::new("name", DataType::Text, false),
2120            ColumnSchema::new("scope", DataType::Text, false),
2121            ColumnSchema::new("table_count", DataType::Int, true),
2122        ];
2123        let rows: Vec<Row> = self
2124            .publications
2125            .iter()
2126            .map(|(name, scope)| {
2127                let (scope_str, count_val) = match scope {
2128                    spg_sql::ast::PublicationScope::AllTables => {
2129                        ("FOR ALL TABLES".to_string(), Value::Null)
2130                    }
2131                    spg_sql::ast::PublicationScope::ForTables(ts) => (
2132                        alloc::format!("FOR TABLE {}", ts.join(", ")),
2133                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2134                    ),
2135                    spg_sql::ast::PublicationScope::AllTablesExcept(ts) => (
2136                        alloc::format!("FOR ALL TABLES EXCEPT {}", ts.join(", ")),
2137                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2138                    ),
2139                };
2140                Row::new(alloc::vec![
2141                    Value::Text(name.clone()),
2142                    Value::Text(scope_str),
2143                    count_val,
2144                ])
2145            })
2146            .collect();
2147        QueryResult::Rows { columns, rows }
2148    }
2149
2150    /// v4.1 `SHOW USERS` — `(name, role)` per row, ordered by name.
2151    fn exec_show_users(&self) -> QueryResult {
2152        let columns = alloc::vec![
2153            ColumnSchema::new("name", DataType::Text, false),
2154            ColumnSchema::new("role", DataType::Text, false),
2155        ];
2156        let rows: Vec<Row> = self
2157            .users
2158            .iter()
2159            .map(|(name, rec)| {
2160                Row::new(alloc::vec![
2161                    Value::Text(name.to_string()),
2162                    Value::Text(rec.role.as_str().to_string()),
2163                ])
2164            })
2165            .collect();
2166        QueryResult::Rows { columns, rows }
2167    }
2168
2169    fn exec_create_user(&mut self, s: &CreateUserStatement) -> Result<QueryResult, EngineError> {
2170        if self.in_transaction() {
2171            return Err(EngineError::Unsupported(
2172                "CREATE USER is not allowed inside a transaction".into(),
2173            ));
2174        }
2175        let role = users::Role::parse(&s.role).ok_or_else(|| {
2176            EngineError::Unsupported(alloc::format!("invalid role: {:?}", s.role))
2177        })?;
2178        // Prefer the host-injected RNG. Falls back to a deterministic
2179        // salt derived from the username only when no RNG is wired —
2180        // acceptable for tests; the server always installs one.
2181        let salt = self.salt_fn.map_or_else(
2182            || {
2183                let mut s_bytes = [0u8; 16];
2184                let digest = spg_crypto::hash(s.name.as_bytes());
2185                s_bytes.copy_from_slice(&digest[..16]);
2186                s_bytes
2187            },
2188            |f| f(),
2189        );
2190        self.users
2191            .create(&s.name, &s.password, role, salt)
2192            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE USER: {e}")))?;
2193        Ok(QueryResult::CommandOk {
2194            affected: 1,
2195            modified_catalog: true,
2196        })
2197    }
2198
2199    fn exec_drop_user(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2200        if self.in_transaction() {
2201            return Err(EngineError::Unsupported(
2202                "DROP USER is not allowed inside a transaction".into(),
2203            ));
2204        }
2205        self.users
2206            .drop(name)
2207            .map_err(|e| EngineError::Unsupported(alloc::format!("DROP USER: {e}")))?;
2208        Ok(QueryResult::CommandOk {
2209            affected: 1,
2210            modified_catalog: true,
2211        })
2212    }
2213
    /// v4.4 `UPDATE <table> SET col = expr [, ...] [WHERE cond]`.
    ///
    /// Stages, in execution order:
    ///   1. Cold-tier promotion: when `try_pk_predicate` recognises the
    ///      WHERE clause and the column has an index, the catalog is
    ///      asked to promote a matching cold row (result ignored).
    ///   2. Planning: every SET target is resolved to a column position
    ///      up front, then each row is filtered with the same WHERE
    ///      evaluation `exec_select` uses and the RHS expressions are
    ///      evaluated against the *old* row. Nothing is written yet.
    ///   3. FK enforcement: outbound check of the new values, then the
    ///      inbound plan for child tables referencing this table.
    ///   4. Apply: child-side FK steps first, then `Table::update_row`
    ///      per planned row. Per the original note, `update_row`
    ///      rebuilds indices after the cell rewrite, so indexed columns
    ///      reflect the new values.
    ///
    /// Returns the `RETURNING` projection over the post-update values
    /// when the statement has one, otherwise `CommandOk` with the
    /// number of planned rows.
    ///
    /// # Errors
    /// `StorageError::TableNotFound` for an unknown table,
    /// `EvalError::ColumnNotFound` for an unknown SET target, plus
    /// whatever expression evaluation, coercion, the cancel token, the
    /// FK helpers or `update_row` report.
    fn exec_update_cancel(
        &mut self,
        stmt: &spg_sql::ast::UpdateStatement,
        cancel: CancelToken<'_>,
    ) -> Result<QueryResult, EngineError> {
        // v5.2.3: if the WHERE is a PK equality and matches a cold-
        // tier row, promote it back to the hot tier *before* the
        // hot-row walk. The promote pushes the row to the end of
        // `table.rows`, where the upcoming SET-evaluation loop will
        // pick it up and apply the assignments. Lookups for the key
        // never observe a gap because `promote_cold_row` inserts the
        // hot row before retiring the cold locator.
        // NOTE(review): this runs before the SET targets are validated,
        // so a statement that later fails on an unknown column has
        // presumably already promoted the row — confirm that is benign.
        if let Some(w) = &stmt.where_ {
            let schema_cols = self
                .active_catalog()
                .get(&stmt.table)
                .ok_or_else(|| {
                    EngineError::Storage(StorageError::TableNotFound {
                        name: stmt.table.clone(),
                    })
                })?
                .schema()
                .columns
                .clone();
            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
                && let Some(idx_name) = self
                    .active_catalog()
                    .get(&stmt.table)
                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
            {
                // Promote may be a no-op (key is hot-only or absent);
                // we don't care about the return value here — the
                // subsequent hot walk will either match or not.
                let _ = self
                    .active_catalog_mut()
                    .promote_cold_row(&stmt.table, &idx_name, &key);
            }
        }

        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        // Own a copy of the column list so the evaluation context does
        // not hold a borrow into `table`.
        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
        // Resolve each SET target to a column position once, validate
        // up front so a typo'd column doesn't leave a partial mutation
        // behind.
        let mut targets: Vec<(usize, &Expr)> = Vec::with_capacity(stmt.assignments.len());
        for (col, expr) in &stmt.assignments {
            let pos = schema_cols
                .iter()
                .position(|c| c.name == *col)
                .ok_or_else(|| {
                    EngineError::Eval(EvalError::ColumnNotFound { name: col.clone() })
                })?;
            targets.push((pos, expr));
        }
        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
        // Walk every row, evaluate WHERE then SET expressions. We
        // gather (position, new_values) tuples first and apply them
        // afterwards so the WHERE/RHS evaluation reads the original
        // row state — matches PG semantics (UPDATE doesn't see its
        // own writes).
        let mut planned: Vec<(usize, Vec<Value>)> = Vec::new();
        for (i, row) in table.rows().iter().enumerate() {
            // v4.5: cooperative cancel checkpoint every 256 rows so
            // a runaway UPDATE without WHERE doesn't drag past the
            // server's query-timeout watchdog.
            if i.is_multiple_of(256) {
                cancel.check()?;
            }
            if let Some(w) = &stmt.where_ {
                let cond = eval::eval_expr(w, row, &ctx)?;
                // Only a literal TRUE selects the row; any other
                // result (including a non-boolean or NULL value)
                // leaves it untouched.
                if !matches!(cond, Value::Bool(true)) {
                    continue;
                }
            }
            // Start from the old cells and overwrite just the SET
            // targets; every RHS is evaluated against the old `row`.
            let mut new_vals = row.values.clone();
            for (pos, expr) in &targets {
                let v = eval::eval_expr(expr, row, &ctx)?;
                new_vals[*pos] =
                    coerce_value(v, schema_cols[*pos].ty, &schema_cols[*pos].name, *pos)?;
            }
            planned.push((i, new_vals));
        }
        // v7.6.6 — capture pre-update row values for the FK
        // enforcement passes below. `planned` carries new values
        // only; pair them with the old row. Tuple layout is
        // `(row position, old values, new values)`.
        let plan_with_old: Vec<(usize, Vec<Value>, Vec<Value>)> = planned
            .iter()
            .map(|(pos, new_vals)| (*pos, table.rows()[*pos].values.clone(), new_vals.clone()))
            .collect();
        let self_fks = table.schema().foreign_keys.clone();
        let affected = planned.len();
        // Marker only: `table` is not used again before it is
        // re-fetched in stage 3b, so its mutable borrow ends here and
        // the FK passes can borrow the catalog. The `let _` itself
        // neither moves nor drops anything.
        let _ = table;
        // v7.6.6 — Stage 2a: outbound FK check. The new values of
        // *every* planned row (not only rows whose FK columns changed)
        // are handed to `enforce_fk_inserts`, i.e. treated like an
        // INSERT batch against this table's own foreign keys.
        if !self_fks.is_empty() {
            let new_rows: Vec<Vec<Value>> = planned
                .iter()
                .map(|(_pos, new_vals)| new_vals.clone())
                .collect();
            enforce_fk_inserts(self.active_catalog(), &stmt.table, &self_fks, &new_rows)?;
        }
        // v7.6.6 — Stage 2b: inbound FK check. For every row that
        // changed value in a column that *some other table* uses as
        // a FK parent column, react per `on_update` action.
        let child_plan = plan_fk_parent_updates(self.active_catalog(), &stmt.table, &plan_with_old)?;
        // Stage 3a — apply each child-side action.
        for step in &child_plan {
            apply_fk_child_step(self.active_catalog_mut(), step)?;
        }
        // Stage 3b — apply the original UPDATE.
        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        // v7.9.4 — snapshot post-update values for RETURNING. Taken
        // before the loop below because that loop consumes `planned`.
        let updated_for_returning: Vec<Vec<Value>> =
            if stmt.returning.is_some() {
                planned.iter().map(|(_pos, vals)| vals.clone()).collect()
            } else {
                Vec::new()
            };
        for (pos, vals) in planned {
            table.update_row(pos, vals)?;
        }
        // Marker only (see above): `table` is done with.
        let _ = table;
        // v6.2.1 — auto-analyze modified-row tracking for UPDATE.
        // Skipped inside a transaction and when nothing matched.
        if !self.in_transaction() && affected > 0 {
            self.statistics
                .record_modifications(&stmt.table, affected as u64);
        }
        // v7.9.4 — RETURNING projection.
        if let Some(items) = &stmt.returning {
            return self.build_returning_rows(
                &stmt.table,
                items,
                updated_for_returning,
            );
        }
        Ok(QueryResult::CommandOk {
            affected,
            modified_catalog: !self.in_transaction(),
        })
    }
2376
    /// v4.4 `DELETE FROM <table> [WHERE cond]`.
    ///
    /// Stages, in execution order:
    ///   1. For a WHERE clause recognised by `try_pk_predicate` on an
    ///      indexed column, shadow the key's cold-tier locator.
    ///   2. Walk the table's rows and collect the positions (and full
    ///      value tuples) of every row to delete. Without a WHERE
    ///      clause every row is selected.
    ///   3. Plan FK reactions in child tables, apply them, then hand
    ///      the collected positions to `Table::delete_rows` (single
    ///      index rebuild for the batch, per the original note).
    ///
    /// Returns the `RETURNING` projection over the pre-delete values
    /// when the statement has one, otherwise `CommandOk` whose
    /// `affected` is the `delete_rows` result plus the cold shadow
    /// count.
    ///
    /// # Errors
    /// `StorageError::TableNotFound` for an unknown table, plus
    /// whatever WHERE evaluation, the cancel token or the FK helpers
    /// report.
    fn exec_delete_cancel(
        &mut self,
        stmt: &spg_sql::ast::DeleteStatement,
        cancel: CancelToken<'_>,
    ) -> Result<QueryResult, EngineError> {
        // v5.2.3: PK-targeted DELETE → first retire any cold-tier
        // locator for the key. The cold row body stays in the
        // segment (becoming shadowed garbage that a future
        // compaction pass reclaims) but the index no longer
        // resolves it. The shadow count contributes to the
        // affected total; the subsequent hot walk handles any hot
        // rows for the same key.
        // NOTE(review): this happens before the cancel checkpoints and
        // the FK planning below, either of which can still return an
        // error — presumably leaving the locator shadowed. Confirm the
        // caller's rollback covers that case.
        let mut cold_shadow_count: usize = 0;
        if let Some(w) = &stmt.where_ {
            let schema_cols = self
                .active_catalog()
                .get(&stmt.table)
                .ok_or_else(|| {
                    EngineError::Storage(StorageError::TableNotFound {
                        name: stmt.table.clone(),
                    })
                })?
                .schema()
                .columns
                .clone();
            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
                && let Some(idx_name) = self
                    .active_catalog()
                    .get(&stmt.table)
                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
            {
                // A failed/empty shadow attempt is deliberately
                // counted as zero rather than surfaced as an error.
                cold_shadow_count = self
                    .active_catalog_mut()
                    .shadow_cold_row(&stmt.table, &idx_name, &key)
                    .unwrap_or(0);
            }
        }

        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        // Own a copy of the column list so the evaluation context does
        // not hold a borrow into `table`.
        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
        let mut positions: Vec<usize> = Vec::new();
        // v7.6.3 — collect every to-delete row's full Value tuple
        // alongside its position, so the FK enforcement pass can
        // run after the mut borrow drops.
        let mut to_delete_rows: Vec<Vec<Value>> = Vec::new();
        for (i, row) in table.rows().iter().enumerate() {
            // Cooperative cancel checkpoint every 256 rows.
            if i.is_multiple_of(256) {
                cancel.check()?;
            }
            // A row is kept unless WHERE evaluates to a literal TRUE;
            // with no WHERE clause nothing is kept.
            let keep = if let Some(w) = &stmt.where_ {
                let cond = eval::eval_expr(w, row, &ctx)?;
                !matches!(cond, Value::Bool(true))
            } else {
                false
            };
            if !keep {
                positions.push(i);
                to_delete_rows.push(row.values.clone());
            }
        }
        // v7.6.3 / v7.6.4 — Stage 2: FK enforcement on the immutable
        // catalog. Run the reverse-scan against every child table
        // whose FK targets this table. RESTRICT / NoAction raise an
        // error; other actions come back as a plan that stage 3
        // applies.
        // NOTE(review): an earlier revision of this comment said
        // SET NULL / SET DEFAULT "remain Unsupported until v7.6.5",
        // while the stage-3a note below lists them as plan steps.
        // The supported set is decided inside
        // `plan_fk_parent_deletions`; confirm and drop whichever
        // statement is stale.
        //
        // The `let _` is a marker only: `table` is not used again
        // before it is re-fetched in stage 3b, so its mutable borrow
        // ends here.
        let _ = table;
        let cascade_plan = plan_fk_parent_deletions(
            self.active_catalog(),
            &stmt.table,
            &positions,
            &to_delete_rows,
        )?;
        // Stage 3a — apply each FK child step (SET NULL / SET
        // DEFAULT / CASCADE delete) before deleting the parent.
        // The plan is already ordered: nulls/defaults first, then
        // cascade deletes (so a row mutated and later deleted
        // surfaces as deleted — though v7.6.5 doesn't produce
        // that overlap today).
        for step in &cascade_plan {
            apply_fk_child_step(self.active_catalog_mut(), step)?;
        }
        // Stage 3b — actually delete the original target rows.
        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        // Hot rows removed by `delete_rows` plus any cold locator
        // shadowed at the top of the function.
        let affected = table.delete_rows(&positions) + cold_shadow_count;
        // Marker only (see above): `table` is done with.
        let _ = table;
        // v6.2.1 — auto-analyze modified-row tracking for DELETE.
        // Skipped inside a transaction and when nothing was deleted.
        if !self.in_transaction() && affected > 0 {
            self.statistics
                .record_modifications(&stmt.table, affected as u64);
        }
        // v7.9.4 — RETURNING projection over the just-removed rows.
        // `to_delete_rows` was snapshotted in the row walk before
        // mutation, so the projection sees the pre-delete state
        // (matches PG semantics: DELETE RETURNING returns the row
        // as it was just before removal). It holds only rows found by
        // the hot walk; a shadowed cold row adds to `affected` but is
        // not part of this snapshot.
        if let Some(items) = &stmt.returning {
            return self.build_returning_rows(
                &stmt.table,
                items,
                to_delete_rows,
            );
        }
        Ok(QueryResult::CommandOk {
            affected,
            modified_catalog: !self.in_transaction(),
        })
    }
2503
2504    /// `SHOW TABLES` — one row per table in the active catalog.
2505    /// Column name is `name` so result-set consumers can downstream
2506    /// `SELECT name FROM ...` style logic if needed.
2507    /// v4.26: `EXPLAIN [ANALYZE] <select>`. Returns a single-column
2508    /// `QUERY PLAN` text table — first line names the top operator
2509    /// (Scan / Aggregate / Window / etc.), indented children list
2510    /// FROM joins, WHERE filters, ORDER BY / LIMIT, projection
2511    /// shape, and any active index hits. `ANALYZE` execs the inner
2512    /// SELECT and appends actual-row + elapsed-micros annotations.
2513    #[allow(clippy::format_push_string)]
2514    fn exec_explain(
2515        &self,
2516        e: &spg_sql::ast::ExplainStatement,
2517        cancel: CancelToken<'_>,
2518    ) -> Result<QueryResult, EngineError> {
2519        let mut lines = Vec::<String>::new();
2520        explain_select(&e.inner, self, 0, &mut lines);
2521        if e.suggest {
2522            // v6.8.3 — index advisor. Walks the SELECT's FROM
2523            // tables + WHERE column refs; for each (table, column)
2524            // pair that lacks an index, append a SUGGEST line with
2525            // a copy-pastable `CREATE INDEX` statement. This is a
2526            // pure-syntax heuristic — no cardinality estimation —
2527            // matching the v6.8.3 design intent of "tell the
2528            // operator where indexes are missing", not "give the
2529            // mathematically optimal index set".
2530            let suggestions = build_index_suggestions(&e.inner, self);
2531            for s in suggestions {
2532                lines.push(s);
2533            }
2534        } else if e.analyze {
2535            // v6.2.4 — EXPLAIN ANALYZE annotates each operator line
2536            // with `(rows=N)` where the row count is computable
2537            // without re-executing the full query:
2538            //   - Top-level operator (first non-indented line):
2539            //     rows = final result.len()
2540            //   - "From: <table> [full scan]" lines: rows =
2541            //     table.rows().len() (catalog read; no execution)
2542            //   - "From: <table> [index seek]": indeterminate —
2543            //     the index step would need re-execution; v6.2.5
2544            //     adds per-operator wall-clock + hot/cold rows
2545            //     instrumentation that makes this concrete.
2546            //   - Everything else: marked `(—)` so the surface
2547            //     stays well-defined without silently dropping
2548            //     stats. v6.2.5 fills in via inline executor
2549            //     instrumentation.
2550            // Total elapsed lands on a trailing `Total: …` line.
2551            let started = self.clock.map(|f| f());
2552            let exec = self.exec_select_cancel(&e.inner, cancel)?;
2553            let elapsed_micros = match (self.clock, started) {
2554                (Some(f), Some(s)) => Some(f().saturating_sub(s)),
2555                _ => None,
2556            };
2557            let row_count = if let QueryResult::Rows { rows, .. } = &exec {
2558                rows.len()
2559            } else {
2560                0
2561            };
2562            annotate_explain_lines(&mut lines, row_count, self);
2563            let mut total = alloc::format!("Total: rows={row_count}");
2564            if let Some(us) = elapsed_micros {
2565                total.push_str(&alloc::format!(" elapsed={us}us"));
2566            }
2567            lines.push(total);
2568        }
2569        let columns = alloc::vec![ColumnSchema::new("QUERY PLAN", DataType::Text, false)];
2570        let rows: Vec<Row> = lines
2571            .into_iter()
2572            .map(|l| Row::new(alloc::vec![Value::Text(l)]))
2573            .collect();
2574        Ok(QueryResult::Rows { columns, rows })
2575    }
2576
2577    fn exec_show_tables(&self) -> QueryResult {
2578        let columns = alloc::vec![ColumnSchema::new("name", DataType::Text, false)];
2579        let rows: Vec<Row> = self
2580            .active_catalog()
2581            .table_names()
2582            .into_iter()
2583            .map(|n| Row::new(alloc::vec![Value::Text(n)]))
2584            .collect();
2585        QueryResult::Rows { columns, rows }
2586    }
2587
2588    /// `SHOW COLUMNS FROM <table>` — one row per column with the
2589    /// declared name, SQL type rendering, and nullability flag.
2590    fn exec_show_columns(&self, table_name: &str) -> Result<QueryResult, EngineError> {
2591        let table =
2592            self.active_catalog()
2593                .get(table_name)
2594                .ok_or_else(|| StorageError::TableNotFound {
2595                    name: table_name.into(),
2596                })?;
2597        let columns = alloc::vec![
2598            ColumnSchema::new("name", DataType::Text, false),
2599            ColumnSchema::new("type", DataType::Text, false),
2600            ColumnSchema::new("nullable", DataType::Bool, false),
2601        ];
2602        let rows: Vec<Row> = table
2603            .schema()
2604            .columns
2605            .iter()
2606            .map(|c| {
2607                Row::new(alloc::vec![
2608                    Value::Text(c.name.clone()),
2609                    Value::Text(alloc::format!("{}", c.ty)),
2610                    Value::Bool(c.nullable),
2611                ])
2612            })
2613            .collect();
2614        Ok(QueryResult::Rows { columns, rows })
2615    }
2616
2617    fn exec_begin(&mut self) -> Result<QueryResult, EngineError> {
2618        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2619        if self.tx_catalogs.contains_key(&tx_id) {
2620            return Err(EngineError::TransactionAlreadyOpen);
2621        }
2622        self.tx_catalogs.insert(
2623            tx_id,
2624            TxState {
2625                catalog: self.catalog.clone(),
2626                savepoints: Vec::new(),
2627            },
2628        );
2629        Ok(QueryResult::CommandOk {
2630            affected: 0,
2631            modified_catalog: false,
2632        })
2633    }
2634
2635    fn exec_commit(&mut self) -> Result<QueryResult, EngineError> {
2636        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2637        let state = self
2638            .tx_catalogs
2639            .remove(&tx_id)
2640            .ok_or(EngineError::NoActiveTransaction)?;
2641        self.catalog = state.catalog;
2642        // All savepoints become permanent at COMMIT and the stack
2643        // resets for the next TX (`state.savepoints` is discarded with
2644        // `state`).
2645        Ok(QueryResult::CommandOk {
2646            affected: 0,
2647            modified_catalog: true,
2648        })
2649    }
2650
2651    fn exec_rollback(&mut self) -> Result<QueryResult, EngineError> {
2652        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2653        if self.tx_catalogs.remove(&tx_id).is_none() {
2654            return Err(EngineError::NoActiveTransaction);
2655        }
2656        // savepoints discarded with the TxState
2657        Ok(QueryResult::CommandOk {
2658            affected: 0,
2659            modified_catalog: false,
2660        })
2661    }
2662
2663    fn exec_savepoint(&mut self, name: String) -> Result<QueryResult, EngineError> {
2664        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2665        let state = self
2666            .tx_catalogs
2667            .get_mut(&tx_id)
2668            .ok_or(EngineError::NoActiveTransaction)?;
2669        // PG re-uses an existing savepoint name by dropping the older
2670        // entry and pushing a fresh one — match that behaviour so
2671        // application code can `SAVEPOINT sp; ...; SAVEPOINT sp` freely.
2672        state.savepoints.retain(|(n, _)| n != &name);
2673        let snapshot = state.catalog.clone();
2674        state.savepoints.push((name, snapshot));
2675        Ok(QueryResult::CommandOk {
2676            affected: 0,
2677            modified_catalog: false,
2678        })
2679    }
2680
2681    fn exec_rollback_to_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2682        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2683        let state = self
2684            .tx_catalogs
2685            .get_mut(&tx_id)
2686            .ok_or(EngineError::NoActiveTransaction)?;
2687        let pos = state
2688            .savepoints
2689            .iter()
2690            .rposition(|(n, _)| n == name)
2691            .ok_or_else(|| {
2692                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2693            })?;
2694        // The savepoint stays on the stack (PG semantics): a later
2695        // `RELEASE` or further `ROLLBACK TO` is still allowed. Everything
2696        // after it is discarded.
2697        let snapshot = state.savepoints[pos].1.clone();
2698        state.savepoints.truncate(pos + 1);
2699        state.catalog = snapshot;
2700        Ok(QueryResult::CommandOk {
2701            affected: 0,
2702            modified_catalog: false,
2703        })
2704    }
2705
2706    fn exec_release_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2707        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2708        let state = self
2709            .tx_catalogs
2710            .get_mut(&tx_id)
2711            .ok_or(EngineError::NoActiveTransaction)?;
2712        let pos = state
2713            .savepoints
2714            .iter()
2715            .rposition(|(n, _)| n == name)
2716            .ok_or_else(|| {
2717                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2718            })?;
2719        // RELEASE keeps the work since the savepoint, just discards the
2720        // bookmark plus everything nested under it.
2721        state.savepoints.truncate(pos);
2722        Ok(QueryResult::CommandOk {
2723            affected: 0,
2724            modified_catalog: false,
2725        })
2726    }
2727
    // NOTE: the paragraph below describes `exec_alter_index` (defined
    // after `exec_alter_table`). It is kept as a plain comment so it
    // does not render as part of `exec_alter_table`'s rustdoc.
    //
    // v6.0.4 — synchronous `ALTER INDEX <name> REBUILD [WITH
    // (encoding = …)]`. Walks every table in the active catalog
    // looking for an index matching `stmt.name`, then delegates the
    // rebuild (including any encoding switch) to
    // `Table::rebuild_nsw_index`. The "live" non-blocking
    // optimisation is v6.0.4.1 / v6.1.x territory.
2734    /// v6.7.2 — `ALTER TABLE t SET hot_tier_bytes = X`. Dispatch
2735    /// arm. Currently the only setting is `hot_tier_bytes`; later
2736    /// v6.7.x can extend `AlterTableTarget` without touching this
2737    /// arm structure.
2738    fn exec_alter_table(
2739        &mut self,
2740        s: spg_sql::ast::AlterTableStatement,
2741    ) -> Result<QueryResult, EngineError> {
2742        match s.target {
2743            spg_sql::ast::AlterTableTarget::SetHotTierBytes(n) => {
2744                let table = self
2745                    .active_catalog_mut()
2746                    .get_mut(&s.name)
2747                    .ok_or_else(|| {
2748                        EngineError::Storage(StorageError::TableNotFound {
2749                            name: s.name.clone(),
2750                        })
2751                    })?;
2752                table.schema_mut().hot_tier_bytes = Some(n);
2753            }
2754            spg_sql::ast::AlterTableTarget::AddForeignKey(fk) => {
2755                // v7.6.8 — resolve FK against the live catalog first
2756                // (validates parent table, columns, indices). Then
2757                // verify every existing row in the child table
2758                // satisfies the new constraint. Then install it.
2759                let cols_snapshot = self
2760                    .active_catalog()
2761                    .get(&s.name)
2762                    .ok_or_else(|| {
2763                        EngineError::Storage(StorageError::TableNotFound {
2764                            name: s.name.clone(),
2765                        })
2766                    })?
2767                    .schema()
2768                    .columns
2769                    .clone();
2770                let storage_fk = resolve_foreign_key(
2771                    &s.name,
2772                    &cols_snapshot,
2773                    fk,
2774                    self.active_catalog(),
2775                )?;
2776                // Verify existing rows. Treat them as a virtual
2777                // INSERT batch — reusing the v7.6.2 enforce helper.
2778                let existing_rows: Vec<Vec<Value>> = self
2779                    .active_catalog()
2780                    .get(&s.name)
2781                    .expect("checked above")
2782                    .rows()
2783                    .iter()
2784                    .map(|r| r.values.clone())
2785                    .collect();
2786                enforce_fk_inserts(
2787                    self.active_catalog(),
2788                    &s.name,
2789                    core::slice::from_ref(&storage_fk),
2790                    &existing_rows,
2791                )?;
2792                // Reject duplicate constraint name.
2793                let table = self
2794                    .active_catalog_mut()
2795                    .get_mut(&s.name)
2796                    .expect("checked above");
2797                if let Some(name) = &storage_fk.name
2798                    && table
2799                        .schema()
2800                        .foreign_keys
2801                        .iter()
2802                        .any(|f| f.name.as_ref() == Some(name))
2803                {
2804                    return Err(EngineError::Unsupported(alloc::format!(
2805                        "ALTER TABLE ADD CONSTRAINT: a constraint named {name:?} already exists"
2806                    )));
2807                }
2808                table.schema_mut().foreign_keys.push(storage_fk);
2809            }
2810            spg_sql::ast::AlterTableTarget::DropForeignKey(name) => {
2811                let table = self
2812                    .active_catalog_mut()
2813                    .get_mut(&s.name)
2814                    .ok_or_else(|| {
2815                        EngineError::Storage(StorageError::TableNotFound {
2816                            name: s.name.clone(),
2817                        })
2818                    })?;
2819                let fks = &mut table.schema_mut().foreign_keys;
2820                let before = fks.len();
2821                fks.retain(|f| f.name.as_ref() != Some(&name));
2822                if fks.len() == before {
2823                    return Err(EngineError::Unsupported(alloc::format!(
2824                        "ALTER TABLE DROP CONSTRAINT: no FK named {name:?} on {:?}",
2825                        s.name
2826                    )));
2827                }
2828            }
2829        }
2830        Ok(QueryResult::CommandOk {
2831            affected: 0,
2832            modified_catalog: !self.in_transaction(),
2833        })
2834    }
2835
2836    fn exec_alter_index(
2837        &mut self,
2838        stmt: spg_sql::ast::AlterIndexStatement,
2839    ) -> Result<QueryResult, EngineError> {
2840        // Translate the optional SQL-side encoding choice into the
2841        // storage-side enum; the same SqlVecEncoding -> VecEncoding
2842        // bridge `column_type_to_data_type` uses.
2843        let spg_sql::ast::AlterIndexStatement {
2844            name: idx_name,
2845            target,
2846        } = stmt;
2847        let spg_sql::ast::AlterIndexTarget::Rebuild { encoding } = target;
2848        let target = encoding.map(|e| match e {
2849            SqlVecEncoding::F32 => VecEncoding::F32,
2850            SqlVecEncoding::Sq8 => VecEncoding::Sq8,
2851            SqlVecEncoding::F16 => VecEncoding::F16,
2852        });
2853        // Linear scan: index names are globally unique within a
2854        // catalog (enforced by add_nsw_index_inner) so the first
2855        // match is the only one. Save the table name to avoid
2856        // borrowing while we then take a mut borrow.
2857        let table_name = {
2858            let cat = self.active_catalog();
2859            let mut found: Option<String> = None;
2860            for tname in cat.table_names() {
2861                if let Some(t) = cat.get(&tname)
2862                    && t.indices().iter().any(|i| i.name == idx_name)
2863                {
2864                    found = Some(tname);
2865                    break;
2866                }
2867            }
2868            found.ok_or_else(|| {
2869                EngineError::Storage(StorageError::IndexNotFound {
2870                    name: idx_name.clone(),
2871                })
2872            })?
2873        };
2874        let table = self
2875            .active_catalog_mut()
2876            .get_mut(&table_name)
2877            .expect("table found above");
2878        table.rebuild_nsw_index(&idx_name, target)?;
2879        // v6.3.1 — ALTER INDEX REBUILD potentially with new encoding
2880        // changes cost characteristics; evict any cached plans.
2881        self.plan_cache.evict_referencing(&table_name);
2882        Ok(QueryResult::CommandOk {
2883            affected: 0,
2884            modified_catalog: !self.in_transaction(),
2885        })
2886    }
2887
2888    fn exec_create_index(
2889        &mut self,
2890        stmt: CreateIndexStatement,
2891    ) -> Result<QueryResult, EngineError> {
2892        let table = self
2893            .active_catalog_mut()
2894            .get_mut(&stmt.table)
2895            .ok_or_else(|| {
2896                EngineError::Storage(StorageError::TableNotFound {
2897                    name: stmt.table.clone(),
2898                })
2899            })?;
2900        // `IF NOT EXISTS` reduces DuplicateIndex to a no-op CommandOk.
2901        if stmt.if_not_exists && table.indices().iter().any(|i| i.name == stmt.name) {
2902            return Ok(QueryResult::CommandOk {
2903                affected: 0,
2904                modified_catalog: false,
2905            });
2906        }
2907        let table_name = stmt.table.clone();
2908        // v6.8.0 — resolve INCLUDE column names to positions. Done
2909        // before `add_index` so a typo error surfaces before any
2910        // catalog mutation lands.
2911        let included_positions: Vec<usize> = if stmt.included_columns.is_empty() {
2912            Vec::new()
2913        } else {
2914            let schema = table.schema();
2915            stmt.included_columns
2916                .iter()
2917                .map(|c| {
2918                    schema.column_position(c).ok_or_else(|| {
2919                        EngineError::Storage(StorageError::ColumnNotFound {
2920                            column: c.clone(),
2921                        })
2922                    })
2923                })
2924                .collect::<Result<Vec<_>, _>>()?
2925        };
2926        match stmt.method {
2927            IndexMethod::BTree => table.add_index(stmt.name.clone(), &stmt.column)?,
2928            IndexMethod::Hnsw => {
2929                if !included_positions.is_empty() {
2930                    return Err(EngineError::Unsupported(
2931                        "INCLUDE columns are not supported on HNSW indexes".into(),
2932                    ));
2933                }
2934                table.add_nsw_index(stmt.name.clone(), &stmt.column, spg_storage::NSW_DEFAULT_M)?;
2935            }
2936            // v6.7.1 — BRIN. Pure metadata; no in-memory data.
2937            IndexMethod::Brin => {
2938                if !included_positions.is_empty() {
2939                    return Err(EngineError::Unsupported(
2940                        "INCLUDE columns are not supported on BRIN indexes".into(),
2941                    ));
2942                }
2943                table.add_brin_index(stmt.name.clone(), &stmt.column)?;
2944            }
2945        }
2946        if !included_positions.is_empty()
2947            && let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name)
2948        {
2949            idx.included_columns = included_positions;
2950        }
2951        // v6.8.1 — persist partial-index predicate. Stored as the
2952        // expression's Display form so the catalog snapshot stays
2953        // pure (storage has no spg-sql dependency). The runtime
2954        // maintenance path treats partial indexes identically to
2955        // full indexes for v6.8.1 (over-maintenance is safe; the
2956        // planner-side "use partial when query WHERE implies the
2957        // predicate" pass is STABILITY carve-out).
2958        if let Some(pred_expr) = &stmt.partial_predicate {
2959            let canonical = pred_expr.to_string();
2960            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2961                return Err(EngineError::Unsupported(
2962                    "WHERE predicates are not supported on HNSW or BRIN indexes".into(),
2963                ));
2964            }
2965            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
2966                idx.partial_predicate = Some(canonical);
2967            }
2968        }
2969        // v6.8.2 — persist expression index key. Same Display-form
2970        // storage; the runtime maintenance pass evaluates each
2971        // row's expression to derive the index key, but for v6.8.2
2972        // the engine falls through to the bare-column-reference
2973        // path and the expression is preserved for format-layer
2974        // round-trip + future planner work. Carved-out in
2975        // STABILITY § "Out of v6.8".
2976        if let Some(key_expr) = &stmt.expression {
2977            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2978                return Err(EngineError::Unsupported(
2979                    "Expression keys are not supported on HNSW or BRIN indexes".into(),
2980                ));
2981            }
2982            let canonical = key_expr.to_string();
2983            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
2984                idx.expression = Some(canonical);
2985            }
2986        }
2987        // v6.3.1 — adding an index can change the optimal plan for
2988        // any cached query that references this table.
2989        self.plan_cache.evict_referencing(&table_name);
2990        Ok(QueryResult::CommandOk {
2991            affected: 0,
2992            modified_catalog: !self.in_transaction(),
2993        })
2994    }
2995
2996    fn exec_create_table(
2997        &mut self,
2998        stmt: CreateTableStatement,
2999    ) -> Result<QueryResult, EngineError> {
3000        if stmt.if_not_exists && self.active_catalog().get(&stmt.name).is_some() {
3001            return Ok(QueryResult::CommandOk {
3002                affected: 0,
3003                modified_catalog: false,
3004            });
3005        }
3006        let table_name = stmt.name.clone();
3007        let cols = stmt
3008            .columns
3009            .into_iter()
3010            .map(column_def_to_schema)
3011            .collect::<Result<Vec<_>, _>>()?;
3012        // v7.6.1 — resolve every FK in the statement against the
3013        // already-known catalog. Validates: parent table exists,
3014        // parent column names exist, arity matches, parent columns
3015        // have a PK / UNIQUE index. Self-referencing FKs (parent
3016        // table == this table) resolve against the column list we
3017        // just built — they don't need the catalog yet.
3018        let mut fks: Vec<spg_storage::ForeignKeyConstraint> =
3019            Vec::with_capacity(stmt.foreign_keys.len());
3020        for fk in stmt.foreign_keys {
3021            fks.push(resolve_foreign_key(
3022                &table_name,
3023                &cols,
3024                fk,
3025                self.active_catalog(),
3026            )?);
3027        }
3028        let mut schema = TableSchema::new(table_name, cols);
3029        schema.foreign_keys = fks;
3030        self.active_catalog_mut().create_table(schema)?;
3031        Ok(QueryResult::CommandOk {
3032            affected: 0,
3033            modified_catalog: !self.in_transaction(),
3034        })
3035    }
3036
3037    fn exec_insert(&mut self, stmt: InsertStatement) -> Result<QueryResult, EngineError> {
3038        let table = self
3039            .active_catalog_mut()
3040            .get_mut(&stmt.table)
3041            .ok_or_else(|| {
3042                EngineError::Storage(StorageError::TableNotFound {
3043                    name: stmt.table.clone(),
3044                })
3045            })?;
3046        // v3.1.5: clone the columns vector only (not the whole
3047        // TableSchema — saves one String alloc for the table name).
3048        // We need an owned snapshot because we'll call `table.insert`
3049        // (mutable borrow on `table`) inside the row loop while
3050        // reading schema fields.
3051        let column_meta: Vec<ColumnSchema> = table.schema().columns.clone();
3052        let schema_cols_len = column_meta.len();
3053        // Build a permutation `tuple_pos[c] = Some(j)` meaning schema
3054        // column `c` is filled from the `j`-th tuple slot; `None` means
3055        // "fill with NULL". Validated once and reused for every row.
3056        let tuple_pos: Option<Vec<Option<usize>>> = match &stmt.columns {
3057            None => None, // 1-1 mapping, fast path
3058            Some(cols) => {
3059                let mut map = alloc::vec![None; schema_cols_len];
3060                for (j, name) in cols.iter().enumerate() {
3061                    let idx = column_meta
3062                        .iter()
3063                        .position(|c| c.name == *name)
3064                        .ok_or_else(|| {
3065                            EngineError::Eval(EvalError::ColumnNotFound { name: name.clone() })
3066                        })?;
3067                    if map[idx].is_some() {
3068                        return Err(EngineError::Storage(StorageError::ArityMismatch {
3069                            expected: schema_cols_len,
3070                            actual: cols.len(),
3071                        }));
3072                    }
3073                    map[idx] = Some(j);
3074                }
3075                // Omitted columns must either be nullable, carry a
3076                // DEFAULT, or be AUTO_INCREMENT. Catch NOT NULL
3077                // omissions up front so the WAL stays clean.
3078                for (i, col) in column_meta.iter().enumerate() {
3079                    if map[i].is_none()
3080                        && !col.nullable
3081                        && col.default.is_none()
3082                        && !col.auto_increment
3083                    {
3084                        return Err(EngineError::Storage(StorageError::NullInNotNull {
3085                            column: col.name.clone(),
3086                        }));
3087                    }
3088                }
3089                Some(map)
3090            }
3091        };
3092        let expected_tuple_len = stmt.columns.as_ref().map_or(schema_cols_len, Vec::len);
3093        // v7.6.2 — snapshot this table's FK list before the
3094        // mutable-borrow window so we can run parent lookups
3095        // against the immutable catalog after parsing. Empty vec is
3096        // the no-FK fast path; clone cost is O(fks * arity) which
3097        // is < 100 ns for typical schemas.
3098        let fks = table.schema().foreign_keys.clone();
3099        let mut affected = 0usize;
3100        // Stage 1 — parse + AUTO_INC + coerce all rows under the
3101        // single mutable borrow.
3102        let mut all_values: Vec<Vec<Value>> = Vec::with_capacity(stmt.rows.len());
3103        for tuple in stmt.rows {
3104            if tuple.len() != expected_tuple_len {
3105                return Err(EngineError::Storage(StorageError::ArityMismatch {
3106                    expected: expected_tuple_len,
3107                    actual: tuple.len(),
3108                }));
3109            }
3110            // Fast path: no column-list permutation → tuple slot j
3111            // maps to schema column j. We can zip schema with tuple
3112            // and skip the `raw_tuple` staging allocation entirely.
3113            let values: Vec<Value> = if let Some(map) = &tuple_pos {
3114                // Permuted path: still need raw_tuple to index by `map[i]`.
3115                let raw_tuple: Vec<Value> = tuple
3116                    .into_iter()
3117                    .map(literal_expr_to_value)
3118                    .collect::<Result<_, _>>()?;
3119                let mut out = Vec::with_capacity(schema_cols_len);
3120                for (i, col) in column_meta.iter().enumerate() {
3121                    let mut raw = match map[i] {
3122                        Some(j) => raw_tuple[j].clone(),
3123                        None => col.default.clone().unwrap_or(Value::Null),
3124                    };
3125                    if col.auto_increment && raw.is_null() {
3126                        let next = table.next_auto_value(i).ok_or_else(|| {
3127                            EngineError::Unsupported(alloc::format!(
3128                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3129                                col.name
3130                            ))
3131                        })?;
3132                        raw = Value::BigInt(next);
3133                    }
3134                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3135                }
3136                out
3137            } else {
3138                // 1-1 mapping fast path: single Vec alloc, no raw_tuple.
3139                let mut out = Vec::with_capacity(schema_cols_len);
3140                for (i, (col, expr)) in column_meta.iter().zip(tuple).enumerate() {
3141                    let mut raw = literal_expr_to_value(expr)?;
3142                    if col.auto_increment && raw.is_null() {
3143                        let next = table.next_auto_value(i).ok_or_else(|| {
3144                            EngineError::Unsupported(alloc::format!(
3145                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3146                                col.name
3147                            ))
3148                        })?;
3149                        raw = Value::BigInt(next);
3150                    }
3151                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3152                }
3153                out
3154            };
3155            all_values.push(values);
3156        }
3157        // Stage 2 — FK enforcement on the immutable catalog.
3158        // Non-lexical lifetimes release the mutable borrow on
3159        // `table` here since stage 1 was the last use. The
3160        // parent-table lookup runs before any row is committed.
3161        let _ = table;
3162        if !fks.is_empty() {
3163            enforce_fk_inserts(self.active_catalog(), &stmt.table, &fks, &all_values)?;
3164        }
3165        // v7.9.8 / v7.9.9 — ON CONFLICT handling.
3166        //   - `DO NOTHING` filters `all_values` to non-conflicting
3167        //     rows + drops within-batch duplicates.
3168        //   - `DO UPDATE SET …` ALSO filters, but for each
3169        //     conflicting row it queues an UPDATE on the existing
3170        //     row using the incoming row's values as `EXCLUDED.*`.
3171        let mut pending_updates: Vec<(usize, Vec<Value>)> = Vec::new();
3172        let mut skipped_count = 0usize;
3173        if let Some(clause) = &stmt.on_conflict {
3174            let conflict_cols = resolve_on_conflict_columns(
3175                self.active_catalog(),
3176                &stmt.table,
3177                clause.target_columns.as_slice(),
3178            )?;
3179            let mut kept: Vec<Vec<Value>> = Vec::with_capacity(all_values.len());
3180            let mut seen_keys: Vec<Vec<Value>> = Vec::new();
3181            for values in all_values {
3182                let key_tuple: Vec<&Value> =
3183                    conflict_cols.iter().map(|&c| &values[c]).collect();
3184                // SQL spec: NULL in any conflict column means "no
3185                // conflict possible" (NULL ≠ NULL for uniqueness).
3186                let has_null_key = key_tuple.iter().any(|v| matches!(v, Value::Null));
3187                let collides_with_table = !has_null_key
3188                    && on_conflict_keys_exist(
3189                        self.active_catalog(),
3190                        &stmt.table,
3191                        &conflict_cols,
3192                        &key_tuple,
3193                    );
3194                let key_tuple_owned: Vec<Value> =
3195                    key_tuple.iter().map(|v| (*v).clone()).collect();
3196                let collides_with_batch = !has_null_key
3197                    && seen_keys.iter().any(|k| k == &key_tuple_owned);
3198                let collides = collides_with_table || collides_with_batch;
3199                match (&clause.action, collides) {
3200                    (_, false) => {
3201                        seen_keys.push(key_tuple_owned);
3202                        kept.push(values);
3203                    }
3204                    (spg_sql::ast::OnConflictAction::Nothing, true) => {
3205                        skipped_count += 1;
3206                    }
3207                    (
3208                        spg_sql::ast::OnConflictAction::Update {
3209                            assignments,
3210                            where_,
3211                        },
3212                        true,
3213                    ) => {
3214                        if !collides_with_table {
3215                            skipped_count += 1;
3216                            continue;
3217                        }
3218                        let target_pos = lookup_row_position_by_keys(
3219                            self.active_catalog(),
3220                            &stmt.table,
3221                            &conflict_cols,
3222                            &key_tuple,
3223                        )
3224                        .ok_or_else(|| {
3225                            EngineError::Unsupported(
3226                                "ON CONFLICT DO UPDATE: conflict detected but row \
3227                                 position could not be resolved (cold-tier row?)"
3228                                    .into(),
3229                            )
3230                        })?;
3231                        let updated = apply_on_conflict_assignments(
3232                            self.active_catalog(),
3233                            &stmt.table,
3234                            target_pos,
3235                            &values,
3236                            assignments,
3237                            where_.as_ref(),
3238                        )?;
3239                        if let Some(new_row) = updated {
3240                            pending_updates.push((target_pos, new_row));
3241                        } else {
3242                            skipped_count += 1;
3243                        }
3244                    }
3245                }
3246            }
3247            all_values = kept;
3248        }
3249        // Stage 3 — insert all rows under a fresh mutable borrow.
3250        let table = self
3251            .active_catalog_mut()
3252            .get_mut(&stmt.table)
3253            .ok_or_else(|| {
3254                EngineError::Storage(StorageError::TableNotFound {
3255                    name: stmt.table.clone(),
3256                })
3257            })?;
3258        // v7.9.4 — keep RETURNING projection rows separate per
3259        // INSERT and per UPDATE branch so DO UPDATE pushes the new
3260        // post-update state, not the incoming-only values.
3261        let mut returning_rows: Vec<Vec<Value>> = Vec::new();
3262        for values in all_values {
3263            if stmt.returning.is_some() {
3264                returning_rows.push(values.clone());
3265            }
3266            table.insert(Row::new(values))?;
3267            affected += 1;
3268        }
3269        // v7.9.9 — apply ON CONFLICT DO UPDATE rewrites collected
3270        // in the conflict-resolution pass. update_row handles
3271        // index maintenance + body re-encoding.
3272        for (pos, new_row) in pending_updates {
3273            if stmt.returning.is_some() {
3274                returning_rows.push(new_row.clone());
3275            }
3276            table.update_row(pos, new_row)?;
3277            affected += 1;
3278        }
3279        let _ = skipped_count;
3280        // v7.9.4/v7.9.9 — RETURNING streams the rows that ended
3281        // up in the table after this statement (insert or
3282        // post-update on conflict).
3283        if let Some(items) = &stmt.returning {
3284            let _ = table;
3285            return self.build_returning_rows(
3286                &stmt.table,
3287                items,
3288                returning_rows,
3289            );
3290        }
3291        // v6.2.1 — auto-analyze: track per-table modified-row
3292        // counter so the background sweep can decide when to
3293        // re-ANALYZE. Cheap path on the autocommit-wrap hot loop
3294        // — one BTreeMap entry update per INSERT batch.
3295        if !self.in_transaction() && affected > 0 {
3296            self.statistics
3297                .record_modifications(&stmt.table, affected as u64);
3298        }
3299        Ok(QueryResult::CommandOk {
3300            affected,
3301            modified_catalog: !self.in_transaction(),
3302        })
3303    }
3304
3305    /// v4.5: SELECT with cooperative cancellation. The token is
3306    /// honoured between UNION peers and inside the bare-SELECT row
3307    /// loop; HNSW kNN graph walks and the aggregate executor don't
3308    /// honour it yet (deferred — those paths bound their work
3309    /// internally by `LIMIT k` and `GROUP BY` cardinality).
3310    /// v6.10.2 — cold-tier time-travel scan. Resolves the segment
3311    /// by id, decodes each row body against the table's current
3312    /// schema, applies the SELECT's projection + optional WHERE +
3313    /// optional LIMIT, returns a `Rows` result. JOINs / aggregates
3314    /// / ORDER BY are unsupported on this path (STABILITY carve-
3315    /// out); operators wanting them should restore the segment
3316    /// into a regular table first.
3317    fn exec_select_as_of_segment(
3318        &self,
3319        stmt: &SelectStatement,
3320        from: &spg_sql::ast::FromClause,
3321        segment_id: u32,
3322    ) -> Result<QueryResult, EngineError> {
3323        // v6.10.2 scope: no joins, no aggregates, no ORDER BY,
3324        // no GROUP BY / HAVING / UNION / OFFSET / DISTINCT.
3325        if !from.joins.is_empty()
3326            || stmt.group_by.is_some()
3327            || stmt.having.is_some()
3328            || !stmt.unions.is_empty()
3329            || !stmt.order_by.is_empty()
3330            || stmt.offset.is_some()
3331            || stmt.distinct
3332            || aggregate::uses_aggregate(stmt)
3333        {
3334            return Err(EngineError::Unsupported(
3335                "AS OF SEGMENT supports SELECT projection + WHERE + LIMIT only \
3336                 (joins / aggregates / ORDER BY are STABILITY § \"Out of v6.10\")"
3337                    .into(),
3338            ));
3339        }
3340        let table = self
3341            .active_catalog()
3342            .get(&from.primary.name)
3343            .ok_or_else(|| StorageError::TableNotFound {
3344                name: from.primary.name.clone(),
3345            })?;
3346        let schema = table.schema().clone();
3347        let schema_cols = &schema.columns;
3348        let alias = from
3349            .primary
3350            .alias
3351            .as_deref()
3352            .unwrap_or(from.primary.name.as_str());
3353        let ctx = EvalContext::new(schema_cols, Some(alias));
3354        let seg = self
3355            .active_catalog()
3356            .cold_segment(segment_id)
3357            .ok_or_else(|| {
3358                EngineError::Unsupported(alloc::format!(
3359                    "AS OF SEGMENT: cold segment {segment_id} not registered"
3360                ))
3361            })?;
3362        let mut out_rows: Vec<Row> = Vec::new();
3363        let mut limit_remaining: Option<usize> =
3364            stmt.limit.as_ref().and_then(|n| usize::try_from(*n).ok());
3365        for (_key, body) in seg.scan() {
3366            let (row, _consumed) = spg_storage::decode_row_body_dense(&body, &schema)
3367                .map_err(EngineError::Storage)?;
3368            if let Some(where_expr) = &stmt.where_ {
3369                let cond = self.eval_expr_simple(where_expr, &row, &ctx)?;
3370                if !matches!(cond, Value::Bool(true)) {
3371                    continue;
3372                }
3373            }
3374            // Projection.
3375            let projected = self.project_row_simple(&row, &stmt.items, schema_cols, alias)?;
3376            out_rows.push(projected);
3377            if let Some(rem) = limit_remaining.as_mut() {
3378                if *rem == 0 {
3379                    out_rows.pop();
3380                    break;
3381                }
3382                *rem -= 1;
3383            }
3384        }
3385        // Output column schema: derive from SELECT items.
3386        let columns = self.derive_output_columns(&stmt.items, schema_cols, alias);
3387        Ok(QueryResult::Rows {
3388            columns,
3389            rows: out_rows,
3390        })
3391    }
3392
3393    /// v6.10.2 — simple-path WHERE eval that doesn't go through
3394    /// the correlated-subquery / Memoize machinery. AS OF SEGMENT
3395    /// scan paths predicate against a snapshot frozen segment, no
3396    /// cross-row state.
3397    fn eval_expr_simple(
3398        &self,
3399        expr: &Expr,
3400        row: &Row,
3401        ctx: &EvalContext,
3402    ) -> Result<Value, EngineError> {
3403        let cancel = CancelToken::none();
3404        self.eval_expr_with_correlated(expr, row, ctx, cancel, None)
3405    }
3406
3407    /// v7.9.4 — INSERT / UPDATE / DELETE RETURNING projector.
3408    /// Given the table name, the user-supplied projection items,
3409    /// and the mutated rows (post-insert / post-update values, or
3410    /// pre-delete snapshot), build a `QueryResult::Rows` whose
3411    /// schema describes the projected columns. Mailrs migration
3412    /// blocker #1.
3413    fn build_returning_rows(
3414        &self,
3415        table_name: &str,
3416        items: &[SelectItem],
3417        mutated_rows: Vec<Vec<Value>>,
3418    ) -> Result<QueryResult, EngineError> {
3419        let table = self.active_catalog().get(table_name).ok_or_else(|| {
3420            EngineError::Storage(StorageError::TableNotFound {
3421                name: table_name.into(),
3422            })
3423        })?;
3424        let schema_cols = table.schema().columns.clone();
3425        let columns = self.derive_output_columns(items, &schema_cols, table_name);
3426        let mut out_rows: Vec<Row> = Vec::with_capacity(mutated_rows.len());
3427        for values in mutated_rows {
3428            let row = Row::new(values);
3429            let projected = self.project_row_simple(&row, items, &schema_cols, table_name)?;
3430            out_rows.push(projected);
3431        }
3432        Ok(QueryResult::Rows {
3433            columns,
3434            rows: out_rows,
3435        })
3436    }
3437
3438    /// v6.10.2 — projection for AS OF SEGMENT. Resolves
3439    /// `SelectItem::Wildcard` to all schema columns and
3440    /// `SelectItem::Expr` via the regular eval path.
3441    fn project_row_simple(
3442        &self,
3443        row: &Row,
3444        items: &[SelectItem],
3445        schema_cols: &[ColumnSchema],
3446        alias: &str,
3447    ) -> Result<Row, EngineError> {
3448        let ctx = EvalContext::new(schema_cols, Some(alias));
3449        let cancel = CancelToken::none();
3450        let mut out_vals = Vec::new();
3451        for item in items {
3452            match item {
3453                SelectItem::Wildcard => {
3454                    out_vals.extend(row.values.iter().cloned());
3455                }
3456                SelectItem::Expr { expr, .. } => {
3457                    let v = self.eval_expr_with_correlated(expr, row, &ctx, cancel, None)?;
3458                    out_vals.push(v);
3459                }
3460            }
3461        }
3462        Ok(Row::new(out_vals))
3463    }
3464
3465    /// v6.10.2 — derive the output `ColumnSchema` list for an
3466    /// AS OF SEGMENT projection. Wildcards take the full schema;
3467    /// expressions take the alias if present or a synthetic
3468    /// `?column?` (PG convention) otherwise.
3469    fn derive_output_columns(
3470        &self,
3471        items: &[SelectItem],
3472        schema_cols: &[ColumnSchema],
3473        _alias: &str,
3474    ) -> Vec<ColumnSchema> {
3475        let mut out = Vec::new();
3476        for item in items {
3477            match item {
3478                SelectItem::Wildcard => {
3479                    out.extend(schema_cols.iter().cloned());
3480                }
3481                SelectItem::Expr { alias, .. } => {
3482                    let name = alias
3483                        .clone()
3484                        .unwrap_or_else(|| "?column?".to_string());
3485                    // Default to Text; the caller's row values
3486                    // carry the actual type. v6.10.2 scope.
3487                    out.push(ColumnSchema::new(name, DataType::Text, true));
3488                }
3489            }
3490        }
3491        out
3492    }
3493
3494    fn exec_select_cancel(
3495        &self,
3496        stmt: &SelectStatement,
3497        cancel: CancelToken<'_>,
3498    ) -> Result<QueryResult, EngineError> {
3499        cancel.check()?;
3500        // v6.10.2 — cold-tier time-travel short-circuit. When the
3501        // primary TableRef carries `AS OF SEGMENT '<id>'`, run a
3502        // dedicated cold-segment scan instead of the regular
3503        // hot+index path. The scope is intentionally narrow for
3504        // v6.10.2 — bare `SELECT * FROM <t> AS OF SEGMENT 'id'`,
3505        // optionally with a single-column-equality WHERE. JOINs /
3506        // aggregates / ORDER BY / subqueries on top of a time-
3507        // travelled scan are STABILITY § "Out of v6.10".
3508        if let Some(from) = &stmt.from
3509            && let Some(seg_id) = from.primary.as_of_segment
3510        {
3511            return self.exec_select_as_of_segment(stmt, from, seg_id);
3512        }
3513        // v6.2.0 / v6.5.0 — virtual-table short-circuits. Detected
3514        // pre-CTE because they don't read from the catalog and
3515        // shouldn't participate in regular FROM resolution.
3516        if let Some(from) = &stmt.from
3517            && from.joins.is_empty()
3518            && stmt.where_.is_none()
3519            && stmt.group_by.is_none()
3520            && stmt.having.is_none()
3521            && stmt.unions.is_empty()
3522            && stmt.order_by.is_empty()
3523            && stmt.limit.is_none()
3524            && stmt.offset.is_none()
3525            && !stmt.distinct
3526            && stmt.items.iter().all(|i| matches!(i, SelectItem::Wildcard))
3527        {
3528            let lower = from.primary.name.to_ascii_lowercase();
3529            match lower.as_str() {
3530                "spg_statistic" => return Ok(self.exec_spg_statistic()),
3531                // v6.5.0 — observability v2 virtual tables.
3532                "spg_stat_replication" => return Ok(self.exec_spg_stat_replication()),
3533                "spg_stat_segment" => return Ok(self.exec_spg_stat_segment()),
3534                "spg_stat_query" => return Ok(self.exec_spg_stat_query()),
3535                "spg_stat_activity" => return Ok(self.exec_spg_stat_activity()),
3536                "spg_audit_chain" => return Ok(self.exec_spg_audit_chain()),
3537                "spg_audit_verify" => return Ok(self.exec_spg_audit_verify()),
3538                "spg_table_ddl" => return Ok(self.exec_spg_table_ddl()),
3539                "spg_role_ddl" => return Ok(self.exec_spg_role_ddl()),
3540                "spg_database_ddl" => return Ok(self.exec_spg_database_ddl()),
3541                _ => {}
3542            }
3543        }
3544        // v4.11: CTEs materialise into a temporary enriched catalog
3545        // *before* anything else — the body SELECT can then refer
3546        // to CTE names via the regular FROM-clause resolution.
3547        // Uncorrelated only: each CTE body runs once against the
3548        // current catalog, not against later CTEs' results (left-
3549        // to-right materialisation would relax this, but we keep
3550        // it simple for v4.11 MVP).
3551        if !stmt.ctes.is_empty() {
3552            return self.exec_with_ctes(stmt, cancel);
3553        }
3554        // v4.10: subqueries (uncorrelated) are resolved here, before
3555        // the executor sees the row loop. We clone the statement so
3556        // we can mutate without disturbing the caller's AST — most
3557        // queries pass through with no subquery nodes and the clone
3558        // is cheap; with subqueries the materialisation cost
3559        // dominates anyway.
3560        let mut stmt_owned;
3561        let stmt_ref: &SelectStatement = if expr_tree_has_subquery(stmt) {
3562            stmt_owned = stmt.clone();
3563            self.resolve_select_subqueries(&mut stmt_owned, cancel)?;
3564            &stmt_owned
3565        } else {
3566            stmt
3567        };
3568        if stmt_ref.unions.is_empty() {
3569            return self.exec_bare_select_cancel(stmt_ref, cancel);
3570        }
3571        // UNION path: clone-strip the head into a bare block (its own
3572        // DISTINCT and any inner ORDER BY are dropped by parser rule —
3573        // the wrapper SelectStatement carries them), execute, then chain
3574        // peers with left-associative dedup semantics.
3575        let mut head = stmt_ref.clone();
3576        head.unions = Vec::new();
3577        head.order_by = Vec::new();
3578        head.limit = None;
3579        let QueryResult::Rows { columns, mut rows } =
3580            self.exec_bare_select_cancel(&head, cancel)?
3581        else {
3582            unreachable!("bare SELECT cannot return CommandOk")
3583        };
3584        for (kind, peer) in &stmt_ref.unions {
3585            let QueryResult::Rows {
3586                columns: peer_cols,
3587                rows: peer_rows,
3588            } = self.exec_bare_select_cancel(peer, cancel)?
3589            else {
3590                unreachable!("bare SELECT cannot return CommandOk")
3591            };
3592            if peer_cols.len() != columns.len() {
3593                return Err(EngineError::Unsupported(alloc::format!(
3594                    "UNION arity mismatch: head has {} columns, peer has {}",
3595                    columns.len(),
3596                    peer_cols.len()
3597                )));
3598            }
3599            rows.extend(peer_rows);
3600            if matches!(kind, UnionKind::Distinct) {
3601                rows = dedup_rows(rows);
3602            }
3603        }
3604        // ORDER BY at the top of a UNION applies to the combined result.
3605        // Eval against the projected schema (NOT the source table).
3606        if !stmt.order_by.is_empty() {
3607            let synth_ctx = EvalContext::new(&columns, None);
3608            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3609            let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(rows.len());
3610            for r in rows {
3611                let keys = build_order_keys(&stmt.order_by, &r, &synth_ctx)?;
3612                tagged.push((keys, r));
3613            }
3614            sort_by_keys(&mut tagged, &descs);
3615            rows = tagged.into_iter().map(|(_, r)| r).collect();
3616        }
3617        apply_offset_and_limit(&mut rows, stmt.offset, stmt.limit);
3618        Ok(QueryResult::Rows { columns, rows })
3619    }
3620
3621    #[allow(clippy::too_many_lines)]
3622    #[allow(clippy::too_many_lines)] // huge match — splitting fragments the planner
3623    fn exec_bare_select_cancel(
3624        &self,
3625        stmt: &SelectStatement,
3626        cancel: CancelToken<'_>,
3627    ) -> Result<QueryResult, EngineError> {
3628        // v4.12: window-function path. When the projection contains
3629        // any `name(args) OVER (...)` we route to the dedicated
3630        // executor — partition + sort + per-row window value before
3631        // the regular projection.
3632        if select_has_window(stmt) {
3633            return self.exec_select_with_window(stmt, cancel);
3634        }
3635        // Constant SELECT (no FROM) — evaluate each item once against an
3636        // empty dummy row. Useful for `SELECT 1`, `SELECT coalesce(...)`,
3637        // `SELECT '7'::INT`. Column references will surface as
3638        // ColumnNotFound on eval since the schema is empty.
3639        let Some(from) = &stmt.from else {
3640            let empty_schema: Vec<ColumnSchema> = Vec::new();
3641            let ctx = EvalContext::new(&empty_schema, None);
3642            let projection = build_projection(&stmt.items, &empty_schema, "")?;
3643            let dummy_row = Row::new(Vec::new());
3644            let mut values = Vec::with_capacity(projection.len());
3645            for p in &projection {
3646                values.push(eval::eval_expr(&p.expr, &dummy_row, &ctx)?);
3647            }
3648            let columns: Vec<ColumnSchema> = projection
3649                .into_iter()
3650                .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3651                .collect();
3652            return Ok(QueryResult::Rows {
3653                columns,
3654                rows: alloc::vec![Row::new(values)],
3655            });
3656        };
3657        // Multi-table FROM (one or more joined peers) goes through the
3658        // nested-loop join executor. Single-table FROM stays on the
3659        // existing scan + index-seek path.
3660        if !from.joins.is_empty() {
3661            return self.exec_joined_select(stmt, from);
3662        }
3663        let primary = &from.primary;
3664        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
3665            StorageError::TableNotFound {
3666                name: primary.name.clone(),
3667            }
3668        })?;
3669        let schema_cols = &table.schema().columns;
3670        // The qualifier accepted on column refs is the alias (if any) else the
3671        // bare table name.
3672        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
3673        let ctx = EvalContext::new(schema_cols, Some(alias));
3674
3675        // NSW kNN planner: `ORDER BY col <-> literal LIMIT k` with no
3676        // WHERE and an NSW index on `col` skips the full scan. The
3677        // walk returns rows already in ascending-distance order, so
3678        // ORDER BY / LIMIT are honoured implicitly.
3679        if let Some(nsw_rows) = try_nsw_knn(stmt, table, schema_cols, alias) {
3680            return materialise_in_order(stmt, table, schema_cols, alias, &nsw_rows);
3681        }
3682
3683        // Index seek: if WHERE is `col = literal` (or commuted) and the
3684        // referenced column has an index, dispatch each locator through
3685        // the catalog (hot tier → borrow, cold tier → page-read +
3686        // decode) and iterate just those rows. Otherwise fall back to a
3687        // full scan over the hot tier (cold-tier rows are only reached
3688        // via index seek in v5.1 — full table scans against cold-tier
3689        // data ship in v5.2 with the freezer's per-segment scan API).
3690        let indexed_rows: Option<Vec<Cow<'_, Row>>> = stmt
3691            .where_
3692            .as_ref()
3693            .and_then(|w| try_index_seek(w, schema_cols, self.active_catalog(), table, alias));
3694
3695        // Aggregate path: filter rows first, then hand off to the
3696        // aggregate executor which does its own projection + ORDER BY.
3697        if aggregate::uses_aggregate(stmt) {
3698            let mut filtered: Vec<&Row> = Vec::new();
3699            // v6.2.6 — Memoize: per-query LRU cache for correlated
3700            // scalar subqueries. Fresh per row-loop entry so each
3701            // SELECT execution gets an isolated cache.
3702            let mut memo = memoize::MemoizeCache::new();
3703            if let Some(rows) = &indexed_rows {
3704                for cow in rows {
3705                    let row = cow.as_ref();
3706                    if let Some(where_expr) = &stmt.where_ {
3707                        let cond = self.eval_expr_with_correlated(
3708                            where_expr,
3709                            row,
3710                            &ctx,
3711                            cancel,
3712                            Some(&mut memo),
3713                        )?;
3714                        if !matches!(cond, Value::Bool(true)) {
3715                            continue;
3716                        }
3717                    }
3718                    filtered.push(row);
3719                }
3720            } else {
3721                for i in 0..table.row_count() {
3722                    let row = &table.rows()[i];
3723                    if let Some(where_expr) = &stmt.where_ {
3724                        let cond = self.eval_expr_with_correlated(
3725                            where_expr,
3726                            row,
3727                            &ctx,
3728                            cancel,
3729                            Some(&mut memo),
3730                        )?;
3731                        if !matches!(cond, Value::Bool(true)) {
3732                            continue;
3733                        }
3734                    }
3735                    filtered.push(row);
3736                }
3737            }
3738            let mut agg = aggregate::run(stmt, &filtered, schema_cols, Some(alias))?;
3739            apply_offset_and_limit(&mut agg.rows, stmt.offset, stmt.limit);
3740            return Ok(QueryResult::Rows {
3741                columns: agg.columns,
3742                rows: agg.rows,
3743            });
3744        }
3745
3746        let projection = build_projection(&stmt.items, schema_cols, alias)?;
3747
3748        // Materialise the filter pass into `(order_key, projected_row)`
3749        // tuples. The order key is `None` when there's no ORDER BY clause.
3750        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
3751        // v6.2.6 — Memoize per-row WHERE eval shares one cache.
3752        let mut memo = memoize::MemoizeCache::new();
3753        // Inline the per-row work in a closure so the indexed and full-
3754        // scan branches share the body.
3755        let mut process_row = |row: &Row, loop_idx: usize| -> Result<(), EngineError> {
3756            if loop_idx.is_multiple_of(256) {
3757                cancel.check()?;
3758            }
3759            if let Some(where_expr) = &stmt.where_ {
3760                let cond = self.eval_expr_with_correlated(
3761                    where_expr,
3762                    row,
3763                    &ctx,
3764                    cancel,
3765                    Some(&mut memo),
3766                )?;
3767                if !matches!(cond, Value::Bool(true)) {
3768                    return Ok(());
3769                }
3770            }
3771            let mut values = Vec::with_capacity(projection.len());
3772            for p in &projection {
3773                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
3774            }
3775            let order_keys = if stmt.order_by.is_empty() {
3776                Vec::new()
3777            } else {
3778                build_order_keys(&stmt.order_by, row, &ctx)?
3779            };
3780            tagged.push((order_keys, Row::new(values)));
3781            Ok(())
3782        };
3783        if let Some(rows) = &indexed_rows {
3784            for (loop_idx, cow) in rows.iter().enumerate() {
3785                process_row(cow.as_ref(), loop_idx)?;
3786            }
3787        } else {
3788            for i in 0..table.row_count() {
3789                process_row(&table.rows()[i], i)?;
3790            }
3791        }
3792
3793        if !stmt.order_by.is_empty() {
3794            // Partial-sort fast path: when LIMIT is small relative to
3795            // the row count, select_nth_unstable + sort just the
3796            // prefix is O(n + k log k) instead of O(n log n). DISTINCT
3797            // requires the full sort because de-dup happens after.
3798            let keep = if stmt.distinct {
3799                None
3800            } else {
3801                stmt.limit
3802                    .map(|l| l as usize + stmt.offset.map_or(0, |o| o as usize))
3803            };
3804            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3805            partial_sort_tagged(&mut tagged, keep, &descs);
3806        }
3807
3808        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
3809        if stmt.distinct {
3810            output_rows = dedup_rows(output_rows);
3811        }
3812        apply_offset_and_limit(&mut output_rows, stmt.offset, stmt.limit);
3813
3814        let columns: Vec<ColumnSchema> = projection
3815            .into_iter()
3816            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3817            .collect();
3818
3819        Ok(QueryResult::Rows {
3820            columns,
3821            rows: output_rows,
3822        })
3823    }
3824
3825    /// Multi-table SELECT executor (one or more JOIN peers).
3826    ///
3827    /// v1.10 builds the joined row set up-front via nested-loop joins,
3828    /// then runs WHERE + projection + ORDER BY against the combined
3829    /// rows. No index seek. Aggregates and DISTINCT still work because
3830    /// the executor delegates projection through the same shared paths.
3831    #[allow(clippy::too_many_lines)]
3832    fn exec_joined_select(
3833        &self,
3834        stmt: &SelectStatement,
3835        from: &FromClause,
3836    ) -> Result<QueryResult, EngineError> {
3837        // Resolve every table reference up front so we surface
3838        // TableNotFound before we start the cartesian work.
3839        let primary_table = self
3840            .active_catalog()
3841            .get(&from.primary.name)
3842            .ok_or_else(|| StorageError::TableNotFound {
3843                name: from.primary.name.clone(),
3844            })?;
3845        let primary_alias = from
3846            .primary
3847            .alias
3848            .as_deref()
3849            .unwrap_or(from.primary.name.as_str())
3850            .to_string();
3851        let mut joined_tables: Vec<(&Table, String, JoinKind, Option<&Expr>)> = Vec::new();
3852        for j in &from.joins {
3853            let t = self.active_catalog().get(&j.table.name).ok_or_else(|| {
3854                StorageError::TableNotFound {
3855                    name: j.table.name.clone(),
3856                }
3857            })?;
3858            let a = j
3859                .table
3860                .alias
3861                .as_deref()
3862                .unwrap_or(j.table.name.as_str())
3863                .to_string();
3864            joined_tables.push((t, a, j.kind, j.on.as_ref()));
3865        }
3866
3867        // Build the combined schema: composite "alias.col" names so the
3868        // qualified-column resolver can find anything by exact match.
3869        let mut combined_schema: Vec<ColumnSchema> = Vec::new();
3870        for col in &primary_table.schema().columns {
3871            combined_schema.push(ColumnSchema::new(
3872                alloc::format!("{primary_alias}.{}", col.name),
3873                col.ty,
3874                col.nullable,
3875            ));
3876        }
3877        for (t, a, _, _) in &joined_tables {
3878            for col in &t.schema().columns {
3879                combined_schema.push(ColumnSchema::new(
3880                    alloc::format!("{a}.{}", col.name),
3881                    col.ty,
3882                    col.nullable,
3883                ));
3884            }
3885        }
3886        let ctx = EvalContext::new(&combined_schema, None);
3887
3888        // Nested-loop join. Starting set: every primary row, padded with
3889        // (no joined columns yet).
3890        let mut working: Vec<Row> = primary_table.rows().iter().cloned().collect();
3891        let mut produced_len = primary_table.schema().columns.len();
3892        for (t, _, kind, on) in &joined_tables {
3893            let right_arity = t.schema().columns.len();
3894            let mut next: Vec<Row> = Vec::new();
3895            for left in &working {
3896                let mut left_matched = false;
3897                for right in t.rows() {
3898                    let mut combined_vals = left.values.clone();
3899                    combined_vals.extend(right.values.iter().cloned());
3900                    // Pad combined to the eventual full width so the
3901                    // partial schema still matches positions used by ON.
3902                    let combined = Row::new(combined_vals);
3903                    let keep = if let Some(on_expr) = on {
3904                        let cond = eval::eval_expr(on_expr, &combined, &ctx)?;
3905                        matches!(cond, Value::Bool(true))
3906                    } else {
3907                        // CROSS / comma-list: every pair survives.
3908                        true
3909                    };
3910                    if keep {
3911                        next.push(combined);
3912                        left_matched = true;
3913                    }
3914                }
3915                if !left_matched && matches!(kind, JoinKind::Left) {
3916                    // LEFT OUTER JOIN: emit the left row with NULLs on
3917                    // the right side when no peer matched.
3918                    let mut combined_vals = left.values.clone();
3919                    for _ in 0..right_arity {
3920                        combined_vals.push(Value::Null);
3921                    }
3922                    next.push(Row::new(combined_vals));
3923                }
3924            }
3925            working = next;
3926            produced_len += right_arity;
3927            debug_assert!(produced_len <= combined_schema.len());
3928        }
3929
3930        // WHERE filter against combined rows.
3931        let mut filtered: Vec<Row> = Vec::new();
3932        for row in working {
3933            if let Some(where_expr) = &stmt.where_ {
3934                let cond = eval::eval_expr(where_expr, &row, &ctx)?;
3935                if !matches!(cond, Value::Bool(true)) {
3936                    continue;
3937                }
3938            }
3939            filtered.push(row);
3940        }
3941
3942        // Aggregate path: handle GROUP BY / aggregate calls over the
3943        // joined+filtered rows.
3944        if aggregate::uses_aggregate(stmt) {
3945            let refs: Vec<&Row> = filtered.iter().collect();
3946            let mut agg = aggregate::run(stmt, &refs, &combined_schema, None)?;
3947            apply_offset_and_limit(&mut agg.rows, stmt.offset, stmt.limit);
3948            return Ok(QueryResult::Rows {
3949                columns: agg.columns,
3950                rows: agg.rows,
3951            });
3952        }
3953
3954        let projection = build_projection(&stmt.items, &combined_schema, "")?;
3955        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
3956        for row in &filtered {
3957            let mut values = Vec::with_capacity(projection.len());
3958            for p in &projection {
3959                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
3960            }
3961            let order_keys = if stmt.order_by.is_empty() {
3962                Vec::new()
3963            } else {
3964                build_order_keys(&stmt.order_by, row, &ctx)?
3965            };
3966            tagged.push((order_keys, Row::new(values)));
3967        }
3968        if !stmt.order_by.is_empty() {
3969            let keep = if stmt.distinct {
3970                None
3971            } else {
3972                stmt.limit
3973                    .map(|l| l as usize + stmt.offset.map_or(0, |o| o as usize))
3974            };
3975            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3976            partial_sort_tagged(&mut tagged, keep, &descs);
3977        }
3978        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
3979        if stmt.distinct {
3980            output_rows = dedup_rows(output_rows);
3981        }
3982        apply_offset_and_limit(&mut output_rows, stmt.offset, stmt.limit);
3983        let columns: Vec<ColumnSchema> = projection
3984            .into_iter()
3985            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3986            .collect();
3987        Ok(QueryResult::Rows {
3988            columns,
3989            rows: output_rows,
3990        })
3991    }
3992}
3993
/// One row-producing projection: an expression to evaluate, the resulting
/// column's user-visible name, its inferred type, and nullability.
///
/// The select executors in this file evaluate `expr` once per input row
/// and then turn each item into an output column via
/// `ColumnSchema::new(output_name, ty, nullable)`.
#[derive(Debug, Clone)]
struct ProjectedItem {
    /// Expression evaluated against each source row to produce the value.
    expr: Expr,
    /// Column name reported in the result set.
    output_name: String,
    /// Type reported for the output column.
    ty: DataType,
    /// Whether the output column is reported as nullable.
    nullable: bool,
}
4003
4004/// Dedupe a row set, preserving first-seen order. `Row`'s `PartialEq` is
4005/// structural (`Vec<Value>` ⇒ pairwise `Value` equality), which gives SQL
4006/// `NULL = NULL → TRUE` and `NaN = NaN → FALSE`. The first agrees with
4007/// the spec's "two NULLs are not distinct"; the second is a tolerated
4008/// quirk for v1 (no NaN literals are reachable from the SQL surface).
4009fn dedup_rows(rows: Vec<Row>) -> Vec<Row> {
4010    let mut out: Vec<Row> = Vec::with_capacity(rows.len());
4011    for r in rows {
4012        if !out.iter().any(|seen| seen == &r) {
4013            out.push(r);
4014        }
4015    }
4016    out
4017}
4018
4019/// Coerce a `Value` to an `f64` sort key for ORDER BY. Numbers map directly;
4020/// NULL sorts last (treated as `+∞`); booleans are 0.0 / 1.0; text uses lex
4021/// order via the byte values; vectors are not sortable.
4022fn value_to_order_key(v: &Value) -> Result<f64, EngineError> {
4023    match v {
4024        Value::Null => Ok(f64::INFINITY),
4025        Value::SmallInt(n) => Ok(f64::from(*n)),
4026        Value::Int(n) => Ok(f64::from(*n)),
4027        Value::Date(d) => Ok(f64::from(*d)),
4028        #[allow(clippy::cast_precision_loss)]
4029        Value::Timestamp(t) => Ok(*t as f64),
4030        #[allow(clippy::cast_precision_loss)]
4031        Value::Numeric { scaled, scale } => {
4032            // Scaled integer / 10^scale, computed via f64 for sort
4033            // ordering only. Precision losses here only matter for
4034            // ORDER BY tie-breaks well past 15 significant digits.
4035            // `f64::powi` lives in std; we hand-roll the loop so the
4036            // no_std engine crate doesn't need it.
4037            let mut divisor = 1.0_f64;
4038            for _ in 0..*scale {
4039                divisor *= 10.0;
4040            }
4041            Ok((*scaled as f64) / divisor)
4042        }
4043        #[allow(clippy::cast_precision_loss)]
4044        Value::BigInt(n) => Ok(*n as f64),
4045        Value::Float(x) => Ok(*x),
4046        Value::Bool(b) => Ok(if *b { 1.0 } else { 0.0 }),
4047        Value::Text(s) => {
4048            // Lex order by codepoints — good enough for ORDER BY name.
4049            // Map first 8 bytes packed into u64 as a coarse key; ties fall to
4050            // partial_cmp Equal. v1.x can swap in a real string comparator.
4051            let mut key: u64 = 0;
4052            for &b in s.as_bytes().iter().take(8) {
4053                key = (key << 8) | u64::from(b);
4054            }
4055            #[allow(clippy::cast_precision_loss)]
4056            Ok(key as f64)
4057        }
4058        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
4059            Err(EngineError::Unsupported(
4060                "ORDER BY of a raw vector column is not meaningful — use `<->`".into(),
4061            ))
4062        }
4063        Value::Interval { .. } => Err(EngineError::Unsupported(
4064            "ORDER BY of an INTERVAL is not supported in v2.11 \
4065             (months vs micros has no single canonical ordering)"
4066                .into(),
4067        )),
4068        Value::Json(_) => Err(EngineError::Unsupported(
4069            "ORDER BY of a JSON value is not supported — cast the document to text first".into(),
4070        )),
4071        // v7.5.0 — Value is #[non_exhaustive]; future variants need
4072        // an explicit ORDER BY mapping. Surface as Unsupported until
4073        // engine support is added.
4074        _ => Err(EngineError::Unsupported(
4075            "ORDER BY of this value type is not supported".into(),
4076        )),
4077    }
4078}
4079
4080/// Try to plan a WHERE clause as an equality lookup against an existing
4081/// index. Returns the candidate row indices on success; `None` means the
4082/// caller should fall back to a full scan.
4083///
4084/// v0.8 recognises a single top-level `col = literal` (in either operand
4085/// order). AND chains and range scans land in later milestones.
4086/// Look for `ORDER BY col <dist-op> literal LIMIT k` against an
4087/// NSW-indexed vector column. Recognised distance ops: `<->` (L2),
4088/// `<#>` (inner product), `<=>` (cosine). When a WHERE clause is
4089/// present, the planner does an "over-fetch and filter" pass — it
4090/// asks the graph for `k * over_fetch` candidates, evaluates WHERE
4091/// against each, and trims back to `k`. Returns the row indices in
4092/// ascending-distance order when the plan applies.
4093fn try_nsw_knn(
4094    stmt: &SelectStatement,
4095    table: &Table,
4096    schema_cols: &[ColumnSchema],
4097    table_alias: &str,
4098) -> Option<Vec<usize>> {
4099    if stmt.distinct {
4100        return None;
4101    }
4102    let limit = usize::try_from(stmt.limit?).ok()?;
4103    if limit == 0 {
4104        return None;
4105    }
4106    // v6.4.0 — NSW kNN dispatch needs a single ORDER BY key on the
4107    // distance metric. Multi-key ORDER BY falls through to the
4108    // generic sort path.
4109    if stmt.order_by.len() != 1 {
4110        return None;
4111    }
4112    let order = &stmt.order_by[0];
4113    // NSW kNN returns rows ascending by distance — DESC inverts the
4114    // natural order, so the planner can't handle it without a sort
4115    // pass. Fall back to the generic ORDER BY path.
4116    if order.desc {
4117        return None;
4118    }
4119    let Expr::Binary { lhs, op, rhs } = &order.expr else {
4120        return None;
4121    };
4122    let metric = match op {
4123        BinOp::L2Distance => spg_storage::NswMetric::L2,
4124        BinOp::InnerProduct => spg_storage::NswMetric::InnerProduct,
4125        BinOp::CosineDistance => spg_storage::NswMetric::Cosine,
4126        _ => return None,
4127    };
4128    // Accept both `col <op> literal` and `literal <op> col`.
4129    let ((Expr::Column(col), literal) | (literal, Expr::Column(col))) =
4130        (lhs.as_ref(), rhs.as_ref())
4131    else {
4132        return None;
4133    };
4134    if let Some(q) = &col.qualifier
4135        && q != table_alias
4136    {
4137        return None;
4138    }
4139    let col_pos = schema_cols.iter().position(|s| s.name == col.name)?;
4140    let query = literal_to_vector(literal)?;
4141    let idx = spg_storage::nsw_index_on(table, col_pos)?;
4142    if let Some(where_expr) = &stmt.where_ {
4143        // Over-fetch and filter. The factor (10×) is a heuristic that
4144        // covers typical selectivity for the corpus tests; v2.x will
4145        // make it configurable.
4146        let over_fetch = limit.saturating_mul(10).max(NSW_OVER_FETCH_FLOOR);
4147        let candidates = spg_storage::nsw_query(table, &idx.name, &query, over_fetch, metric);
4148        let ctx = EvalContext::new(schema_cols, Some(table_alias));
4149        let mut kept: Vec<usize> = Vec::with_capacity(limit);
4150        for i in candidates {
4151            let row = &table.rows()[i];
4152            let cond = eval::eval_expr(where_expr, row, &ctx).ok()?;
4153            if matches!(cond, Value::Bool(true)) {
4154                kept.push(i);
4155                if kept.len() >= limit {
4156                    break;
4157                }
4158            }
4159        }
4160        Some(kept)
4161    } else {
4162        Some(spg_storage::nsw_query(
4163            table, &idx.name, &query, limit, metric,
4164        ))
4165    }
4166}
4167
4168/// Lower bound on the over-fetch pool when WHERE is present — even
4169/// for tiny `LIMIT 1` queries we keep enough candidates to absorb a
4170/// few WHERE rejections.
4171const NSW_OVER_FETCH_FLOOR: usize = 32;
4172
4173/// Pull a `Vec<f32>` out of a literal-or-cast expression. Returns
4174/// `None` for anything we can't fold at plan time.
4175fn literal_to_vector(e: &Expr) -> Option<Vec<f32>> {
4176    match e {
4177        Expr::Literal(Literal::Vector(v)) => Some(v.clone()),
4178        Expr::Cast { expr, .. } => literal_to_vector(expr),
4179        _ => None,
4180    }
4181}
4182
4183/// Materialise rows in a planner-supplied order (used by the NSW path)
4184/// without re-running ORDER BY. The projection + LIMIT slot mirror the
4185/// equivalent block in `exec_bare_select`.
4186fn materialise_in_order(
4187    stmt: &SelectStatement,
4188    table: &Table,
4189    schema_cols: &[ColumnSchema],
4190    table_alias: &str,
4191    ordered_rows: &[usize],
4192) -> Result<QueryResult, EngineError> {
4193    let ctx = EvalContext::new(schema_cols, Some(table_alias));
4194    let projection = build_projection(&stmt.items, schema_cols, table_alias)?;
4195    let mut output_rows: Vec<Row> = Vec::with_capacity(ordered_rows.len());
4196    for &i in ordered_rows {
4197        let row = &table.rows()[i];
4198        let mut values = Vec::with_capacity(projection.len());
4199        for p in &projection {
4200            values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4201        }
4202        output_rows.push(Row::new(values));
4203    }
4204    apply_offset_and_limit(&mut output_rows, stmt.offset, stmt.limit);
4205    let columns: Vec<ColumnSchema> = projection
4206        .into_iter()
4207        .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4208        .collect();
4209    Ok(QueryResult::Rows {
4210        columns,
4211        rows: output_rows,
4212    })
4213}
4214
4215fn try_index_seek<'a>(
4216    where_expr: &Expr,
4217    schema_cols: &[ColumnSchema],
4218    catalog: &'a Catalog,
4219    table: &'a Table,
4220    table_alias: &str,
4221) -> Option<Vec<Cow<'a, Row>>> {
4222    let Expr::Binary {
4223        lhs,
4224        op: BinOp::Eq,
4225        rhs,
4226    } = where_expr
4227    else {
4228        return None;
4229    };
4230    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4231        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4232    let idx = table.index_on(col_pos)?;
4233    let key = IndexKey::from_value(&value)?;
4234    let locators = idx.lookup_eq(&key);
4235    let table_name = table.schema().name.as_str();
4236    // v5.1: each locator dispatches to either the hot tier (zero-
4237    // copy borrow of `table.rows()[i]`) or a cold-tier segment
4238    // (one page read + dense row decode, ~µs scale). Cold rows are
4239    // returned as `Cow::Owned` so the caller's `&Row` iteration
4240    // doesn't see a tier distinction; pre-freezer (no cold
4241    // segments loaded) every locator is `Hot` and every entry is
4242    // `Cow::Borrowed` — identical cost to the pre-v5.1 path.
4243    let mut out: Vec<Cow<'a, Row>> = Vec::with_capacity(locators.len());
4244    for loc in locators {
4245        match *loc {
4246            spg_storage::RowLocator::Hot(i) => {
4247                if let Some(row) = table.rows().get(i) {
4248                    out.push(Cow::Borrowed(row));
4249                }
4250            }
4251            spg_storage::RowLocator::Cold { segment_id, .. } => {
4252                if let Some(row) = catalog.resolve_cold_locator(table_name, segment_id, &key) {
4253                    out.push(Cow::Owned(row));
4254                }
4255            }
4256        }
4257    }
4258    Some(out)
4259}
4260
4261/// v5.2.3: extract `(column_position, IndexKey)` when `where_expr`
4262/// is a simple `col = literal` predicate suitable for a `BTree` index
4263/// seek. Used by `exec_update_cancel` / `exec_delete_cancel` to
4264/// decide whether a write touches a cold-tier row (which requires
4265/// promote-on-write / shadow-on-delete) before falling through to
4266/// the hot-tier row walk.
4267///
4268/// Returns `None` for any predicate shape the planner can't push
4269/// down to an index seek — complex WHERE clauses always take the
4270/// hot-only path (cold rows are immutable to non-indexed writes
4271/// until a future scan-fanout sub-version).
4272fn try_pk_predicate(
4273    where_expr: &Expr,
4274    schema_cols: &[ColumnSchema],
4275    table_alias: &str,
4276) -> Option<(usize, IndexKey)> {
4277    let Expr::Binary {
4278        lhs,
4279        op: BinOp::Eq,
4280        rhs,
4281    } = where_expr
4282    else {
4283        return None;
4284    };
4285    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4286        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4287    let key = IndexKey::from_value(&value)?;
4288    Some((col_pos, key))
4289}
4290
4291fn resolve_col_literal_pair(
4292    col_side: &Expr,
4293    lit_side: &Expr,
4294    schema_cols: &[ColumnSchema],
4295    table_alias: &str,
4296) -> Option<(usize, Value)> {
4297    let Expr::Column(c) = col_side else {
4298        return None;
4299    };
4300    if let Some(q) = &c.qualifier
4301        && q != table_alias
4302    {
4303        return None;
4304    }
4305    let pos = schema_cols.iter().position(|s| s.name == c.name)?;
4306    let Expr::Literal(l) = lit_side else {
4307        return None;
4308    };
4309    let v = match l {
4310        Literal::Integer(n) => {
4311            if let Ok(small) = i32::try_from(*n) {
4312                Value::Int(small)
4313            } else {
4314                Value::BigInt(*n)
4315            }
4316        }
4317        Literal::Float(x) => Value::Float(*x),
4318        Literal::String(s) => Value::Text(s.clone()),
4319        Literal::Bool(b) => Value::Bool(*b),
4320        Literal::Null => Value::Null,
4321        // Vector and Interval literals can't be used as B-tree index keys.
4322        // Tell the planner to fall back to full-scan.
4323        Literal::Vector(_) | Literal::Interval { .. } => return None,
4324    };
4325    Some((pos, v))
4326}
4327
4328/// Find the schema entry that a SELECT-list `Expr::Column` refers to.
4329/// Mirrors `resolve_column` in `eval.rs`, but returns a proper
4330/// `EngineError` so the projection-build path keeps `UnknownQualifier`
4331/// vs `ColumnNotFound` distinct.
4332fn resolve_projection_column<'a>(
4333    c: &ColumnName,
4334    schema_cols: &'a [ColumnSchema],
4335    table_alias: &str,
4336) -> Result<&'a ColumnSchema, EngineError> {
4337    if let Some(q) = &c.qualifier {
4338        let composite = alloc::format!("{q}.{name}", name = c.name);
4339        if let Some(s) = schema_cols.iter().find(|s| s.name == composite) {
4340            return Ok(s);
4341        }
4342        // Single-table case: the qualifier may equal the active alias —
4343        // then look for the bare column name.
4344        if q == table_alias
4345            && let Some(s) = schema_cols.iter().find(|s| s.name == c.name)
4346        {
4347            return Ok(s);
4348        }
4349        // For multi-table schemas the qualifier is unknown only if no
4350        // column bears the "<q>." prefix. For single-table, the alias
4351        // mismatch alone is enough.
4352        let prefix = alloc::format!("{q}.");
4353        let qualifier_known =
4354            q == table_alias || schema_cols.iter().any(|s| s.name.starts_with(&prefix));
4355        if !qualifier_known {
4356            return Err(EngineError::Eval(EvalError::UnknownQualifier {
4357                qualifier: q.clone(),
4358            }));
4359        }
4360        return Err(EngineError::Eval(EvalError::ColumnNotFound {
4361            name: c.name.clone(),
4362        }));
4363    }
4364    if let Some(s) = schema_cols.iter().find(|s| s.name == c.name) {
4365        return Ok(s);
4366    }
4367    let suffix = alloc::format!(".{name}", name = c.name);
4368    let mut matches = schema_cols.iter().filter(|s| s.name.ends_with(&suffix));
4369    let first = matches.next();
4370    let extra = matches.next();
4371    match (first, extra) {
4372        (Some(s), None) => Ok(s),
4373        (Some(_), Some(_)) => Err(EngineError::Eval(EvalError::TypeMismatch {
4374            detail: alloc::format!("ambiguous column reference: {}", c.name),
4375        })),
4376        _ => Err(EngineError::Eval(EvalError::ColumnNotFound {
4377            name: c.name.clone(),
4378        })),
4379    }
4380}
4381
4382fn build_projection(
4383    items: &[SelectItem],
4384    schema_cols: &[ColumnSchema],
4385    table_alias: &str,
4386) -> Result<Vec<ProjectedItem>, EngineError> {
4387    let mut out = Vec::new();
4388    for item in items {
4389        match item {
4390            SelectItem::Wildcard => {
4391                for col in schema_cols {
4392                    out.push(ProjectedItem {
4393                        expr: Expr::Column(ColumnName {
4394                            qualifier: None,
4395                            name: col.name.clone(),
4396                        }),
4397                        output_name: col.name.clone(),
4398                        ty: col.ty,
4399                        nullable: col.nullable,
4400                    });
4401                }
4402            }
4403            SelectItem::Expr { expr, alias } => {
4404                // Plain column ref keeps full schema info (real type +
4405                // nullability). Compound expressions evaluate fine but have
4406                // no static type — surface them as nullable TEXT, which is
4407                // what most clients render anyway.
4408                if let Expr::Column(c) = expr {
4409                    let sch = resolve_projection_column(c, schema_cols, table_alias)?;
4410                    let output_name = alias.clone().unwrap_or_else(|| c.name.clone());
4411                    out.push(ProjectedItem {
4412                        expr: expr.clone(),
4413                        output_name,
4414                        ty: sch.ty,
4415                        nullable: sch.nullable,
4416                    });
4417                } else {
4418                    let output_name = alias.clone().unwrap_or_else(|| expr.to_string());
4419                    out.push(ProjectedItem {
4420                        expr: expr.clone(),
4421                        output_name,
4422                        ty: DataType::Text,
4423                        nullable: true,
4424                    });
4425                }
4426            }
4427        }
4428    }
4429    Ok(out)
4430}
4431
4432/// Promote an integer to a NUMERIC value at the requested scale.
4433/// Rejects values that, after scaling, would overflow the column's
4434/// precision budget.
4435fn numeric_from_integer(
4436    n: i128,
4437    precision: u8,
4438    scale: u8,
4439    col_name: &str,
4440) -> Result<Value, EngineError> {
4441    let factor = pow10_i128(scale);
4442    let scaled = n.checked_mul(factor).ok_or_else(|| {
4443        EngineError::Unsupported(alloc::format!(
4444            "integer overflow scaling value for column `{col_name}` to scale {scale}"
4445        ))
4446    })?;
4447    check_precision(scaled, precision, col_name)?;
4448    Ok(Value::Numeric { scaled, scale })
4449}
4450
4451/// Float → NUMERIC. Uses round-half-away-from-zero on `x * 10^scale`,
4452/// then verifies the result fits the column's precision.
4453#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
4454fn numeric_from_float(
4455    x: f64,
4456    precision: u8,
4457    scale: u8,
4458    col_name: &str,
4459) -> Result<Value, EngineError> {
4460    if !x.is_finite() {
4461        return Err(EngineError::Unsupported(alloc::format!(
4462            "cannot store non-finite float in NUMERIC column `{col_name}`"
4463        )));
4464    }
4465    let mut factor = 1.0_f64;
4466    for _ in 0..scale {
4467        factor *= 10.0;
4468    }
4469    // Round half-away-from-zero by biasing then casting (`as i128`
4470    // truncates toward zero, so the bias + truncation gives the
4471    // desired rounding). `f64::floor` / `ceil` live in std; we don't
4472    // need them — the cast handles the truncation step.
4473    let shifted = x * factor;
4474    let biased = if shifted >= 0.0 {
4475        shifted + 0.5
4476    } else {
4477        shifted - 0.5
4478    };
4479    // Range-check before casting back to i128 — the cast itself is
4480    // saturating in Rust, which would silently truncate huge inputs.
4481    if !(-1e38..=1e38).contains(&biased) {
4482        return Err(EngineError::Unsupported(alloc::format!(
4483            "value {x} overflows NUMERIC range for column `{col_name}`"
4484        )));
4485    }
4486    let scaled = biased as i128;
4487    check_precision(scaled, precision, col_name)?;
4488    Ok(Value::Numeric { scaled, scale })
4489}
4490
4491/// Move a Numeric value from `src_scale` to `dst_scale`. Going up
4492/// multiplies by 10; going down rounds half-away-from-zero.
4493fn numeric_rescale(
4494    scaled: i128,
4495    src_scale: u8,
4496    precision: u8,
4497    dst_scale: u8,
4498    col_name: &str,
4499) -> Result<Value, EngineError> {
4500    let new_scaled = if dst_scale >= src_scale {
4501        let bump = pow10_i128(dst_scale - src_scale);
4502        scaled.checked_mul(bump).ok_or_else(|| {
4503            EngineError::Unsupported(alloc::format!(
4504                "overflow rescaling NUMERIC for column `{col_name}`"
4505            ))
4506        })?
4507    } else {
4508        let drop = pow10_i128(src_scale - dst_scale);
4509        let half = drop / 2;
4510        if scaled >= 0 {
4511            (scaled + half) / drop
4512        } else {
4513            (scaled - half) / drop
4514        }
4515    };
4516    check_precision(new_scaled, precision, col_name)?;
4517    Ok(Value::Numeric {
4518        scaled: new_scaled,
4519        scale: dst_scale,
4520    })
4521}
4522
4523/// Drop the fractional part of a scaled integer, returning the integer
4524/// portion (toward zero). Used for NUMERIC → INT casts.
4525const fn numeric_truncate_to_integer(scaled: i128, scale: u8) -> i128 {
4526    if scale == 0 {
4527        return scaled;
4528    }
4529    let factor = pow10_i128_const(scale);
4530    scaled / factor
4531}
4532
4533/// Verify a scaled NUMERIC value fits the column's declared precision.
4534/// `precision == 0` is the "unconstrained" form (bare `NUMERIC`); we
4535/// skip the check there.
4536fn check_precision(scaled: i128, precision: u8, col_name: &str) -> Result<(), EngineError> {
4537    if precision == 0 {
4538        return Ok(());
4539    }
4540    let limit = pow10_i128(precision);
4541    if scaled.unsigned_abs() >= limit.unsigned_abs() {
4542        return Err(EngineError::Unsupported(alloc::format!(
4543            "NUMERIC value exceeds precision {precision} for column `{col_name}`"
4544        )));
4545    }
4546    Ok(())
4547}
4548
/// `10^p` as an `i128`, usable in `const` contexts.
///
/// `10^38` is the largest power of ten an `i128` can hold. For
/// `p > 38` the result saturates at `i128::MAX` instead of
/// overflowing (which would panic in debug builds and silently wrap
/// to a meaningless value in release builds). Callers that multiply
/// by the result with `checked_mul` therefore still detect the
/// overflow for any operand of magnitude 2 or more.
const fn pow10_i128_const(p: u8) -> i128 {
    let mut acc: i128 = 1;
    let mut i = 0;
    while i < p {
        acc = acc.saturating_mul(10);
        i += 1;
    }
    acc
}

/// Non-`const` entry point for [`pow10_i128_const`]; same result,
/// including saturation at `i128::MAX` for `p > 38`.
fn pow10_i128(p: u8) -> i128 {
    pow10_i128_const(p)
}
4562
// NOTE(review): the two paragraphs below describe statement-rewriting
// passes (clock substitution and subquery materialisation), not this
// `impl` block itself. They look like doc comments that stayed behind
// when their functions moved — confirm where they belong.
//
/// Walk a parsed `Statement`, swapping any `NOW()` /
/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()` function calls for a
/// literal cast that wraps the engine's per-statement clock reading.
/// When `now_micros` is `None`, calls stay as-is and surface as
/// `unknown function` at eval time — this keeps the error path explicit.
///
/// v4.10: pre-walk the WHERE / projection / etc. of a SELECT and
/// replace every subquery node with a materialised literal. SPG
/// only supports uncorrelated subqueries — the inner SELECT does
/// not see outer-row columns, so the result is the same for every
/// outer row and can be evaluated once.
///
/// Returns the rewritten statement; the caller passes this to the
/// regular row-loop executor, which no longer sees Subquery nodes
/// in its tree.
4577impl Engine {
4578    /// v4.12 window executor. Implements `ROW_NUMBER` / `RANK` /
4579    /// `DENSE_RANK` and the partition-aware aggregates `SUM` /
4580    /// `AVG` / `COUNT` / `MIN` / `MAX`. The plan is:
4581    /// 1. Apply the WHERE filter.
4582    /// 2. For each unique `WindowFunction` node in the projection,
4583    ///    partition + sort, compute the per-row value.
4584    /// 3. Append the window values as synthetic columns (`__win_N`)
4585    ///    to the row schema.
4586    /// 4. Rewrite the projection to read those columns.
4587    /// 5. Hand off to the regular project / ORDER BY / LIMIT pipe.
4588    #[allow(
4589        clippy::too_many_lines,
4590        clippy::type_complexity,
4591        clippy::needless_range_loop
4592    )] // window-eval is one cohesive pipe; splitting fragments
4593    fn exec_select_with_window(
4594        &self,
4595        stmt: &SelectStatement,
4596        cancel: CancelToken<'_>,
4597    ) -> Result<QueryResult, EngineError> {
4598        let from = stmt.from.as_ref().ok_or_else(|| {
4599            EngineError::Unsupported("window functions require a FROM clause".into())
4600        })?;
4601        // For v4.12 we only support a single-table FROM. Joins +
4602        // windows is queued for v5.x.
4603        if !from.joins.is_empty() {
4604            return Err(EngineError::Unsupported(
4605                "JOIN with window functions not yet supported".into(),
4606            ));
4607        }
4608        let primary = &from.primary;
4609        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
4610            StorageError::TableNotFound {
4611                name: primary.name.clone(),
4612            }
4613        })?;
4614        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
4615        let schema_cols = &table.schema().columns;
4616        let ctx = EvalContext::new(schema_cols, Some(alias));
4617
4618        // 1) Filter pass.
4619        let mut filtered: Vec<&Row> = Vec::new();
4620        for (i, row) in table.rows().iter().enumerate() {
4621            if i.is_multiple_of(256) {
4622                cancel.check()?;
4623            }
4624            if let Some(w) = &stmt.where_ {
4625                let cond = eval::eval_expr(w, row, &ctx)?;
4626                if !matches!(cond, Value::Bool(true)) {
4627                    continue;
4628                }
4629            }
4630            filtered.push(row);
4631        }
4632        let n_rows = filtered.len();
4633
4634        // 2) Collect unique window function nodes from projection.
4635        let mut window_nodes: Vec<Expr> = Vec::new();
4636        for item in &stmt.items {
4637            if let SelectItem::Expr { expr, .. } = item {
4638                collect_window_nodes(expr, &mut window_nodes);
4639            }
4640        }
4641
4642        // 3) For each window, compute per-row value.
4643        // Index: same order as window_nodes; for row i, win_vals[w][i].
4644        let mut win_vals: Vec<Vec<Value>> = Vec::with_capacity(window_nodes.len());
4645        for wnode in &window_nodes {
4646            let Expr::WindowFunction {
4647                name,
4648                args,
4649                partition_by,
4650                order_by,
4651                frame,
4652                null_treatment,
4653            } = wnode
4654            else {
4655                unreachable!("collect_window_nodes pushes only WindowFunction");
4656            };
4657            // Compute (partition_key, order_key, original_index) for each row.
4658            let mut indexed: Vec<(Vec<Value>, Vec<(Value, bool)>, usize)> =
4659                Vec::with_capacity(n_rows);
4660            for (i, row) in filtered.iter().enumerate() {
4661                let pkey: Vec<Value> = partition_by
4662                    .iter()
4663                    .map(|p| eval::eval_expr(p, row, &ctx))
4664                    .collect::<Result<_, _>>()?;
4665                let okey: Vec<(Value, bool)> = order_by
4666                    .iter()
4667                    .map(|(e, desc)| eval::eval_expr(e, row, &ctx).map(|v| (v, *desc)))
4668                    .collect::<Result<_, _>>()?;
4669                indexed.push((pkey, okey, i));
4670            }
4671            // Sort by (partition_key, order_key). Partition key uses
4672            // a stable encoded form; order key respects ASC/DESC.
4673            indexed.sort_by(|a, b| {
4674                let p_cmp = partition_key_cmp(&a.0, &b.0);
4675                if p_cmp != core::cmp::Ordering::Equal {
4676                    return p_cmp;
4677                }
4678                order_key_cmp(&a.1, &b.1)
4679            });
4680            // Per-partition compute.
4681            let mut out_vals: Vec<Value> = alloc::vec![Value::Null; n_rows];
4682            let mut p_start = 0;
4683            while p_start < indexed.len() {
4684                let mut p_end = p_start + 1;
4685                while p_end < indexed.len()
4686                    && partition_key_cmp(&indexed[p_start].0, &indexed[p_end].0)
4687                        == core::cmp::Ordering::Equal
4688                {
4689                    p_end += 1;
4690                }
4691                // Compute the function within this partition slice.
4692                compute_window_partition(
4693                    name,
4694                    args,
4695                    !order_by.is_empty(),
4696                    frame.as_ref(),
4697                    *null_treatment,
4698                    &indexed[p_start..p_end],
4699                    &filtered,
4700                    &ctx,
4701                    &mut out_vals,
4702                )?;
4703                p_start = p_end;
4704            }
4705            win_vals.push(out_vals);
4706        }
4707
4708        // 4) Build extended schema: original columns + synthetic.
4709        let mut ext_cols = schema_cols.clone();
4710        for i in 0..window_nodes.len() {
4711            ext_cols.push(ColumnSchema::new(
4712                alloc::format!("__win_{i}"),
4713                DataType::Text, // type doesn't matter for projection eval
4714                true,
4715            ));
4716        }
4717        // 5) Build extended rows: each row gets its window values appended.
4718        let mut ext_rows: Vec<Row> = Vec::with_capacity(n_rows);
4719        for i in 0..n_rows {
4720            let mut values = filtered[i].values.clone();
4721            for w in 0..window_nodes.len() {
4722                values.push(win_vals[w][i].clone());
4723            }
4724            ext_rows.push(Row::new(values));
4725        }
4726        // 6) Rewrite the projection: WindowFunction nodes → Column(__win_N).
4727        let mut rewritten_items: Vec<SelectItem> = Vec::with_capacity(stmt.items.len());
4728        for item in &stmt.items {
4729            let new_item = match item {
4730                SelectItem::Wildcard => SelectItem::Wildcard,
4731                SelectItem::Expr { expr, alias } => {
4732                    let mut e = expr.clone();
4733                    rewrite_window_to_columns(&mut e, &window_nodes);
4734                    SelectItem::Expr {
4735                        expr: e,
4736                        alias: alias.clone(),
4737                    }
4738                }
4739            };
4740            rewritten_items.push(new_item);
4741        }
4742
4743        // 7) Project into final rows.
4744        let ext_ctx = EvalContext::new(&ext_cols, Some(alias));
4745        let projection = build_projection(&rewritten_items, &ext_cols, alias)?;
4746        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(n_rows);
4747        for (i, row) in ext_rows.iter().enumerate() {
4748            if i.is_multiple_of(256) {
4749                cancel.check()?;
4750            }
4751            let mut values = Vec::with_capacity(projection.len());
4752            for p in &projection {
4753                values.push(eval::eval_expr(&p.expr, row, &ext_ctx)?);
4754            }
4755            let order_keys = if stmt.order_by.is_empty() {
4756                Vec::new()
4757            } else {
4758                let mut keys = Vec::with_capacity(stmt.order_by.len());
4759                for o in &stmt.order_by {
4760                    let mut e = o.expr.clone();
4761                    rewrite_window_to_columns(&mut e, &window_nodes);
4762                    let key = eval::eval_expr(&e, row, &ext_ctx)?;
4763                    keys.push(value_to_order_key(&key)?);
4764                }
4765                keys
4766            };
4767            tagged.push((order_keys, Row::new(values)));
4768        }
4769        // ORDER BY + LIMIT/OFFSET on the projected rows.
4770        if !stmt.order_by.is_empty() {
4771            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4772            sort_by_keys(&mut tagged, &descs);
4773        }
4774        let mut out_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4775        apply_offset_and_limit(&mut out_rows, stmt.offset, stmt.limit);
4776        let final_cols: Vec<ColumnSchema> = projection
4777            .into_iter()
4778            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4779            .collect();
4780        Ok(QueryResult::Rows {
4781            columns: final_cols,
4782            rows: out_rows,
4783        })
4784    }
4785
4786    /// v4.11: materialise each CTE into a temp table inside a
4787    /// cloned catalog, then run the body SELECT against a fresh
4788    /// engine instance that owns the enriched catalog. The clone
4789    /// is moderately expensive — only paid by CTE-bearing queries.
4790    /// Subqueries inside CTE bodies / the main body resolve as
4791    /// usual; `clock_fn` is propagated so `NOW()` lines up.
4792    fn exec_with_ctes(
4793        &self,
4794        stmt: &SelectStatement,
4795        cancel: CancelToken<'_>,
4796    ) -> Result<QueryResult, EngineError> {
4797        cancel.check()?;
4798        let mut catalog = self.active_catalog().clone();
4799        for cte in &stmt.ctes {
4800            if catalog.get(&cte.name).is_some() {
4801                return Err(EngineError::Unsupported(alloc::format!(
4802                    "CTE name {:?} shadows an existing table; rename the CTE",
4803                    cte.name
4804                )));
4805            }
4806            let (columns, rows) = if cte.recursive {
4807                self.materialise_recursive_cte(cte, &catalog, cancel)?
4808            } else {
4809                let body_result = self.exec_select_cancel(&cte.body, cancel)?;
4810                let QueryResult::Rows { columns, rows } = body_result else {
4811                    return Err(EngineError::Unsupported(alloc::format!(
4812                        "CTE {:?} body did not return rows",
4813                        cte.name
4814                    )));
4815                };
4816                (columns, rows)
4817            };
4818            // v4.22: the projection builder labels any non-column
4819            // expression as Text — including literal SELECT 1.
4820            // Promote each column's type to whatever the rows
4821            // actually carry so the CTE storage table accepts them.
4822            let inferred = infer_column_types(&columns, &rows);
4823            let mut columns = inferred;
4824            // v4.22: apply optional `WITH name(a, b, c)` overrides.
4825            if !cte.column_overrides.is_empty() {
4826                if cte.column_overrides.len() != columns.len() {
4827                    return Err(EngineError::Unsupported(alloc::format!(
4828                        "CTE {:?} column list has {} names but body returns {} columns",
4829                        cte.name,
4830                        cte.column_overrides.len(),
4831                        columns.len()
4832                    )));
4833                }
4834                for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
4835                    col.name.clone_from(name);
4836                }
4837            }
4838            let schema = TableSchema::new(cte.name.clone(), columns);
4839            catalog.create_table(schema).map_err(EngineError::Storage)?;
4840            let table = catalog
4841                .get_mut(&cte.name)
4842                .expect("just-created CTE table must exist");
4843            for row in rows {
4844                table.insert(row).map_err(EngineError::Storage)?;
4845            }
4846        }
4847        // Strip CTEs from the body before running on the temp engine
4848        // so we don't recurse forever.
4849        let mut body = stmt.clone();
4850        body.ctes = Vec::new();
4851        let mut temp = Engine::restore(catalog);
4852        if let Some(c) = self.clock {
4853            temp = temp.with_clock(c);
4854        }
4855        if let Some(f) = self.salt_fn {
4856            temp = temp.with_salt_fn(f);
4857        }
4858        temp.exec_select_cancel(&body, cancel)
4859    }
4860
4861    /// v4.22: materialise a WITH RECURSIVE CTE. The body must be a
4862    /// UNION (or UNION ALL) of an anchor that does not reference
4863    /// the CTE name, and one or more recursive terms that do. The
4864    /// anchor runs first; each subsequent iteration runs the
4865    /// recursive term against a temp catalog where the CTE name is
4866    /// bound to the *previous* iteration's output. Iteration stops
4867    /// when the recursive term yields no rows; UNION (DISTINCT)
4868    /// deduplicates against the accumulated result, UNION ALL does
4869    /// not. A hard cap on total rows prevents runaway queries.
4870    #[allow(clippy::too_many_lines)]
4871    fn materialise_recursive_cte(
4872        &self,
4873        cte: &spg_sql::ast::Cte,
4874        base_catalog: &Catalog,
4875        cancel: CancelToken<'_>,
4876    ) -> Result<(Vec<ColumnSchema>, Vec<Row>), EngineError> {
4877        const MAX_TOTAL_ROWS: usize = 1_000_000;
4878        const MAX_ITERATIONS: usize = 100_000;
4879        cancel.check()?;
4880        if cte.body.unions.is_empty() {
4881            return Err(EngineError::Unsupported(alloc::format!(
4882                "WITH RECURSIVE {:?} body must be a UNION of an anchor and a recursive term",
4883                cte.name
4884            )));
4885        }
4886        // Anchor: the body's leading SELECT, with unions stripped.
4887        let mut anchor = cte.body.clone();
4888        let union_terms = core::mem::take(&mut anchor.unions);
4889        anchor.ctes = Vec::new();
4890        // Anchor must not reference the CTE name.
4891        if select_refers_to(&anchor, &cte.name) {
4892            return Err(EngineError::Unsupported(alloc::format!(
4893                "WITH RECURSIVE {:?}: the anchor must not reference the CTE itself",
4894                cte.name
4895            )));
4896        }
4897        let anchor_result = self.exec_select_cancel(&anchor, cancel)?;
4898        let QueryResult::Rows {
4899            columns: anchor_cols,
4900            rows: anchor_rows,
4901        } = anchor_result
4902        else {
4903            return Err(EngineError::Unsupported(alloc::format!(
4904                "WITH RECURSIVE {:?}: anchor did not return rows",
4905                cte.name
4906            )));
4907        };
4908        // The projection builder labels non-column expressions Text;
4909        // refine column types from the anchor's actual values so the
4910        // intermediate iter-catalog tables accept them.
4911        let mut columns = infer_column_types(&anchor_cols, &anchor_rows);
4912        if !cte.column_overrides.is_empty() {
4913            if cte.column_overrides.len() != columns.len() {
4914                return Err(EngineError::Unsupported(alloc::format!(
4915                    "CTE {:?} column list has {} names but anchor returns {} columns",
4916                    cte.name,
4917                    cte.column_overrides.len(),
4918                    columns.len()
4919                )));
4920            }
4921            for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
4922                col.name.clone_from(name);
4923            }
4924        }
4925        let mut all_rows: Vec<Row> = anchor_rows.clone();
4926        let mut working_set: Vec<Row> = anchor_rows;
4927        let mut seen: alloc::collections::BTreeSet<Vec<u8>> = alloc::collections::BTreeSet::new();
4928        // Track at least one "all UNION ALL" flag — if every union
4929        // kind is ALL we skip the dedup step (faster + matches PG).
4930        let all_union_all = union_terms.iter().all(|(k, _)| matches!(k, UnionKind::All));
4931        if !all_union_all {
4932            for r in &all_rows {
4933                seen.insert(encode_row_key(r));
4934            }
4935        }
4936        for iter in 0..MAX_ITERATIONS {
4937            cancel.check()?;
4938            if working_set.is_empty() {
4939                break;
4940            }
4941            // Build a fresh catalog: base + CTE bound to working_set.
4942            let mut iter_catalog = base_catalog.clone();
4943            let schema = TableSchema::new(cte.name.clone(), columns.clone());
4944            iter_catalog
4945                .create_table(schema)
4946                .map_err(EngineError::Storage)?;
4947            {
4948                let table = iter_catalog.get_mut(&cte.name).expect("just-created");
4949                for row in &working_set {
4950                    table.insert(row.clone()).map_err(EngineError::Storage)?;
4951                }
4952            }
4953            let mut iter_engine = Engine::restore(iter_catalog);
4954            if let Some(c) = self.clock {
4955                iter_engine = iter_engine.with_clock(c);
4956            }
4957            if let Some(f) = self.salt_fn {
4958                iter_engine = iter_engine.with_salt_fn(f);
4959            }
4960            // Run each recursive term in sequence and collect new rows.
4961            let mut next_set: Vec<Row> = Vec::new();
4962            for (_, term) in &union_terms {
4963                let mut term = term.clone();
4964                term.ctes = Vec::new();
4965                let r = iter_engine.exec_select_cancel(&term, cancel)?;
4966                let QueryResult::Rows {
4967                    columns: rc,
4968                    rows: rs,
4969                } = r
4970                else {
4971                    return Err(EngineError::Unsupported(alloc::format!(
4972                        "WITH RECURSIVE {:?}: recursive term did not return rows",
4973                        cte.name
4974                    )));
4975                };
4976                if rc.len() != columns.len() {
4977                    return Err(EngineError::Unsupported(alloc::format!(
4978                        "WITH RECURSIVE {:?}: column count of recursive term ({}) does not match anchor ({})",
4979                        cte.name,
4980                        rc.len(),
4981                        columns.len()
4982                    )));
4983                }
4984                for row in rs {
4985                    if !all_union_all {
4986                        let key = encode_row_key(&row);
4987                        if !seen.insert(key) {
4988                            continue;
4989                        }
4990                    }
4991                    next_set.push(row);
4992                }
4993            }
4994            if next_set.is_empty() {
4995                break;
4996            }
4997            all_rows.extend(next_set.iter().cloned());
4998            working_set = next_set;
4999            if all_rows.len() > MAX_TOTAL_ROWS {
5000                return Err(EngineError::Unsupported(alloc::format!(
5001                    "WITH RECURSIVE {:?}: produced more than {MAX_TOTAL_ROWS} rows — likely runaway recursion",
5002                    cte.name
5003                )));
5004            }
5005            if iter + 1 == MAX_ITERATIONS {
5006                return Err(EngineError::Unsupported(alloc::format!(
5007                    "WITH RECURSIVE {:?}: exceeded {MAX_ITERATIONS} iterations",
5008                    cte.name
5009                )));
5010            }
5011        }
5012        Ok((columns, all_rows))
5013    }
5014
5015    fn resolve_select_subqueries(
5016        &self,
5017        stmt: &mut SelectStatement,
5018        cancel: CancelToken<'_>,
5019    ) -> Result<(), EngineError> {
5020        for item in &mut stmt.items {
5021            if let SelectItem::Expr { expr, .. } = item {
5022                self.resolve_expr_subqueries(expr, cancel)?;
5023            }
5024        }
5025        if let Some(w) = &mut stmt.where_ {
5026            self.resolve_expr_subqueries(w, cancel)?;
5027        }
5028        if let Some(gs) = &mut stmt.group_by {
5029            for g in gs {
5030                self.resolve_expr_subqueries(g, cancel)?;
5031            }
5032        }
5033        if let Some(h) = &mut stmt.having {
5034            self.resolve_expr_subqueries(h, cancel)?;
5035        }
5036        for o in &mut stmt.order_by {
5037            self.resolve_expr_subqueries(&mut o.expr, cancel)?;
5038        }
5039        for (_, peer) in &mut stmt.unions {
5040            self.resolve_select_subqueries(peer, cancel)?;
5041        }
5042        Ok(())
5043    }
5044
5045    #[allow(clippy::only_used_in_recursion)] // engine handle reads aren't really pure
5046    fn resolve_expr_subqueries(
5047        &self,
5048        e: &mut Expr,
5049        cancel: CancelToken<'_>,
5050    ) -> Result<(), EngineError> {
5051        // Replace-on-this-node cases first.
5052        if let Some(replacement) = self.subquery_replacement(e, cancel)? {
5053            *e = replacement;
5054            return Ok(());
5055        }
5056        match e {
5057            Expr::Binary { lhs, rhs, .. } => {
5058                self.resolve_expr_subqueries(lhs, cancel)?;
5059                self.resolve_expr_subqueries(rhs, cancel)?;
5060            }
5061            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5062                self.resolve_expr_subqueries(expr, cancel)?;
5063            }
5064            Expr::FunctionCall { args, .. } => {
5065                for a in args {
5066                    self.resolve_expr_subqueries(a, cancel)?;
5067                }
5068            }
5069            Expr::Like { expr, pattern, .. } => {
5070                self.resolve_expr_subqueries(expr, cancel)?;
5071                self.resolve_expr_subqueries(pattern, cancel)?;
5072            }
5073            Expr::Extract { source, .. } => self.resolve_expr_subqueries(source, cancel)?,
5074            // v4.12 window functions — recurse into args + ORDER BY
5075            // + PARTITION BY in case they carry inner subqueries.
5076            Expr::WindowFunction {
5077                args,
5078                partition_by,
5079                order_by,
5080                ..
5081            } => {
5082                for a in args {
5083                    self.resolve_expr_subqueries(a, cancel)?;
5084                }
5085                for p in partition_by {
5086                    self.resolve_expr_subqueries(p, cancel)?;
5087                }
5088                for (e, _) in order_by {
5089                    self.resolve_expr_subqueries(e, cancel)?;
5090                }
5091            }
5092            // Subquery nodes are handled in subquery_replacement
5093            // (which returned None — defensive no-op); Literal /
5094            // Column are leaves.
5095            Expr::ScalarSubquery(_)
5096            | Expr::Exists { .. }
5097            | Expr::InSubquery { .. }
5098            | Expr::Literal(_)
5099            | Expr::Placeholder(_)
5100            | Expr::Column(_) => {}
5101        }
5102        Ok(())
5103    }
5104
5105    /// v4.23: per-row eval that handles correlated subqueries.
5106    /// Equivalent to `eval::eval_expr` when the expression has no
5107    /// subqueries; otherwise clones the expression, substitutes
5108    /// outer-row columns into each surviving subquery node, runs
5109    /// the inner SELECT, and replaces the node with the literal
5110    /// result. Only the WHERE-filter call sites use this path so
5111    /// the uncorrelated fast path is preserved everywhere else.
5112    fn eval_expr_with_correlated(
5113        &self,
5114        expr: &Expr,
5115        row: &Row,
5116        ctx: &EvalContext<'_>,
5117        cancel: CancelToken<'_>,
5118        memo: Option<&mut memoize::MemoizeCache>,
5119    ) -> Result<Value, EngineError> {
5120        if !expr_has_subquery(expr) {
5121            return eval::eval_expr(expr, row, ctx).map_err(EngineError::Eval);
5122        }
5123        let mut e = expr.clone();
5124        self.resolve_correlated_in_expr(&mut e, row, ctx, cancel, memo)?;
5125        eval::eval_expr(&e, row, ctx).map_err(EngineError::Eval)
5126    }
5127
5128    fn resolve_correlated_in_expr(
5129        &self,
5130        e: &mut Expr,
5131        row: &Row,
5132        ctx: &EvalContext<'_>,
5133        cancel: CancelToken<'_>,
5134        mut memo: Option<&mut memoize::MemoizeCache>,
5135    ) -> Result<(), EngineError> {
5136        match e {
5137            Expr::ScalarSubquery(inner) => {
5138                // v6.2.6 — Memoize: build the cache key from the
5139                // pre-substitution subquery repr + the outer row's
5140                // values. Two outer rows with identical correlated
5141                // values hit the same entry.
5142                let cache_key = memo.as_ref().map(|_| memoize::CacheKey {
5143                    subquery_repr: alloc::format!("{}", **inner),
5144                    outer_values: row.values.clone(),
5145                });
5146                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key.as_ref())
5147                    && let Some(cached) = cache.get(k)
5148                {
5149                    *e = value_to_literal_expr(cached)?;
5150                    return Ok(());
5151                }
5152                let mut s = (**inner).clone();
5153                substitute_outer_columns(&mut s, row, ctx);
5154                let r = self.exec_select_cancel(&s, cancel)?;
5155                let QueryResult::Rows { rows, .. } = r else {
5156                    return Err(EngineError::Unsupported(
5157                        "scalar subquery: inner did not return rows".into(),
5158                    ));
5159                };
5160                let value = match rows.as_slice() {
5161                    [] => Value::Null,
5162                    [r0] => r0.values.first().cloned().unwrap_or(Value::Null),
5163                    _ => {
5164                        return Err(EngineError::Unsupported(alloc::format!(
5165                            "scalar subquery returned {} rows; expected 0 or 1",
5166                            rows.len()
5167                        )));
5168                    }
5169                };
5170                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key) {
5171                    cache.insert(k, value.clone());
5172                }
5173                *e = value_to_literal_expr(value)?;
5174            }
5175            Expr::Exists { subquery, negated } => {
5176                let mut s = (**subquery).clone();
5177                substitute_outer_columns(&mut s, row, ctx);
5178                let r = self.exec_select_cancel(&s, cancel)?;
5179                let exists = matches!(r, QueryResult::Rows { rows, .. } if !rows.is_empty());
5180                let bit = if *negated { !exists } else { exists };
5181                *e = Expr::Literal(Literal::Bool(bit));
5182            }
5183            Expr::InSubquery {
5184                expr: lhs,
5185                subquery,
5186                negated,
5187            } => {
5188                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5189                let lhs_val = eval::eval_expr(lhs, row, ctx).map_err(EngineError::Eval)?;
5190                let mut s = (**subquery).clone();
5191                substitute_outer_columns(&mut s, row, ctx);
5192                let r = self.exec_select_cancel(&s, cancel)?;
5193                let QueryResult::Rows { columns, rows, .. } = r else {
5194                    return Err(EngineError::Unsupported(
5195                        "IN-subquery: inner did not return rows".into(),
5196                    ));
5197                };
5198                if columns.len() != 1 {
5199                    return Err(EngineError::Unsupported(alloc::format!(
5200                        "IN-subquery must project exactly one column; got {}",
5201                        columns.len()
5202                    )));
5203                }
5204                let mut found = false;
5205                let mut any_null = false;
5206                for r0 in rows {
5207                    let v = r0.values.into_iter().next().unwrap_or(Value::Null);
5208                    if v.is_null() {
5209                        any_null = true;
5210                        continue;
5211                    }
5212                    if value_cmp(&v, &lhs_val) == core::cmp::Ordering::Equal {
5213                        found = true;
5214                        break;
5215                    }
5216                }
5217                let bit = if found {
5218                    !*negated
5219                } else if any_null {
5220                    return Err(EngineError::Unsupported(
5221                        "IN-subquery with NULL in result and no match: NULL semantics not yet implemented".into(),
5222                    ));
5223                } else {
5224                    *negated
5225                };
5226                *e = Expr::Literal(Literal::Bool(bit));
5227            }
5228            Expr::Binary { lhs, rhs, .. } => {
5229                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5230                self.resolve_correlated_in_expr(rhs, row, ctx, cancel, memo.as_deref_mut())?;
5231            }
5232            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5233                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5234            }
5235            Expr::Like { expr, pattern, .. } => {
5236                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5237                self.resolve_correlated_in_expr(pattern, row, ctx, cancel, memo.as_deref_mut())?;
5238            }
5239            Expr::FunctionCall { args, .. } => {
5240                for a in args {
5241                    self.resolve_correlated_in_expr(a, row, ctx, cancel, memo.as_deref_mut())?;
5242                }
5243            }
5244            Expr::Extract { source, .. } => {
5245                self.resolve_correlated_in_expr(source, row, ctx, cancel, memo.as_deref_mut())?;
5246            }
5247            Expr::WindowFunction { .. } | Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
5248        }
5249        Ok(())
5250    }
5251
5252    fn subquery_replacement(
5253        &self,
5254        e: &Expr,
5255        cancel: CancelToken<'_>,
5256    ) -> Result<Option<Expr>, EngineError> {
5257        match e {
5258            Expr::ScalarSubquery(inner) => {
5259                let mut s = (**inner).clone();
5260                // Recurse into the inner SELECT first so nested
5261                // subqueries materialise bottom-up.
5262                self.resolve_select_subqueries(&mut s, cancel)?;
5263                let r = match self.exec_bare_select_cancel(&s, cancel) {
5264                    Ok(r) => r,
5265                    Err(e) if is_correlation_error(&e) => return Ok(None),
5266                    Err(e) => return Err(e),
5267                };
5268                let QueryResult::Rows { rows, .. } = r else {
5269                    return Err(EngineError::Unsupported(
5270                        "scalar subquery: inner statement did not return rows".into(),
5271                    ));
5272                };
5273                let value = match rows.as_slice() {
5274                    [] => Value::Null,
5275                    [row] => row.values.first().cloned().unwrap_or(Value::Null),
5276                    _ => {
5277                        return Err(EngineError::Unsupported(alloc::format!(
5278                            "scalar subquery returned {} rows; expected 0 or 1",
5279                            rows.len()
5280                        )));
5281                    }
5282                };
5283                Ok(Some(value_to_literal_expr(value)?))
5284            }
5285            Expr::Exists { subquery, negated } => {
5286                let mut s = (**subquery).clone();
5287                self.resolve_select_subqueries(&mut s, cancel)?;
5288                let r = match self.exec_bare_select_cancel(&s, cancel) {
5289                    Ok(r) => r,
5290                    Err(e) if is_correlation_error(&e) => return Ok(None),
5291                    Err(e) => return Err(e),
5292                };
5293                let exists = match r {
5294                    QueryResult::Rows { rows, .. } => !rows.is_empty(),
5295                    QueryResult::CommandOk { .. } => false,
5296                };
5297                let bit = if *negated { !exists } else { exists };
5298                Ok(Some(Expr::Literal(Literal::Bool(bit))))
5299            }
5300            Expr::InSubquery {
5301                expr,
5302                subquery,
5303                negated,
5304            } => {
5305                let mut s = (**subquery).clone();
5306                self.resolve_select_subqueries(&mut s, cancel)?;
5307                let r = match self.exec_bare_select_cancel(&s, cancel) {
5308                    Ok(r) => r,
5309                    Err(e) if is_correlation_error(&e) => return Ok(None),
5310                    Err(e) => return Err(e),
5311                };
5312                let QueryResult::Rows { columns, rows, .. } = r else {
5313                    return Err(EngineError::Unsupported(
5314                        "IN-subquery: inner statement did not return rows".into(),
5315                    ));
5316                };
5317                if columns.len() != 1 {
5318                    return Err(EngineError::Unsupported(alloc::format!(
5319                        "IN-subquery must project exactly one column; got {}",
5320                        columns.len()
5321                    )));
5322                }
5323                // Build the same OR-Eq chain the parse-time literal-list
5324                // path constructs, with each value lifted into a Literal.
5325                let mut acc: Option<Expr> = None;
5326                for row in rows {
5327                    let v = row.values.into_iter().next().unwrap_or(Value::Null);
5328                    let lit = value_to_literal_expr(v)?;
5329                    let cmp = Expr::Binary {
5330                        lhs: expr.clone(),
5331                        op: BinOp::Eq,
5332                        rhs: Box::new(lit),
5333                    };
5334                    acc = Some(match acc {
5335                        None => cmp,
5336                        Some(prev) => Expr::Binary {
5337                            lhs: Box::new(prev),
5338                            op: BinOp::Or,
5339                            rhs: Box::new(cmp),
5340                        },
5341                    });
5342                }
5343                let combined = acc.unwrap_or(Expr::Literal(Literal::Bool(false)));
5344                let final_expr = if *negated {
5345                    Expr::Unary {
5346                        op: UnOp::Not,
5347                        expr: Box::new(combined),
5348                    }
5349                } else {
5350                    combined
5351                };
5352                Ok(Some(final_expr))
5353            }
5354            _ => Ok(None),
5355        }
5356    }
5357}
5358
5359// ---- v4.12 window-function helpers ----
5360// The (partition-key, order-key, original-index) tuple shape used
5361// across these helpers is intrinsic to the planner. Factoring it
5362// into a typedef adds indirection without making the code clearer,
5363// so several lints are allowed inline on the affected functions
5364// rather than module-wide.
5365
5366/// v4.22: cheap structural scan for `FROM <name>` (qualified or
5367/// not) inside a SELECT — used to verify the anchor of a WITH
5368/// RECURSIVE CTE doesn't recurse into itself. Conservative: walks
5369/// FROM joins, subqueries, and unions.
5370fn select_refers_to(stmt: &SelectStatement, target: &str) -> bool {
5371    if let Some(from) = &stmt.from
5372        && from_refers_to(from, target)
5373    {
5374        return true;
5375    }
5376    for (_, peer) in &stmt.unions {
5377        if select_refers_to(peer, target) {
5378            return true;
5379        }
5380    }
5381    for item in &stmt.items {
5382        if let SelectItem::Expr { expr, .. } = item
5383            && expr_refers_to(expr, target)
5384        {
5385            return true;
5386        }
5387    }
5388    if let Some(w) = &stmt.where_
5389        && expr_refers_to(w, target)
5390    {
5391        return true;
5392    }
5393    false
5394}
5395
5396fn from_refers_to(from: &FromClause, target: &str) -> bool {
5397    if from.primary.name.eq_ignore_ascii_case(target) {
5398        return true;
5399    }
5400    from.joins
5401        .iter()
5402        .any(|j| j.table.name.eq_ignore_ascii_case(target))
5403}
5404
5405fn expr_refers_to(e: &Expr, target: &str) -> bool {
5406    match e {
5407        Expr::ScalarSubquery(s) => select_refers_to(s, target),
5408        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
5409            select_refers_to(subquery, target)
5410        }
5411        Expr::Binary { lhs, rhs, .. } => expr_refers_to(lhs, target) || expr_refers_to(rhs, target),
5412        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5413            expr_refers_to(expr, target)
5414        }
5415        Expr::Like { expr, pattern, .. } => {
5416            expr_refers_to(expr, target) || expr_refers_to(pattern, target)
5417        }
5418        Expr::FunctionCall { args, .. } => args.iter().any(|a| expr_refers_to(a, target)),
5419        Expr::Extract { source, .. } => expr_refers_to(source, target),
5420        Expr::WindowFunction {
5421            args,
5422            partition_by,
5423            order_by,
5424            ..
5425        } => {
5426            args.iter().any(|a| expr_refers_to(a, target))
5427                || partition_by.iter().any(|p| expr_refers_to(p, target))
5428                || order_by.iter().any(|(o, _)| expr_refers_to(o, target))
5429        }
5430        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
5431    }
5432}
5433
5434/// v4.22: pick more specific column types from observed rows when
5435/// the projection builder defaulted to Text (the v1.x behavior for
5436/// non-column expressions). Lets `WITH t(n) AS (SELECT 1 ...)`
5437/// land an Int column in the CTE storage table rather than failing
5438/// the insert with "expected TEXT, got INT".
5439fn infer_column_types(columns: &[ColumnSchema], rows: &[Row]) -> Vec<ColumnSchema> {
5440    let mut out = columns.to_vec();
5441    for (col_idx, col) in out.iter_mut().enumerate() {
5442        if col.ty != DataType::Text {
5443            continue;
5444        }
5445        let mut inferred: Option<DataType> = None;
5446        let mut all_null = true;
5447        for row in rows {
5448            let Some(v) = row.values.get(col_idx) else {
5449                continue;
5450            };
5451            let ty = match v {
5452                Value::Null => continue,
5453                Value::SmallInt(_) => DataType::SmallInt,
5454                Value::Int(_) => DataType::Int,
5455                Value::BigInt(_) => DataType::BigInt,
5456                Value::Float(_) => DataType::Float,
5457                Value::Bool(_) => DataType::Bool,
5458                Value::Vector(_) => DataType::Vector {
5459                    dim: 0,
5460                    encoding: VecEncoding::F32,
5461                },
5462                _ => DataType::Text,
5463            };
5464            all_null = false;
5465            inferred = Some(match inferred {
5466                None => ty,
5467                Some(prev) if prev == ty => prev,
5468                Some(_) => DataType::Text,
5469            });
5470        }
5471        if let Some(t) = inferred {
5472            col.ty = t;
5473            col.nullable = true;
5474        } else if all_null {
5475            col.nullable = true;
5476        }
5477    }
5478    out
5479}
5480
5481/// v4.26: render a human-readable plan tree for `EXPLAIN <select>`.
5482/// Lines are pushed into `out`; `depth` controls indentation. We
5483/// describe the rewritten SELECT — what the executor *would* do —
5484/// using the engine handle to spot indexed lookups and table shapes.
5485#[allow(clippy::too_many_lines, clippy::format_push_string)]
5486/// v6.2.4 — Walk every line of the rendered plan tree and append
5487/// per-operator stats. Lines that name a known operator get
5488/// `(rows=N)` (`actual_rows` of the top-level operator equals the
5489/// final result row count; scans report their catalog row count
5490/// as the rows-considered metric). Other lines — Filter / Join /
5491/// GroupBy / OrderBy etc. — are marked `(—)` so the surface is
5492/// complete-by-construction; v6.2.5 fills these in via inline
5493/// executor counters.
5494/// v6.8.3 — surface "CREATE INDEX …" suggestions for every
5495/// `(table, column)` pair the query touches via WHERE / JOIN
5496/// that doesn't already have an index on the owning table.
5497/// Walks the SELECT's FROM clauses + WHERE expression tree;
5498/// returns one line per missing index. Deterministic order:
5499/// FROM-clause iteration order, then column-reference walk
5500/// order inside each WHERE. Each suggestion is a copy-pastable
5501/// DDL string.
5502fn build_index_suggestions(stmt: &SelectStatement, engine: &Engine) -> Vec<String> {
5503    use alloc::collections::BTreeSet;
5504    let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
5505    let mut out: Vec<String> = Vec::new();
5506    let cat = engine.active_catalog();
5507    // Build a (table, qualifier-or-alias) list from the FROM clause
5508    // so unqualified column refs in WHERE resolve to the correct
5509    // table.
5510    let Some(from) = &stmt.from else {
5511        return out;
5512    };
5513    let mut tables: Vec<String> = Vec::new();
5514    tables.push(from.primary.name.clone());
5515    for j in &from.joins {
5516        tables.push(j.table.name.clone());
5517    }
5518    // Collect column refs from the WHERE expression. JOIN ON
5519    // predicates also feed in.
5520    let mut col_refs: Vec<spg_sql::ast::ColumnName> = Vec::new();
5521    if let Some(w) = &stmt.where_ {
5522        collect_column_refs(w, &mut col_refs);
5523    }
5524    for j in &from.joins {
5525        if let Some(on) = &j.on {
5526            collect_column_refs(on, &mut col_refs);
5527        }
5528    }
5529    for cn in &col_refs {
5530        // Resolve owner table: explicit qualifier first, else
5531        // first table in FROM that has a column of this name.
5532        let owner: Option<String> = if let Some(q) = &cn.qualifier {
5533            tables.iter().find(|t| t == &q).cloned()
5534        } else {
5535            tables.iter().find_map(|t| {
5536                cat.get(t).and_then(|tbl| {
5537                    if tbl.schema().column_position(&cn.name).is_some() {
5538                        Some(t.clone())
5539                    } else {
5540                        None
5541                    }
5542                })
5543            })
5544        };
5545        let Some(owner) = owner else {
5546            continue;
5547        };
5548        let Some(tbl) = cat.get(&owner) else {
5549            continue;
5550        };
5551        let Some(col_pos) = tbl.schema().column_position(&cn.name) else {
5552            continue;
5553        };
5554        // Skip if any BTree index already covers this column as
5555        // its key.
5556        let already_indexed = tbl.indices().iter().any(|i| {
5557            matches!(i.kind, spg_storage::IndexKind::BTree(_))
5558                && i.column_position == col_pos
5559                && i.expression.is_none()
5560                && i.partial_predicate.is_none()
5561        });
5562        if already_indexed {
5563            continue;
5564        }
5565        if seen.insert((owner.clone(), cn.name.clone())) {
5566            out.push(alloc::format!(
5567                "SUGGEST: CREATE INDEX ix_{}_{} ON {} ({})",
5568                owner,
5569                cn.name,
5570                owner,
5571                cn.name
5572            ));
5573        }
5574    }
5575    out
5576}
5577
5578/// Walks an `Expr` and pushes every `ColumnName` it references.
5579/// Order is depth-first, left-to-right.
5580fn collect_column_refs(expr: &Expr, out: &mut Vec<spg_sql::ast::ColumnName>) {
5581    match expr {
5582        Expr::Column(cn) => out.push(cn.clone()),
5583        Expr::FunctionCall { args, .. } => {
5584            for a in args {
5585                collect_column_refs(a, out);
5586            }
5587        }
5588        Expr::Binary { lhs, rhs, .. } => {
5589            collect_column_refs(lhs, out);
5590            collect_column_refs(rhs, out);
5591        }
5592        Expr::Unary { expr: e, .. } => collect_column_refs(e, out),
5593        _ => {}
5594    }
5595}
5596
5597fn annotate_explain_lines(lines: &mut [String], total_rows: usize, engine: &Engine) {
5598    let catalog = engine.active_catalog();
5599    let cold_ids = catalog.cold_segment_ids_global();
5600    let any_cold = !cold_ids.is_empty();
5601    let cold_ids_repr = if any_cold {
5602        let mut s = alloc::string::String::from("[");
5603        for (i, id) in cold_ids.iter().enumerate() {
5604            if i > 0 {
5605                s.push(',');
5606            }
5607            s.push_str(&alloc::format!("{id}"));
5608        }
5609        s.push(']');
5610        s
5611    } else {
5612        alloc::string::String::new()
5613    };
5614    for (idx, line) in lines.iter_mut().enumerate() {
5615        let trimmed = line.trim_start();
5616        let is_top_level = idx == 0;
5617        if is_top_level {
5618            line.push_str(&alloc::format!(" (rows={total_rows})"));
5619            continue;
5620        }
5621        if let Some(rest) = trimmed.strip_prefix("From: ") {
5622            let (name, scan_kind) = match rest.split_once(" [") {
5623                Some((n, k)) => (n.trim(), k.trim_end_matches(']')),
5624                None => (rest.trim(), ""),
5625            };
5626            let bare = name.split_whitespace().next().unwrap_or(name);
5627            let hot = catalog.get(bare).map(|t| t.rows().len());
5628            // v6.2.7 — `cold_segments=[id0,id1,…]` enumerates every
5629            // cold-tier segment the scan COULD have walked. v6.2.x
5630            // can tighten to per-table by walking the table's
5631            // BTree-index cold locators.
5632            let annot = match (hot, scan_kind) {
5633                (Some(h), "full scan") => {
5634                    let mut s = alloc::format!(" (hot_rows={h}");
5635                    if any_cold {
5636                        s.push_str(&alloc::format!(
5637                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5638                        ));
5639                    }
5640                    s.push(')');
5641                    s
5642                }
5643                (Some(h), "index seek") => {
5644                    let mut s = alloc::format!(" (hot_rows≤{h}");
5645                    if any_cold {
5646                        s.push_str(&alloc::format!(
5647                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5648                        ));
5649                    }
5650                    s.push(')');
5651                    s
5652                }
5653                _ => " (rows=—)".to_string(),
5654            };
5655            line.push_str(&annot);
5656            continue;
5657        }
5658        // Filter / GroupBy / Having / OrderBy / Limit / Join etc.
5659        line.push_str(" (rows=—)");
5660    }
5661}
5662
5663fn explain_select(stmt: &SelectStatement, engine: &Engine, depth: usize, out: &mut Vec<String>) {
5664    let pad = "  ".repeat(depth);
5665    // 1) Top-level operator label.
5666    let top = if !stmt.ctes.is_empty() {
5667        if stmt.ctes.iter().any(|c| c.recursive) {
5668            "CTEScan (WITH RECURSIVE)"
5669        } else {
5670            "CTEScan (WITH)"
5671        }
5672    } else if !stmt.unions.is_empty() {
5673        "UnionScan"
5674    } else if select_has_window(stmt) {
5675        "WindowAgg"
5676    } else if aggregate::uses_aggregate(stmt) {
5677        "Aggregate"
5678    } else if stmt.distinct {
5679        "Distinct"
5680    } else if stmt.from.is_some() {
5681        "TableScan"
5682    } else {
5683        "Result"
5684    };
5685    out.push(alloc::format!("{pad}{top}"));
5686    let child = "  ".repeat(depth + 1);
5687    // 2) CTE bodies.
5688    for cte in &stmt.ctes {
5689        let head = if cte.recursive {
5690            alloc::format!("{child}CTE (recursive): {}", cte.name)
5691        } else {
5692            alloc::format!("{child}CTE: {}", cte.name)
5693        };
5694        out.push(head);
5695        explain_select(&cte.body, engine, depth + 2, out);
5696    }
5697    // 3) FROM details — primary table + joins, index hits.
5698    if let Some(from) = &stmt.from {
5699        let mut tag = alloc::format!("{child}From: {}", from.primary.name);
5700        if let Some(alias) = &from.primary.alias {
5701            tag.push_str(&alloc::format!(" AS {alias}"));
5702        }
5703        // Try to detect an index-seek opportunity on WHERE against
5704        // the primary table — same heuristic the executor uses.
5705        if let Some(w) = &stmt.where_
5706            && let Some(table) = engine.active_catalog().get(&from.primary.name)
5707        {
5708            let alias = from.primary.alias.as_deref().unwrap_or(&from.primary.name);
5709            let cols = &table.schema().columns;
5710            if try_index_seek(w, cols, engine.active_catalog(), table, alias).is_some() {
5711                tag.push_str(" [index seek]");
5712            } else {
5713                tag.push_str(" [full scan]");
5714            }
5715        } else {
5716            tag.push_str(" [full scan]");
5717        }
5718        out.push(tag);
5719        for j in &from.joins {
5720            let kind = match j.kind {
5721                spg_sql::ast::JoinKind::Inner => "INNER JOIN",
5722                spg_sql::ast::JoinKind::Left => "LEFT JOIN",
5723                spg_sql::ast::JoinKind::Cross => "CROSS JOIN",
5724            };
5725            let mut s = alloc::format!("{child}{kind}: {}", j.table.name);
5726            if let Some(alias) = &j.table.alias {
5727                s.push_str(&alloc::format!(" AS {alias}"));
5728            }
5729            if j.on.is_some() {
5730                s.push_str(" (ON …)");
5731            }
5732            out.push(s);
5733        }
5734    }
5735    // 4) WHERE / GROUP BY / HAVING / ORDER BY / LIMIT / OFFSET.
5736    if let Some(w) = &stmt.where_ {
5737        let mut s = alloc::format!("{child}Filter: {w}");
5738        if expr_has_subquery(w) {
5739            s.push_str(" [subquery]");
5740        }
5741        out.push(s);
5742    }
5743    if let Some(gs) = &stmt.group_by {
5744        let mut parts = Vec::new();
5745        for g in gs {
5746            parts.push(alloc::format!("{g}"));
5747        }
5748        out.push(alloc::format!("{child}GroupBy: {}", parts.join(", ")));
5749    }
5750    if let Some(h) = &stmt.having {
5751        out.push(alloc::format!("{child}Having: {h}"));
5752    }
5753    for o in &stmt.order_by {
5754        let dir = if o.desc { "DESC" } else { "ASC" };
5755        out.push(alloc::format!("{child}OrderBy: {} {dir}", o.expr));
5756    }
5757    if let Some(lim) = stmt.limit {
5758        out.push(alloc::format!("{child}Limit: {lim}"));
5759    }
5760    if let Some(off) = stmt.offset {
5761        out.push(alloc::format!("{child}Offset: {off}"));
5762    }
5763    // 5) Projection — collapse Wildcard or render N items.
5764    if stmt
5765        .items
5766        .iter()
5767        .any(|it| matches!(it, SelectItem::Wildcard))
5768    {
5769        out.push(alloc::format!("{child}Project: *"));
5770    } else {
5771        out.push(alloc::format!(
5772            "{child}Project: {} item(s)",
5773            stmt.items.len()
5774        ));
5775    }
5776    // 6) Recurse into UNION peers.
5777    for (kind, peer) in &stmt.unions {
5778        let label = match kind {
5779            UnionKind::All => "UNION ALL",
5780            UnionKind::Distinct => "UNION",
5781        };
5782        out.push(alloc::format!("{child}{label}"));
5783        explain_select(peer, engine, depth + 2, out);
5784    }
5785}
5786
5787/// v4.23: recognise the engine errors that indicate the inner
5788/// SELECT couldn't be evaluated in isolation because it references
5789/// an outer column — used by `subquery_replacement` to skip
5790/// materialisation and let row-eval handle it instead.
5791fn is_correlation_error(e: &EngineError) -> bool {
5792    matches!(
5793        e,
5794        EngineError::Eval(
5795            eval::EvalError::ColumnNotFound { .. } | eval::EvalError::UnknownQualifier { .. }
5796        )
5797    )
5798}
5799
5800/// v4.23: walk every Expr in `stmt` and replace each Column ref
5801/// that targets the outer scope (qualifier matches the outer
5802/// table alias) with a Literal carrying the outer row's value.
5803/// Conservative: only qualified refs are substituted, so the user
5804/// must write `outer_alias.col` to reference an outer column. This
5805/// matches PG's lexical scoping for correlated subqueries and
5806/// avoids accidentally rebinding inner columns of the same name.
5807fn substitute_outer_columns(stmt: &mut SelectStatement, row: &Row, ctx: &EvalContext<'_>) {
5808    let Some(outer_alias) = ctx.table_alias else {
5809        return;
5810    };
5811    substitute_in_select(stmt, row, ctx, outer_alias);
5812}
5813
5814fn substitute_in_select(
5815    stmt: &mut SelectStatement,
5816    row: &Row,
5817    ctx: &EvalContext<'_>,
5818    outer_alias: &str,
5819) {
5820    for item in &mut stmt.items {
5821        if let SelectItem::Expr { expr, .. } = item {
5822            substitute_in_expr(expr, row, ctx, outer_alias);
5823        }
5824    }
5825    if let Some(w) = &mut stmt.where_ {
5826        substitute_in_expr(w, row, ctx, outer_alias);
5827    }
5828    if let Some(gs) = &mut stmt.group_by {
5829        for g in gs {
5830            substitute_in_expr(g, row, ctx, outer_alias);
5831        }
5832    }
5833    if let Some(h) = &mut stmt.having {
5834        substitute_in_expr(h, row, ctx, outer_alias);
5835    }
5836    for o in &mut stmt.order_by {
5837        substitute_in_expr(&mut o.expr, row, ctx, outer_alias);
5838    }
5839    for (_, peer) in &mut stmt.unions {
5840        substitute_in_select(peer, row, ctx, outer_alias);
5841    }
5842}
5843
5844fn substitute_in_expr(e: &mut Expr, row: &Row, ctx: &EvalContext<'_>, outer_alias: &str) {
5845    if let Expr::Column(c) = e
5846        && let Some(qual) = &c.qualifier
5847        && qual.eq_ignore_ascii_case(outer_alias)
5848    {
5849        // Look up the column's index in the outer schema.
5850        if let Some(idx) = ctx
5851            .columns
5852            .iter()
5853            .position(|sc| sc.name.eq_ignore_ascii_case(&c.name))
5854        {
5855            let v = row.values.get(idx).cloned().unwrap_or(Value::Null);
5856            if let Ok(lit) = value_to_literal_expr(v) {
5857                *e = lit;
5858                return;
5859            }
5860        }
5861    }
5862    match e {
5863        Expr::Binary { lhs, rhs, .. } => {
5864            substitute_in_expr(lhs, row, ctx, outer_alias);
5865            substitute_in_expr(rhs, row, ctx, outer_alias);
5866        }
5867        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5868            substitute_in_expr(expr, row, ctx, outer_alias);
5869        }
5870        Expr::Like { expr, pattern, .. } => {
5871            substitute_in_expr(expr, row, ctx, outer_alias);
5872            substitute_in_expr(pattern, row, ctx, outer_alias);
5873        }
5874        Expr::FunctionCall { args, .. } => {
5875            for a in args {
5876                substitute_in_expr(a, row, ctx, outer_alias);
5877            }
5878        }
5879        Expr::Extract { source, .. } => substitute_in_expr(source, row, ctx, outer_alias),
5880        Expr::WindowFunction {
5881            args,
5882            partition_by,
5883            order_by,
5884            ..
5885        } => {
5886            for a in args {
5887                substitute_in_expr(a, row, ctx, outer_alias);
5888            }
5889            for p in partition_by {
5890                substitute_in_expr(p, row, ctx, outer_alias);
5891            }
5892            for (o, _) in order_by {
5893                substitute_in_expr(o, row, ctx, outer_alias);
5894            }
5895        }
5896        Expr::ScalarSubquery(s) => substitute_in_select(s, row, ctx, outer_alias),
5897        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
5898            substitute_in_select(subquery, row, ctx, outer_alias);
5899        }
5900        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
5901    }
5902}
5903
5904/// v4.22: encode a Row to a comparable byte key for UNION-DISTINCT
5905/// dedup inside the recursive iteration. Crude but deterministic
5906/// — Debug prints embed type discriminants so NULL ≠ "" ≠ 0.
5907fn encode_row_key(row: &Row) -> Vec<u8> {
5908    let mut out = Vec::new();
5909    for v in &row.values {
5910        let s = alloc::format!("{v:?}|");
5911        out.extend_from_slice(s.as_bytes());
5912    }
5913    out
5914}
5915
5916fn select_has_window(stmt: &SelectStatement) -> bool {
5917    for item in &stmt.items {
5918        if let SelectItem::Expr { expr, .. } = item
5919            && expr_has_window(expr)
5920        {
5921            return true;
5922        }
5923    }
5924    false
5925}
5926
5927fn expr_has_window(e: &Expr) -> bool {
5928    match e {
5929        Expr::WindowFunction { .. } => true,
5930        Expr::Binary { lhs, rhs, .. } => expr_has_window(lhs) || expr_has_window(rhs),
5931        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5932            expr_has_window(expr)
5933        }
5934        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_window),
5935        Expr::Like { expr, pattern, .. } => expr_has_window(expr) || expr_has_window(pattern),
5936        Expr::Extract { source, .. } => expr_has_window(source),
5937        Expr::ScalarSubquery(_)
5938        | Expr::Exists { .. }
5939        | Expr::InSubquery { .. }
5940        | Expr::Literal(_)
5941        | Expr::Placeholder(_)
5942        | Expr::Column(_) => false,
5943    }
5944}
5945
5946fn collect_window_nodes(e: &Expr, out: &mut Vec<Expr>) {
5947    if let Expr::WindowFunction { .. } = e {
5948        // Deduplicate by structural equality on the expression
5949        // (cheap because window args + partition + order are
5950        // small). Without dedup we'd recompute identical windows
5951        // once per occurrence in the projection.
5952        if !out.iter().any(|x| x == e) {
5953            out.push(e.clone());
5954        }
5955        return;
5956    }
5957    match e {
5958        // Already handled by the early-return at the top.
5959        Expr::WindowFunction { .. } => unreachable!(),
5960        Expr::Binary { lhs, rhs, .. } => {
5961            collect_window_nodes(lhs, out);
5962            collect_window_nodes(rhs, out);
5963        }
5964        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5965            collect_window_nodes(expr, out);
5966        }
5967        Expr::FunctionCall { args, .. } => {
5968            for a in args {
5969                collect_window_nodes(a, out);
5970            }
5971        }
5972        Expr::Like { expr, pattern, .. } => {
5973            collect_window_nodes(expr, out);
5974            collect_window_nodes(pattern, out);
5975        }
5976        Expr::Extract { source, .. } => collect_window_nodes(source, out),
5977        _ => {}
5978    }
5979}
5980
5981fn rewrite_window_to_columns(e: &mut Expr, window_nodes: &[Expr]) {
5982    if let Expr::WindowFunction { .. } = e
5983        && let Some(idx) = window_nodes.iter().position(|w| w == e)
5984    {
5985        *e = Expr::Column(spg_sql::ast::ColumnName {
5986            qualifier: None,
5987            name: alloc::format!("__win_{idx}"),
5988        });
5989        return;
5990    }
5991    match e {
5992        Expr::Binary { lhs, rhs, .. } => {
5993            rewrite_window_to_columns(lhs, window_nodes);
5994            rewrite_window_to_columns(rhs, window_nodes);
5995        }
5996        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5997            rewrite_window_to_columns(expr, window_nodes);
5998        }
5999        Expr::FunctionCall { args, .. } => {
6000            for a in args {
6001                rewrite_window_to_columns(a, window_nodes);
6002            }
6003        }
6004        Expr::Like { expr, pattern, .. } => {
6005            rewrite_window_to_columns(expr, window_nodes);
6006            rewrite_window_to_columns(pattern, window_nodes);
6007        }
6008        Expr::Extract { source, .. } => rewrite_window_to_columns(source, window_nodes),
6009        _ => {}
6010    }
6011}
6012
6013/// Total order over partition-key tuples. NULL sorts as the
6014/// lowest value (matches the `<` partial order's NULL-last
6015/// behaviour with `INFINITY` flipped).
6016fn partition_key_cmp(a: &[Value], b: &[Value]) -> core::cmp::Ordering {
6017    for (x, y) in a.iter().zip(b.iter()) {
6018        let c = value_cmp(x, y);
6019        if c != core::cmp::Ordering::Equal {
6020            return c;
6021        }
6022    }
6023    a.len().cmp(&b.len())
6024}
6025
6026fn order_key_cmp(a: &[(Value, bool)], b: &[(Value, bool)]) -> core::cmp::Ordering {
6027    for ((va, desc), (vb, _)) in a.iter().zip(b.iter()) {
6028        let c = value_cmp(va, vb);
6029        let c = if *desc { c.reverse() } else { c };
6030        if c != core::cmp::Ordering::Equal {
6031            return c;
6032        }
6033    }
6034    a.len().cmp(&b.len())
6035}
6036
6037#[allow(clippy::match_same_arms)] // explicit arms per type document the supported pairs
6038fn value_cmp(a: &Value, b: &Value) -> core::cmp::Ordering {
6039    use core::cmp::Ordering;
6040    match (a, b) {
6041        (Value::Null, Value::Null) => Ordering::Equal,
6042        (Value::Null, _) => Ordering::Less,
6043        (_, Value::Null) => Ordering::Greater,
6044        (Value::Int(x), Value::Int(y)) => x.cmp(y),
6045        (Value::BigInt(x), Value::BigInt(y)) => x.cmp(y),
6046        (Value::SmallInt(x), Value::SmallInt(y)) => x.cmp(y),
6047        (Value::Text(x), Value::Text(y)) => x.cmp(y),
6048        (Value::Bool(x), Value::Bool(y)) => x.cmp(y),
6049        (Value::Float(x), Value::Float(y)) => x.partial_cmp(y).unwrap_or(Ordering::Equal),
6050        (Value::Date(x), Value::Date(y)) => x.cmp(y),
6051        (Value::Timestamp(x), Value::Timestamp(y)) => x.cmp(y),
6052        // Cross-type compare: fall back to the debug rendering —
6053        // same-partition is the goal, exact order is irrelevant.
6054        _ => alloc::format!("{a:?}").cmp(&alloc::format!("{b:?}")),
6055    }
6056}
6057
6058/// Compute the window function's per-row output for one partition.
6059/// `slice` has (partition key, order key, original-row-index)
6060/// tuples already sorted by order key. `filtered_rows` is the
6061/// full row list indexed by original-row-index. `out_vals` is
6062/// the destination, also indexed by original-row-index.
6063#[allow(
6064    clippy::too_many_arguments,
6065    clippy::cast_possible_truncation,
6066    clippy::cast_possible_wrap,
6067    clippy::cast_precision_loss,
6068    clippy::cast_sign_loss,
6069    clippy::doc_markdown,
6070    clippy::too_many_lines,
6071    clippy::type_complexity,
6072    clippy::match_same_arms
6073)]
6074fn compute_window_partition(
6075    name: &str,
6076    args: &[Expr],
6077    ordered: bool,
6078    frame: Option<&WindowFrame>,
6079    null_treatment: spg_sql::ast::NullTreatment,
6080    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6081    filtered_rows: &[&Row],
6082    ctx: &EvalContext<'_>,
6083    out_vals: &mut [Value],
6084) -> Result<(), EngineError> {
6085    let ignore_nulls = matches!(null_treatment, spg_sql::ast::NullTreatment::Ignore);
6086    let lower = name.to_ascii_lowercase();
6087    match lower.as_str() {
6088        "row_number" => {
6089            for (rank, (_, _, idx)) in slice.iter().enumerate() {
6090                out_vals[*idx] = Value::BigInt((rank + 1) as i64);
6091            }
6092            Ok(())
6093        }
6094        "rank" => {
6095            let mut prev_key: Option<&[(Value, bool)]> = None;
6096            let mut current_rank: i64 = 1;
6097            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6098                if let Some(p) = prev_key
6099                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6100                {
6101                    current_rank = (i + 1) as i64;
6102                }
6103                if prev_key.is_none() {
6104                    current_rank = 1;
6105                }
6106                out_vals[*idx] = Value::BigInt(current_rank);
6107                prev_key = Some(okey.as_slice());
6108            }
6109            Ok(())
6110        }
6111        "dense_rank" => {
6112            let mut prev_key: Option<&[(Value, bool)]> = None;
6113            let mut current_rank: i64 = 0;
6114            for (_, okey, idx) in slice {
6115                if prev_key.is_none_or(|p| order_key_cmp(p, okey) != core::cmp::Ordering::Equal) {
6116                    current_rank += 1;
6117                }
6118                out_vals[*idx] = Value::BigInt(current_rank);
6119                prev_key = Some(okey.as_slice());
6120            }
6121            Ok(())
6122        }
6123        "sum" | "avg" | "min" | "max" | "count" | "count_star" => {
6124            // Pre-evaluate the function arg per row in the slice
6125            // (count_star has no arg).
6126            let arg_values: Vec<Value> = if lower == "count_star" || args.is_empty() {
6127                slice.iter().map(|_| Value::Null).collect()
6128            } else {
6129                slice
6130                    .iter()
6131                    .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6132                    .collect::<Result<_, _>>()
6133                    .map_err(EngineError::Eval)?
6134            };
6135            // v4.20: pick the effective frame. Explicit frame
6136            // overrides the implicit default (running for ordered,
6137            // whole-partition for unordered).
6138            let eff = effective_frame(frame, ordered)?;
6139            #[allow(clippy::needless_range_loop)]
6140            for i in 0..slice.len() {
6141                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6142                let mut sum: f64 = 0.0;
6143                let mut count: i64 = 0;
6144                let mut min_v: Option<f64> = None;
6145                let mut max_v: Option<f64> = None;
6146                let mut row_count: i64 = 0;
6147                if lo <= hi {
6148                    for j in lo..=hi {
6149                        let v = &arg_values[j];
6150                        match lower.as_str() {
6151                            "count_star" => row_count += 1,
6152                            "count" => {
6153                                if !v.is_null() {
6154                                    count += 1;
6155                                }
6156                            }
6157                            _ => {
6158                                if let Some(x) = value_to_f64(v) {
6159                                    sum += x;
6160                                    count += 1;
6161                                    min_v = Some(min_v.map_or(x, |m| m.min(x)));
6162                                    max_v = Some(max_v.map_or(x, |m| m.max(x)));
6163                                }
6164                            }
6165                        }
6166                    }
6167                }
6168                let value = match lower.as_str() {
6169                    "count_star" => Value::BigInt(row_count),
6170                    "count" => Value::BigInt(count),
6171                    "sum" => Value::Float(sum),
6172                    "avg" => {
6173                        if count == 0 {
6174                            Value::Null
6175                        } else {
6176                            Value::Float(sum / count as f64)
6177                        }
6178                    }
6179                    "min" => min_v.map_or(Value::Null, Value::Float),
6180                    "max" => max_v.map_or(Value::Null, Value::Float),
6181                    _ => unreachable!(),
6182                };
6183                let (_, _, idx) = &slice[i];
6184                out_vals[*idx] = value;
6185            }
6186            Ok(())
6187        }
6188        "lag" | "lead" => {
6189            // lag(expr [, offset [, default]])
6190            // lead(expr [, offset [, default]])
6191            if args.is_empty() {
6192                return Err(EngineError::Unsupported(alloc::format!(
6193                    "{lower}() requires at least one argument"
6194                )));
6195            }
6196            let offset: i64 = if args.len() >= 2 {
6197                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6198                    .map_err(EngineError::Eval)?;
6199                match v {
6200                    Value::SmallInt(n) => i64::from(n),
6201                    Value::Int(n) => i64::from(n),
6202                    Value::BigInt(n) => n,
6203                    _ => {
6204                        return Err(EngineError::Unsupported(alloc::format!(
6205                            "{lower}() offset must be integer"
6206                        )));
6207                    }
6208                }
6209            } else {
6210                1
6211            };
6212            let default: Value = if args.len() >= 3 {
6213                eval::eval_expr(&args[2], filtered_rows[slice[0].2], ctx)
6214                    .map_err(EngineError::Eval)?
6215            } else {
6216                Value::Null
6217            };
6218            let values: Vec<Value> = slice
6219                .iter()
6220                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6221                .collect::<Result<_, _>>()
6222                .map_err(EngineError::Eval)?;
6223            let n = slice.len();
6224            for (i, (_, _, idx)) in slice.iter().enumerate() {
6225                let signed_offset = if lower == "lag" { -offset } else { offset };
6226                let v = if ignore_nulls {
6227                    // v6.4.2 — IGNORE NULLS: walk in the offset direction
6228                    // skipping NULL values; the `offset`-th non-NULL
6229                    // encountered is the result.
6230                    let step: i64 = if signed_offset >= 0 { 1 } else { -1 };
6231                    let needed: i64 = signed_offset.abs();
6232                    if needed == 0 {
6233                        values[i].clone()
6234                    } else {
6235                        let mut j: i64 = i as i64;
6236                        let mut hits: i64 = 0;
6237                        let mut found: Option<Value> = None;
6238                        loop {
6239                            j += step;
6240                            if j < 0 || j >= n as i64 {
6241                                break;
6242                            }
6243                            #[allow(clippy::cast_sign_loss)]
6244                            let v = &values[j as usize];
6245                            if !v.is_null() {
6246                                hits += 1;
6247                                if hits == needed {
6248                                    found = Some(v.clone());
6249                                    break;
6250                                }
6251                            }
6252                        }
6253                        found.unwrap_or_else(|| default.clone())
6254                    }
6255                } else {
6256                    let target_signed = i64::try_from(i).unwrap_or(i64::MAX) + signed_offset;
6257                    if target_signed < 0
6258                        || target_signed >= i64::try_from(n).unwrap_or(i64::MAX)
6259                    {
6260                        default.clone()
6261                    } else {
6262                        #[allow(clippy::cast_sign_loss)]
6263                        {
6264                            values[target_signed as usize].clone()
6265                        }
6266                    }
6267                };
6268                out_vals[*idx] = v;
6269            }
6270            Ok(())
6271        }
6272        "first_value" | "last_value" | "nth_value" => {
6273            if args.is_empty() {
6274                return Err(EngineError::Unsupported(alloc::format!(
6275                    "{lower}() requires at least one argument"
6276                )));
6277            }
6278            let values: Vec<Value> = slice
6279                .iter()
6280                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6281                .collect::<Result<_, _>>()
6282                .map_err(EngineError::Eval)?;
6283            let nth: usize = if lower == "nth_value" {
6284                if args.len() < 2 {
6285                    return Err(EngineError::Unsupported(
6286                        "nth_value() requires (expr, n)".into(),
6287                    ));
6288                }
6289                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6290                    .map_err(EngineError::Eval)?;
6291                let raw = match v {
6292                    Value::SmallInt(n) => i64::from(n),
6293                    Value::Int(n) => i64::from(n),
6294                    Value::BigInt(n) => n,
6295                    _ => {
6296                        return Err(EngineError::Unsupported(
6297                            "nth_value() n must be integer".into(),
6298                        ));
6299                    }
6300                };
6301                if raw < 1 {
6302                    return Err(EngineError::Unsupported(
6303                        "nth_value() n must be >= 1".into(),
6304                    ));
6305                }
6306                #[allow(clippy::cast_sign_loss)]
6307                {
6308                    raw as usize
6309                }
6310            } else {
6311                0
6312            };
6313            let eff = effective_frame(frame, ordered)?;
6314            for i in 0..slice.len() {
6315                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6316                let (_, _, idx) = &slice[i];
6317                let v = if lo > hi {
6318                    Value::Null
6319                } else if ignore_nulls && matches!(lower.as_str(), "first_value" | "last_value") {
6320                    // v6.4.2 — IGNORE NULLS: skip NULL cells when
6321                    // selecting the boundary value within the frame.
6322                    if lower == "first_value" {
6323                        (lo..=hi)
6324                            .find_map(|j| {
6325                                let v = &values[j];
6326                                (!v.is_null()).then(|| v.clone())
6327                            })
6328                            .unwrap_or(Value::Null)
6329                    } else {
6330                        (lo..=hi)
6331                            .rev()
6332                            .find_map(|j| {
6333                                let v = &values[j];
6334                                (!v.is_null()).then(|| v.clone())
6335                            })
6336                            .unwrap_or(Value::Null)
6337                    }
6338                } else {
6339                    match lower.as_str() {
6340                        "first_value" => values[lo].clone(),
6341                        "last_value" => values[hi].clone(),
6342                        "nth_value" => {
6343                            let pos = lo + nth - 1;
6344                            if pos > hi {
6345                                Value::Null
6346                            } else {
6347                                values[pos].clone()
6348                            }
6349                        }
6350                        _ => unreachable!(),
6351                    }
6352                };
6353                out_vals[*idx] = v;
6354            }
6355            Ok(())
6356        }
6357        "ntile" => {
6358            if args.is_empty() {
6359                return Err(EngineError::Unsupported(
6360                    "ntile(n) requires an integer argument".into(),
6361                ));
6362            }
6363            let v = eval::eval_expr(&args[0], filtered_rows[slice[0].2], ctx)
6364                .map_err(EngineError::Eval)?;
6365            let bucket_count: i64 = match v {
6366                Value::SmallInt(n) => i64::from(n),
6367                Value::Int(n) => i64::from(n),
6368                Value::BigInt(n) => n,
6369                _ => {
6370                    return Err(EngineError::Unsupported(
6371                        "ntile() argument must be integer".into(),
6372                    ));
6373                }
6374            };
6375            if bucket_count < 1 {
6376                return Err(EngineError::Unsupported(
6377                    "ntile() argument must be >= 1".into(),
6378                ));
6379            }
6380            #[allow(clippy::cast_sign_loss)]
6381            let buckets = bucket_count as usize;
6382            let n = slice.len();
6383            // Each bucket gets `base` rows; the first `extras` buckets
6384            // get one extra. PG semantics.
6385            let base = n / buckets;
6386            let extras = n % buckets;
6387            let mut bucket: usize = 1;
6388            let mut remaining_in_bucket = if extras > 0 { base + 1 } else { base };
6389            let mut buckets_with_extra_remaining = extras;
6390            for (_, _, idx) in slice {
6391                if remaining_in_bucket == 0 {
6392                    bucket += 1;
6393                    buckets_with_extra_remaining = buckets_with_extra_remaining.saturating_sub(1);
6394                    remaining_in_bucket = if buckets_with_extra_remaining > 0 {
6395                        base + 1
6396                    } else {
6397                        base
6398                    };
6399                    // Edge: if base==0 and extras==0, all rows fit;
6400                    // shouldn't reach here, but guard anyway.
6401                    if remaining_in_bucket == 0 {
6402                        remaining_in_bucket = 1;
6403                    }
6404                }
6405                out_vals[*idx] = Value::BigInt(i64::try_from(bucket).unwrap_or(i64::MAX));
6406                remaining_in_bucket -= 1;
6407            }
6408            Ok(())
6409        }
6410        "percent_rank" => {
6411            // (rank - 1) / (n - 1) where rank is the standard RANK().
6412            // Single-row partitions get 0.
6413            let n = slice.len();
6414            let mut prev_key: Option<&[(Value, bool)]> = None;
6415            let mut current_rank: i64 = 1;
6416            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6417                if let Some(p) = prev_key
6418                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6419                {
6420                    current_rank = i64::try_from(i + 1).unwrap_or(i64::MAX);
6421                }
6422                if prev_key.is_none() {
6423                    current_rank = 1;
6424                }
6425                #[allow(clippy::cast_precision_loss)]
6426                let pr = if n <= 1 {
6427                    0.0
6428                } else {
6429                    (current_rank - 1) as f64 / (n - 1) as f64
6430                };
6431                out_vals[*idx] = Value::Float(pr);
6432                prev_key = Some(okey.as_slice());
6433            }
6434            Ok(())
6435        }
6436        "cume_dist" => {
6437            // # rows up to and including this row's peer group / n.
6438            let n = slice.len();
6439            // First pass: find peer-group-end rank for each row.
6440            for i in 0..slice.len() {
6441                let peer_end = peer_group_end(slice, i);
6442                #[allow(clippy::cast_precision_loss)]
6443                let cd = (peer_end + 1) as f64 / n as f64;
6444                let (_, _, idx) = &slice[i];
6445                out_vals[*idx] = Value::Float(cd);
6446            }
6447            Ok(())
6448        }
6449        other => Err(EngineError::Unsupported(alloc::format!(
6450            "window function {other:?} not supported (v4.21: row_number/rank/dense_rank/sum/avg/count/min/max/lag/lead/first_value/last_value/nth_value/ntile/percent_rank/cume_dist)"
6451        ))),
6452    }
6453}
6454
6455/// v4.20: resolve the user-provided frame down to a normalised
6456/// `(kind, start, end)`. `None` means default — derive from
6457/// `ordered`: ordered ⇒ RANGE UNBOUNDED PRECEDING AND CURRENT ROW,
6458/// unordered ⇒ ROWS UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
6459/// Single-bound shorthand (e.g. `ROWS 5 PRECEDING`) normalises
6460/// end → CURRENT ROW per the PG spec.
6461fn effective_frame(
6462    frame: Option<&WindowFrame>,
6463    ordered: bool,
6464) -> Result<(FrameKind, FrameBound, FrameBound), EngineError> {
6465    match frame {
6466        None => {
6467            if ordered {
6468                Ok((
6469                    FrameKind::Range,
6470                    FrameBound::UnboundedPreceding,
6471                    FrameBound::CurrentRow,
6472                ))
6473            } else {
6474                Ok((
6475                    FrameKind::Rows,
6476                    FrameBound::UnboundedPreceding,
6477                    FrameBound::UnboundedFollowing,
6478                ))
6479            }
6480        }
6481        Some(fr) => {
6482            let end = fr.end.clone().unwrap_or(FrameBound::CurrentRow);
6483            // Reject start > end (a few impossible combinations).
6484            if matches!(fr.start, FrameBound::UnboundedFollowing)
6485                || matches!(end, FrameBound::UnboundedPreceding)
6486            {
6487                return Err(EngineError::Unsupported(alloc::format!(
6488                    "invalid frame: start={:?} end={:?}",
6489                    fr.start,
6490                    end
6491                )));
6492            }
6493            // RANGE OFFSET PRECEDING / FOLLOWING needs value-typed
6494            // arithmetic on the ORDER BY key (e.g. `RANGE BETWEEN
6495            // INTERVAL '1 day' PRECEDING AND CURRENT ROW`). Not
6496            // implemented in v4.20.
6497            if fr.kind == FrameKind::Range
6498                && (matches!(
6499                    fr.start,
6500                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6501                ) || matches!(
6502                    end,
6503                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6504                ))
6505            {
6506                return Err(EngineError::Unsupported(
6507                    "RANGE with explicit offset bounds is not supported (v4.20: only UNBOUNDED / CURRENT ROW for RANGE)".into(),
6508                ));
6509            }
6510            Ok((fr.kind, fr.start.clone(), end))
6511        }
6512    }
6513}
6514
6515/// Compute `(lo, hi)` row-index bounds inside the partition slice
6516/// for the row at position `i`. Inclusive, clamped to
6517/// `[0, slice.len()-1]`. Empty result if `lo > hi`.
6518#[allow(clippy::type_complexity)]
6519fn frame_bounds_for_row(
6520    eff: &(FrameKind, FrameBound, FrameBound),
6521    i: usize,
6522    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6523) -> (usize, usize) {
6524    let (kind, start, end) = eff;
6525    let n = slice.len();
6526    let last = n.saturating_sub(1);
6527    let (mut lo, mut hi) = match kind {
6528        FrameKind::Rows => {
6529            let lo = match start {
6530                FrameBound::UnboundedPreceding => 0,
6531                FrameBound::OffsetPreceding(k) => {
6532                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6533                    i.saturating_sub(k)
6534                }
6535                FrameBound::CurrentRow => i,
6536                FrameBound::OffsetFollowing(k) => {
6537                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6538                    i.saturating_add(k).min(last)
6539                }
6540                FrameBound::UnboundedFollowing => last,
6541            };
6542            let hi = match end {
6543                FrameBound::UnboundedPreceding => 0,
6544                FrameBound::OffsetPreceding(k) => {
6545                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6546                    i.saturating_sub(k)
6547                }
6548                FrameBound::CurrentRow => i,
6549                FrameBound::OffsetFollowing(k) => {
6550                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6551                    i.saturating_add(k).min(last)
6552                }
6553                FrameBound::UnboundedFollowing => last,
6554            };
6555            (lo, hi)
6556        }
6557        FrameKind::Range => {
6558            // RANGE bounds are peer-aware. With only UNBOUNDED and
6559            // CURRENT ROW supported (rejected at effective_frame for
6560            // explicit offsets), the start/end map to the
6561            // partition's full extent at the same-order-key peer
6562            // group boundary.
6563            let lo = match start {
6564                FrameBound::UnboundedPreceding => 0,
6565                FrameBound::CurrentRow => peer_group_start(slice, i),
6566                FrameBound::UnboundedFollowing => last,
6567                _ => unreachable!("offset bounds rejected for RANGE"),
6568            };
6569            let hi = match end {
6570                FrameBound::UnboundedPreceding => 0,
6571                FrameBound::CurrentRow => peer_group_end(slice, i),
6572                FrameBound::UnboundedFollowing => last,
6573                _ => unreachable!("offset bounds rejected for RANGE"),
6574            };
6575            (lo, hi)
6576        }
6577    };
6578    if hi >= n {
6579        hi = last;
6580    }
6581    if lo >= n {
6582        lo = last;
6583    }
6584    (lo, hi)
6585}
6586
6587/// Find the inclusive index of the first row with the same ORDER
6588/// BY key as `slice[i]`. Slice is already sorted by partition then
6589/// order, so peers are contiguous.
6590#[allow(clippy::type_complexity)]
6591fn peer_group_start(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6592    let key = &slice[i].1;
6593    let mut j = i;
6594    while j > 0 && order_key_cmp(&slice[j - 1].1, key) == core::cmp::Ordering::Equal {
6595        j -= 1;
6596    }
6597    j
6598}
6599
6600/// Find the inclusive index of the last row with the same ORDER
6601/// BY key as `slice[i]`.
6602#[allow(clippy::type_complexity)]
6603fn peer_group_end(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6604    let key = &slice[i].1;
6605    let mut j = i;
6606    while j + 1 < slice.len() && order_key_cmp(&slice[j + 1].1, key) == core::cmp::Ordering::Equal {
6607        j += 1;
6608    }
6609    j
6610}
6611
6612fn value_to_f64(v: &Value) -> Option<f64> {
6613    match v {
6614        Value::SmallInt(n) => Some(f64::from(*n)),
6615        Value::Int(n) => Some(f64::from(*n)),
6616        #[allow(clippy::cast_precision_loss)]
6617        Value::BigInt(n) => Some(*n as f64),
6618        Value::Float(x) => Some(*x),
6619        _ => None,
6620    }
6621}
6622
6623/// Quick scan for any subquery-bearing node in a SELECT's WHERE /
6624/// projection / `order_by` — saves cloning the AST when there are
6625/// none (the common case).
6626fn expr_tree_has_subquery(stmt: &SelectStatement) -> bool {
6627    let mut any = false;
6628    for item in &stmt.items {
6629        if let SelectItem::Expr { expr, .. } = item {
6630            any = any || expr_has_subquery(expr);
6631        }
6632    }
6633    if let Some(w) = &stmt.where_ {
6634        any = any || expr_has_subquery(w);
6635    }
6636    if let Some(h) = &stmt.having {
6637        any = any || expr_has_subquery(h);
6638    }
6639    for o in &stmt.order_by {
6640        any = any || expr_has_subquery(&o.expr);
6641    }
6642    for (_, peer) in &stmt.unions {
6643        any = any || expr_tree_has_subquery(peer);
6644    }
6645    any
6646}
6647
6648fn expr_has_subquery(e: &Expr) -> bool {
6649    match e {
6650        Expr::ScalarSubquery(_) | Expr::Exists { .. } | Expr::InSubquery { .. } => true,
6651        Expr::Binary { lhs, rhs, .. } => expr_has_subquery(lhs) || expr_has_subquery(rhs),
6652        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6653            expr_has_subquery(expr)
6654        }
6655        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_subquery),
6656        Expr::Like { expr, pattern, .. } => expr_has_subquery(expr) || expr_has_subquery(pattern),
6657        Expr::Extract { source, .. } => expr_has_subquery(source),
6658        Expr::WindowFunction {
6659            args,
6660            partition_by,
6661            order_by,
6662            ..
6663        } => {
6664            args.iter().any(expr_has_subquery)
6665                || partition_by.iter().any(expr_has_subquery)
6666                || order_by.iter().any(|(e, _)| expr_has_subquery(e))
6667        }
6668        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
6669    }
6670}
6671
6672/// v4.10 helper: materialise a runtime `Value` back into an AST
6673/// `Expr::Literal` for the subquery-rewrite path. Supports the
6674/// types `Literal` can represent (Integer / Float / Text / Bool /
6675/// Null). Date / Timestamp / Numeric / Vector / Interval / JSON
6676/// would lose precision through Literal and aren't supported in
6677/// uncorrelated-subquery results; they error with a clear hint.
6678fn value_to_literal_expr(v: Value) -> Result<Expr, EngineError> {
6679    let lit = match v {
6680        Value::Null => Literal::Null,
6681        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
6682        Value::Int(n) => Literal::Integer(i64::from(n)),
6683        Value::BigInt(n) => Literal::Integer(n),
6684        Value::Float(x) => Literal::Float(x),
6685        Value::Text(s) | Value::Json(s) => Literal::String(s),
6686        Value::Bool(b) => Literal::Bool(b),
6687        other => {
6688            return Err(EngineError::Unsupported(alloc::format!(
6689                "subquery result type {:?} not yet materialisable; cast to text or integer in the inner SELECT",
6690                other.data_type()
6691            )));
6692        }
6693    };
6694    Ok(Expr::Literal(lit))
6695}
6696
6697/// v6.1.1 — walk the prepared `Statement` AST and replace every
6698/// `Expr::Placeholder(n)` with `Expr::Literal(value_to_literal(
6699/// params[n-1]))`. The dispatch downstream sees a `Statement`
6700/// indistinguishable from a simple-query parse, so the exec path
6701/// stays unchanged.
6702///
6703/// Errors fall into one shape: a `$N` references past the bound
6704/// `params.len()`. Out-of-range happens when the Bind didn't
6705/// supply enough values; pgwire surfaces this as a protocol error
6706/// to the client.
6707fn substitute_placeholders(stmt: &mut Statement, params: &[Value]) -> Result<(), EngineError> {
6708    match stmt {
6709        Statement::Select(s) => substitute_select(s, params)?,
6710        Statement::Insert(ins) => {
6711            for row in &mut ins.rows {
6712                for e in row {
6713                    substitute_expr(e, params)?;
6714                }
6715            }
6716        }
6717        Statement::Update(u) => {
6718            for (_, e) in &mut u.assignments {
6719                substitute_expr(e, params)?;
6720            }
6721            if let Some(w) = &mut u.where_ {
6722                substitute_expr(w, params)?;
6723            }
6724        }
6725        Statement::Delete(d) => {
6726            if let Some(w) = &mut d.where_ {
6727                substitute_expr(w, params)?;
6728            }
6729        }
6730        Statement::Explain(e) => substitute_select(&mut e.inner, params)?,
6731        // Other statements (CREATE / BEGIN / SHOW / …) have no
6732        // expression slots; no walk needed.
6733        _ => {}
6734    }
6735    Ok(())
6736}
6737
6738fn substitute_select(
6739    s: &mut SelectStatement,
6740    params: &[Value],
6741) -> Result<(), EngineError> {
6742    for item in &mut s.items {
6743        if let SelectItem::Expr { expr, .. } = item {
6744            substitute_expr(expr, params)?;
6745        }
6746    }
6747    if let Some(w) = &mut s.where_ {
6748        substitute_expr(w, params)?;
6749    }
6750    if let Some(gs) = &mut s.group_by {
6751        for g in gs {
6752            substitute_expr(g, params)?;
6753        }
6754    }
6755    if let Some(h) = &mut s.having {
6756        substitute_expr(h, params)?;
6757    }
6758    for o in &mut s.order_by {
6759        substitute_expr(&mut o.expr, params)?;
6760    }
6761    for (_, peer) in &mut s.unions {
6762        substitute_select(peer, params)?;
6763    }
6764    Ok(())
6765}
6766
6767fn substitute_expr(e: &mut Expr, params: &[Value]) -> Result<(), EngineError> {
6768    if let Expr::Placeholder(n) = e {
6769        let idx = usize::from(*n).saturating_sub(1);
6770        let v = params.get(idx).ok_or_else(|| {
6771            EngineError::Eval(EvalError::PlaceholderOutOfRange {
6772                n: *n,
6773                bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
6774            })
6775        })?;
6776        *e = Expr::Literal(value_to_literal(v.clone()));
6777        return Ok(());
6778    }
6779    match e {
6780        Expr::Binary { lhs, rhs, .. } => {
6781            substitute_expr(lhs, params)?;
6782            substitute_expr(rhs, params)?;
6783        }
6784        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6785            substitute_expr(expr, params)?;
6786        }
6787        Expr::FunctionCall { args, .. } => {
6788            for a in args {
6789                substitute_expr(a, params)?;
6790            }
6791        }
6792        Expr::Like { expr, pattern, .. } => {
6793            substitute_expr(expr, params)?;
6794            substitute_expr(pattern, params)?;
6795        }
6796        Expr::Extract { source, .. } => substitute_expr(source, params)?,
6797        Expr::ScalarSubquery(s) => substitute_select(s, params)?,
6798        Expr::Exists { subquery, .. } => substitute_select(subquery, params)?,
6799        Expr::InSubquery { expr, subquery, .. } => {
6800            substitute_expr(expr, params)?;
6801            substitute_select(subquery, params)?;
6802        }
6803        Expr::WindowFunction {
6804            args,
6805            partition_by,
6806            order_by,
6807            ..
6808        } => {
6809            for a in args {
6810                substitute_expr(a, params)?;
6811            }
6812            for p in partition_by {
6813                substitute_expr(p, params)?;
6814            }
6815            for (e, _) in order_by {
6816                substitute_expr(e, params)?;
6817            }
6818        }
6819        Expr::Literal(_) | Expr::Column(_) => {}
6820        // Already handled above.
6821        Expr::Placeholder(_) => unreachable!("Placeholder handled at top of fn"),
6822    }
6823    Ok(())
6824}
6825
// NOTE: the paragraph below describes `value_to_literal`, which is
// defined further down in this file. It is kept as a plain comment so
// rustdoc does not attach it to `sort_values_for_histogram`.
//
// v6.1.1 — convert a runtime `Value` into the closest matching
// `Literal` for the substitute walker. Lossless for the simple
// scalars (Int / Float / Text / Bool); Numeric / Date / Timestamp
// / Json / Interval render as their canonical text form so the
// downstream coerce_value can re-parse against the target column
// type. SQ8 / HalfVector cells are NOT expected as bind params;
// pgwire's Bind decodes vector params to the f32 representation
// before they reach this helper.
6834/// v6.2.0 — total ordering on `Value`s used by ANALYZE to sort a
6835/// column's non-NULL sample before histogram building. Cross-type
6836/// pairs (Int vs Float, Date vs Timestamp, …) compare via the
6837/// same widening the eval-side `compare` operator uses; everything
6838/// else (the genuinely-incompatible pairs) falls back to ordering
6839/// by canonical string form so the sort is still total + stable.
6840/// Vector / SQ8 / Half / Json / Numeric / Interval values reach
6841/// here only via the string-fallback path because vector columns
6842/// are filtered out upstream.
6843fn sort_values_for_histogram(a: &Value, b: &Value) -> core::cmp::Ordering {
6844    use core::cmp::Ordering;
6845    match (a, b) {
6846        (Value::SmallInt(a), Value::SmallInt(b)) => a.cmp(b),
6847        (Value::Int(a), Value::Int(b)) => a.cmp(b),
6848        (Value::BigInt(a), Value::BigInt(b)) => a.cmp(b),
6849        (Value::SmallInt(a), Value::Int(b)) => i32::from(*a).cmp(b),
6850        (Value::Int(a), Value::SmallInt(b)) => a.cmp(&i32::from(*b)),
6851        (Value::Int(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
6852        (Value::BigInt(a), Value::Int(b)) => a.cmp(&i64::from(*b)),
6853        (Value::SmallInt(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
6854        (Value::BigInt(a), Value::SmallInt(b)) => a.cmp(&i64::from(*b)),
6855        (Value::Float(a), Value::Float(b)) => a.partial_cmp(b).unwrap_or(Ordering::Equal),
6856        (Value::Text(a), Value::Text(b)) | (Value::Json(a), Value::Json(b)) => a.cmp(b),
6857        (Value::Bool(a), Value::Bool(b)) => a.cmp(b),
6858        (Value::Date(a), Value::Date(b)) => a.cmp(b),
6859        (Value::Timestamp(a), Value::Timestamp(b)) => a.cmp(b),
6860        // Mixed numeric/float — widen to f64 and compare.
6861        (Value::SmallInt(n), Value::Float(x)) => {
6862            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
6863        }
6864        (Value::Float(x), Value::SmallInt(n)) => {
6865            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
6866        }
6867        (Value::Int(n), Value::Float(x)) => {
6868            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
6869        }
6870        (Value::Float(x), Value::Int(n)) => {
6871            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
6872        }
6873        (Value::BigInt(n), Value::Float(x)) => {
6874            #[allow(clippy::cast_precision_loss)]
6875            let nf = *n as f64;
6876            nf.partial_cmp(x).unwrap_or(Ordering::Equal)
6877        }
6878        (Value::Float(x), Value::BigInt(n)) => {
6879            #[allow(clippy::cast_precision_loss)]
6880            let nf = *n as f64;
6881            x.partial_cmp(&nf).unwrap_or(Ordering::Equal)
6882        }
6883        // Cross-type fallback: lexicographic on canonical form.
6884        // Total + stable so the sort is well-defined.
6885        _ => canonical_value_repr(a).cmp(&canonical_value_repr(b)),
6886    }
6887}
6888
/// v6.2.0 — render the histogram bounds list as a `[v0, v1, ...]`
/// string for the `spg_statistic.histogram_bounds` column.
///
/// Entries are separated by `", "`. An entry is wrapped in double
/// quotes when it is empty or contains `,`, `[`, `]` or `"`; inside
/// the quotes, `"` and `\` are each prefixed with a backslash. All
/// other entries are emitted verbatim. v6.2.0 only uses the rendered
/// form for human consumption, so the escaping is conservative.
fn render_histogram_bounds(bounds: &[alloc::string::String]) -> alloc::string::String {
    let mut rendered = alloc::string::String::with_capacity(bounds.len() * 8 + 2);
    rendered.push('[');
    let mut first = true;
    for bound in bounds {
        if !first {
            rendered.push_str(", ");
        }
        first = false;

        let plain = !bound.is_empty() && !bound.contains([',', '[', ']', '"']);
        if plain {
            rendered.push_str(bound);
            continue;
        }

        // Quoted form with backslash escapes.
        rendered.push('"');
        for ch in bound.chars() {
            if matches!(ch, '"' | '\\') {
                rendered.push('\\');
            }
            rendered.push(ch);
        }
        rendered.push('"');
    }
    rendered.push(']');
    rendered
}
6919
6920/// v6.2.0 — canonical textual form of a `Value` for histogram
6921/// bound storage. Strings used by ANALYZE for sort + bound output.
6922/// INT / BIGINT → decimal; FLOAT → shortest-round-trip via
6923/// `{:?}`; TEXT pass-through; BOOL → `t` / `f`; DATE / TIMESTAMP →
6924/// the same form `format_date` / `format_timestamp` produce for
6925/// SQL Display. Vector / SQ8 / Half / Json / Numeric / Interval
6926/// reach this only via a non-Vector column (vector columns are
6927/// skipped upstream); they fall back to a Debug-derived form so
6928/// stats still serialise without crashing.
6929pub(crate) fn canonical_value_repr(v: &Value) -> alloc::string::String {
6930    match v {
6931        Value::Null => "NULL".to_string(),
6932        Value::SmallInt(n) => alloc::format!("{n}"),
6933        Value::Int(n) => alloc::format!("{n}"),
6934        Value::BigInt(n) => alloc::format!("{n}"),
6935        Value::Float(x) => alloc::format!("{x:?}"),
6936        Value::Text(s) | Value::Json(s) => s.clone(),
6937        Value::Bool(b) => if *b { "t" } else { "f" }.to_string(),
6938        Value::Date(d) => eval::format_date(*d),
6939        Value::Timestamp(t) => eval::format_timestamp(*t),
6940        Value::Interval { months, micros } => eval::format_interval(*months, *micros),
6941        Value::Numeric { scaled, scale } => eval::format_numeric(*scaled, *scale),
6942        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
6943            // Unreachable in practice (vector columns are filtered
6944            // out before this). Defensive fallback so a future
6945            // vector-stats path doesn't crash.
6946            alloc::format!("{v:?}")
6947        }
6948        // v7.5.0 — Value is #[non_exhaustive] for downstream
6949        // forward-compat. Future variants fall through to Debug
6950        // form here (same shape as the vector fallback above).
6951        _ => alloc::format!("{v:?}"),
6952    }
6953}
6954
/// v6.2.0 — reports whether a table is an engine-managed catalog
/// table that a bare `ANALYZE` (no target) should skip.
///
/// Currently always `false`: v6.2.0 has no internal tables yet
/// (publications / subscriptions / users / statistics all live as
/// engine fields, not catalog tables), so every existing user table
/// is analysed. The function is a reserved hook for when that changes.
const fn is_internal_table_name(_name: &str) -> bool {
    false
}
6964
6965fn value_to_literal(v: Value) -> Literal {
6966    match v {
6967        Value::Null => Literal::Null,
6968        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
6969        Value::Int(n) => Literal::Integer(i64::from(n)),
6970        Value::BigInt(n) => Literal::Integer(n),
6971        Value::Float(x) => Literal::Float(x),
6972        Value::Text(s) | Value::Json(s) => Literal::String(s),
6973        Value::Bool(b) => Literal::Bool(b),
6974        Value::Vector(v) => Literal::Vector(v),
6975        Value::Numeric { scaled, scale } => {
6976            Literal::String(eval::format_numeric(scaled, scale))
6977        }
6978        Value::Date(d) => Literal::String(eval::format_date(d)),
6979        Value::Timestamp(t) => Literal::String(eval::format_timestamp(t)),
6980        Value::Interval { months, micros } => Literal::Interval {
6981            months,
6982            micros,
6983            text: eval::format_interval(months, micros),
6984        },
6985        // SQ8 / halfvec cells dequantise to f32 before reaching the
6986        // substitute walker; pgwire's Bind path handles that.
6987        Value::Sq8Vector(q) => Literal::Vector(spg_storage::quantize::dequantize(&q)),
6988        Value::HalfVector(h) => Literal::Vector(h.to_f32_vec()),
6989        // v7.5.0 — Value is #[non_exhaustive]; future variants
6990        // render as Debug-form String literal until explicit
6991        // mapping is added.
6992        v => Literal::String(alloc::format!("{v:?}")),
6993    }
6994}
6995
6996fn rewrite_clock_calls(stmt: &mut Statement, now_micros: Option<i64>) {
6997    let Some(now) = now_micros else {
6998        return;
6999    };
7000    match stmt {
7001        Statement::Select(s) => rewrite_select_clock(s, now),
7002        Statement::Insert(ins) => {
7003            for row in &mut ins.rows {
7004                for e in row {
7005                    rewrite_expr_clock(e, now);
7006                }
7007            }
7008        }
7009        _ => {}
7010    }
7011}
7012
7013fn rewrite_select_clock(s: &mut SelectStatement, now: i64) {
7014    for item in &mut s.items {
7015        if let SelectItem::Expr { expr, .. } = item {
7016            rewrite_expr_clock(expr, now);
7017        }
7018    }
7019    if let Some(w) = &mut s.where_ {
7020        rewrite_expr_clock(w, now);
7021    }
7022    if let Some(gs) = &mut s.group_by {
7023        for g in gs {
7024            rewrite_expr_clock(g, now);
7025        }
7026    }
7027    if let Some(h) = &mut s.having {
7028        rewrite_expr_clock(h, now);
7029    }
7030    for o in &mut s.order_by {
7031        rewrite_expr_clock(&mut o.expr, now);
7032    }
7033    for (_, peer) in &mut s.unions {
7034        rewrite_select_clock(peer, now);
7035    }
7036}
7037
7038/// v3.0.3 hot path: every recursion lands in exactly one `match` arm.
7039/// Literal / Column-with-qualifier (the dominant cases on a typical
7040/// AST) take a single pattern dispatch and exit. The clock-rewrite
7041/// targets (zero-arg `NOW` / `CURRENT_TIMESTAMP` / `CURRENT_DATE`
7042/// functions, and bare `CURRENT_TIMESTAMP` / `CURRENT_DATE` column
7043/// refs) sit on their own arms with match guards so the fall-through
7044/// to the recursive arms is unambiguous.
7045fn rewrite_expr_clock(e: &mut Expr, now: i64) {
7046    // Fast-path test on the no-recursion shapes first. We can't fold
7047    // them into the big match below because they need to *replace* `e`
7048    // outright; the recursive arms below match on its sub-fields.
7049    if let Some(replacement) = clock_replacement_for(e, now) {
7050        *e = replacement;
7051        return;
7052    }
7053    match e {
7054        Expr::Binary { lhs, rhs, .. } => {
7055            rewrite_expr_clock(lhs, now);
7056            rewrite_expr_clock(rhs, now);
7057        }
7058        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7059            rewrite_expr_clock(expr, now);
7060        }
7061        Expr::FunctionCall { args, .. } => {
7062            for a in args {
7063                rewrite_expr_clock(a, now);
7064            }
7065        }
7066        Expr::Like { expr, pattern, .. } => {
7067            rewrite_expr_clock(expr, now);
7068            rewrite_expr_clock(pattern, now);
7069        }
7070        Expr::Extract { source, .. } => rewrite_expr_clock(source, now),
7071        // v4.10 subquery nodes — recurse into the inner SELECT's
7072        // expression slots so e.g. SELECT NOW() in a scalar
7073        // subquery picks up the same instant as the outer query.
7074        Expr::ScalarSubquery(s) => rewrite_select_clock(s, now),
7075        Expr::Exists { subquery, .. } => rewrite_select_clock(subquery, now),
7076        Expr::InSubquery { expr, subquery, .. } => {
7077            rewrite_expr_clock(expr, now);
7078            rewrite_select_clock(subquery, now);
7079        }
7080        // v4.12 window functions — args + PARTITION BY + ORDER BY
7081        // may all reference clock literals.
7082        Expr::WindowFunction {
7083            args,
7084            partition_by,
7085            order_by,
7086            ..
7087        } => {
7088            for a in args {
7089                rewrite_expr_clock(a, now);
7090            }
7091            for p in partition_by {
7092                rewrite_expr_clock(p, now);
7093            }
7094            for (e, _) in order_by {
7095                rewrite_expr_clock(e, now);
7096            }
7097        }
7098        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
7099    }
7100}
7101
7102/// Returns `Some(Expr)` when `e` is one of the clock-call shapes that
7103/// must be rewritten; otherwise `None` so the caller falls through to
7104/// the recursive walk. Identifies both function-call forms (`NOW()` /
7105/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()`) and bare-identifier forms
7106/// (`CURRENT_TIMESTAMP` / `CURRENT_DATE` as unqualified column refs,
7107/// which is how PG accepts them without parens).
7108fn clock_replacement_for(e: &Expr, now: i64) -> Option<Expr> {
7109    let (kind, name) = match e {
7110        Expr::FunctionCall { name, args } if args.is_empty() => (ClockSite::Fn, name.as_str()),
7111        Expr::Column(c) if c.qualifier.is_none() => (ClockSite::BareIdent, c.name.as_str()),
7112        _ => return None,
7113    };
7114    // ASCII case-insensitive name match. Limited to the three keywords
7115    // that actually need rewriting.
7116    let matched = match name.len() {
7117        3 if kind == ClockSite::Fn && name.eq_ignore_ascii_case("now") => Some(true),
7118        12 if name.eq_ignore_ascii_case("current_date") => Some(false),
7119        17 if name.eq_ignore_ascii_case("current_timestamp") => Some(true),
7120        _ => None,
7121    };
7122    let is_timestamp = matched?;
7123    let payload = if is_timestamp {
7124        now
7125    } else {
7126        now.div_euclid(86_400_000_000)
7127    };
7128    let target = if is_timestamp {
7129        spg_sql::ast::CastTarget::Timestamp
7130    } else {
7131        spg_sql::ast::CastTarget::Date
7132    };
7133    Some(Expr::Cast {
7134        expr: alloc::boxed::Box::new(Expr::Literal(spg_sql::ast::Literal::Integer(payload))),
7135        target,
7136    })
7137}
7138
/// Syntactic position in which a potential clock keyword was found.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ClockSite {
    /// A zero-argument function call, e.g. `NOW()`.
    Fn,
    /// An unqualified column reference, e.g. bare `CURRENT_DATE`.
    BareIdent,
}
7144
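// NOTE: the paragraph below describes `resolve_order_by_position`,
// which is defined after `expand_group_by_all`. It is kept as a plain
// comment so rustdoc does not attach it to the wrong function.
//
// `ORDER BY <integer>` references the N-th SELECT item (1-based).
// Swap the integer literal for the matching item's expression so the
// executor doesn't need a special-case branch. Recurses into UNION
// peers because each peer keeps its own SELECT list.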
7149/// v6.4.1 — expand `GROUP BY ALL` to every non-aggregate SELECT-list
7150/// item. Mirrors DuckDB / PG 19 semantics. Wildcards (`SELECT * …`)
7151/// are NOT expanded by GROUP BY ALL (PG 19 leaves the wildcard intact
7152/// and groups by whatever explicit non-aggregates remain — none in
7153/// the wildcard-only case, which still works for non-aggregate
7154/// queries).
7155fn expand_group_by_all(s: &mut SelectStatement) {
7156    if !s.group_by_all {
7157        for (_, peer) in &mut s.unions {
7158            expand_group_by_all(peer);
7159        }
7160        return;
7161    }
7162    let mut groups: Vec<Expr> = Vec::new();
7163    for item in &s.items {
7164        if let SelectItem::Expr { expr, .. } = item
7165            && !aggregate::contains_aggregate(expr)
7166        {
7167            groups.push(expr.clone());
7168        }
7169    }
7170    s.group_by = Some(groups);
7171    s.group_by_all = false;
7172    for (_, peer) in &mut s.unions {
7173        expand_group_by_all(peer);
7174    }
7175}
7176
7177fn resolve_order_by_position(s: &mut SelectStatement) {
7178    // v6.4.0 — iterate every ORDER BY key. Position references
7179    // (`ORDER BY 2`) bind to the 1-based projection index;
7180    // identifier references that match a SELECT-list alias bind to
7181    // the projected expression (Step 4 of L3a).
7182    for order in &mut s.order_by {
7183        match &order.expr {
7184            Expr::Literal(Literal::Integer(n)) if *n >= 1 => {
7185                if let Ok(idx_one_based) = usize::try_from(*n) {
7186                    let idx = idx_one_based - 1;
7187                    if idx < s.items.len()
7188                        && let SelectItem::Expr { expr, .. } = &s.items[idx]
7189                    {
7190                        order.expr = expr.clone();
7191                    }
7192                }
7193            }
7194            Expr::Column(c) if c.qualifier.is_none() => {
7195                // Alias-in-ORDER-BY lookup.
7196                for item in &s.items {
7197                    if let SelectItem::Expr {
7198                        expr,
7199                        alias: Some(a),
7200                    } = item
7201                        && a == &c.name
7202                    {
7203                        order.expr = expr.clone();
7204                        break;
7205                    }
7206                }
7207            }
7208            _ => {}
7209        }
7210    }
7211    for (_, peer) in &mut s.unions {
7212        resolve_order_by_position(peer);
7213    }
7214}
7215
7216/// Sort `tagged` by `f64` key, reversing the comparator under DESC.
7217/// Used by the UNION ORDER BY path; per-block paths inline the same
7218/// comparator because they already hold `&OrderBy` directly.
7219/// v3.1.1: partial-sort helper. When `keep` (= offset + limit) is
7220/// strictly less than `tagged.len()`, run `select_nth_unstable_by` to
7221/// partition the prefix in O(n), then sort just that prefix in O(k
7222/// log k). Total O(n + k log k), vs O(n log n) for a full sort. The
7223/// caller decides what `keep` is; passing `None` (no LIMIT) keeps the
7224/// full-sort behaviour.
7225///
7226/// `tagged` holds `(Option<f64>, Row)` (the SELECT path) — `None` keys
7227/// sort last in ascending order, mirroring NULL-sorts-last in SQL.
7228fn partial_sort_tagged(
7229    tagged: &mut Vec<(Vec<f64>, Row)>,
7230    keep: Option<usize>,
7231    descs: &[bool],
7232) {
7233    let cmp = |a: &(Vec<f64>, Row), b: &(Vec<f64>, Row)| cmp_multi_key(&a.0, &b.0, descs);
7234    match keep {
7235        Some(k) if k < tagged.len() && k > 0 => {
7236            let pivot = k - 1;
7237            tagged.select_nth_unstable_by(pivot, cmp);
7238            tagged[..k].sort_by(cmp);
7239            tagged.truncate(k);
7240        }
7241        _ => {
7242            tagged.sort_by(cmp);
7243        }
7244    }
7245}
7246
7247fn sort_by_keys(tagged: &mut [(Vec<f64>, Row)], descs: &[bool]) {
7248    tagged.sort_by(|a, b| cmp_multi_key(&a.0, &b.0, descs));
7249}
7250
/// v6.4.0 — multi-key ORDER BY comparator.
///
/// Compares `a` and `b` key by key (stopping at the shorter of the
/// two) and returns the first non-equal result. `descs[i]` reverses
/// the comparison of key `i`; a missing flag means ascending. NULL is
/// encoded by the key builder as `f64::INFINITY`, so it sorts last in
/// ASC and first in DESC (matches the PG default).
///
/// NaN handling: a NaN key compares equal to another NaN and greater
/// than every non-NaN key (including the NULL sentinel), mirroring
/// PostgreSQL's float ordering. Treating NaN as "equal to everything"
/// would not be a total order — `1 == NaN == 2` but `1 < 2` — which
/// lets `sort_by` / `select_nth_unstable_by` return an unspecified
/// order or panic.
fn cmp_multi_key(a: &[f64], b: &[f64], descs: &[bool]) -> core::cmp::Ordering {
    for (i, (ka, kb)) in a.iter().zip(b).enumerate() {
        // `partial_cmp` is `None` only when at least one side is NaN;
        // `false < true` then places the NaN side last.
        let ascending = ka
            .partial_cmp(kb)
            .unwrap_or_else(|| ka.is_nan().cmp(&kb.is_nan()));
        let ord = if descs.get(i).copied().unwrap_or(false) {
            ascending.reverse()
        } else {
            ascending
        };
        if ord.is_ne() {
            return ord;
        }
    }
    core::cmp::Ordering::Equal
}
7269
7270/// v6.4.0 — eval every ORDER BY expression for a row and pack the
7271/// resulting keys into a `Vec<f64>`. NULL → `f64::INFINITY`.
7272fn build_order_keys(
7273    order_by: &[OrderBy],
7274    row: &Row,
7275    ctx: &EvalContext,
7276) -> Result<Vec<f64>, EngineError> {
7277    let mut keys = Vec::with_capacity(order_by.len());
7278    for o in order_by {
7279        let v = eval::eval_expr(&o.expr, row, ctx)?;
7280        keys.push(value_to_order_key(&v)?);
7281    }
7282    Ok(keys)
7283}
7284
7285/// Drop the first `offset` rows then truncate to `limit`. PG / `MySQL`
7286/// agree: OFFSET applies *after* ORDER BY but *before* LIMIT (so
7287/// `LIMIT 10 OFFSET 5` keeps rows 6..=15).
7288fn apply_offset_and_limit(rows: &mut Vec<Row>, offset: Option<u32>, limit: Option<u32>) {
7289    if let Some(off) = offset {
7290        let off = off as usize;
7291        if off >= rows.len() {
7292            rows.clear();
7293        } else {
7294            rows.drain(..off);
7295        }
7296    }
7297    if let Some(n) = limit {
7298        rows.truncate(n as usize);
7299    }
7300}
7301
7302/// v7.6.1 — resolve a parser-level `ForeignKeyConstraint` (column
7303/// names + parent table name) into the storage-layer shape (column
7304/// indices + same parent table). Validates everything the engine
7305/// needs to know about the FK at CREATE TABLE time:
7306///
7307///   - parent table exists (catalog lookup, unless self-referencing)
7308///   - parent columns exist on the parent table
7309///   - parent column list matches the local arity (defaults to the
7310///     parent's primary index column when omitted)
7311///   - parent columns are covered by a `BTree` UNIQUE-class index
7312///     (SPG's stand-in for `PRIMARY KEY`/`UNIQUE`) — required so
7313///     the v7.6.2 INSERT path can do an O(log n) parent lookup
7314///   - local columns exist on the table being created
7315fn resolve_foreign_key(
7316    local_table_name: &str,
7317    local_cols: &[ColumnSchema],
7318    fk: spg_sql::ast::ForeignKeyConstraint,
7319    catalog: &Catalog,
7320) -> Result<spg_storage::ForeignKeyConstraint, EngineError> {
7321    // Resolve local columns.
7322    let mut local_columns = Vec::with_capacity(fk.columns.len());
7323    for name in &fk.columns {
7324        let pos = local_cols
7325            .iter()
7326            .position(|c| c.name == *name)
7327            .ok_or_else(|| {
7328                EngineError::Unsupported(alloc::format!(
7329                    "FOREIGN KEY references unknown local column {name:?}"
7330                ))
7331            })?;
7332        local_columns.push(pos);
7333    }
7334    // Self-referencing FK: parent table is the one we're creating.
7335    // The parent column resolution uses the local column list since
7336    // the catalog doesn't have this table yet.
7337    let is_self_ref = fk.parent_table == local_table_name;
7338    let (parent_cols_for_lookup, parent_table_str): (&[ColumnSchema], &str) = if is_self_ref {
7339        (local_cols, local_table_name)
7340    } else {
7341        let parent_table = catalog.get(&fk.parent_table).ok_or_else(|| {
7342            EngineError::Storage(StorageError::TableNotFound {
7343                name: fk.parent_table.clone(),
7344            })
7345        })?;
7346        (parent_table.schema().columns.as_slice(), fk.parent_table.as_str())
7347    };
7348    // Resolve parent column names → positions. If the FK omitted the
7349    // parent column list, fall back to the parent's primary index
7350    // column (single-column only — composite default is rejected
7351    // because there's no unambiguous "PK" in SPG's index list).
7352    let parent_columns: Vec<usize> = if fk.parent_columns.is_empty() {
7353        if fk.columns.len() != 1 {
7354            return Err(EngineError::Unsupported(
7355                "composite FOREIGN KEY without explicit parent column list is not supported \
7356                 — list the parent columns explicitly"
7357                    .into(),
7358            ));
7359        }
7360        // Find a single BTree index on the parent and use its column.
7361        let pos = pick_pk_index_column(catalog, parent_table_str, is_self_ref, local_cols)
7362            .ok_or_else(|| {
7363                EngineError::Unsupported(alloc::format!(
7364                    "parent table {parent_table_str:?} has no PRIMARY-key / UNIQUE BTree index \
7365                     to default the FOREIGN KEY against"
7366                ))
7367            })?;
7368        alloc::vec![pos]
7369    } else {
7370        let mut out = Vec::with_capacity(fk.parent_columns.len());
7371        for name in &fk.parent_columns {
7372            let pos = parent_cols_for_lookup
7373                .iter()
7374                .position(|c| c.name == *name)
7375                .ok_or_else(|| {
7376                    EngineError::Unsupported(alloc::format!(
7377                        "FOREIGN KEY references unknown parent column \
7378                         {name:?} on table {parent_table_str:?}"
7379                    ))
7380                })?;
7381            out.push(pos);
7382        }
7383        out
7384    };
7385    if parent_columns.len() != local_columns.len() {
7386        return Err(EngineError::Unsupported(alloc::format!(
7387            "FOREIGN KEY arity mismatch: {} local columns vs {} parent columns",
7388            local_columns.len(),
7389            parent_columns.len()
7390        )));
7391    }
7392    // For non-self-referencing FKs, verify the parent column set is
7393    // covered by a BTree index. SPG doesn't have a `PRIMARY KEY`
7394    // declaration; the convention is "the parent column for FK
7395    // purposes must have a BTree index" — which the user creates via
7396    // `CREATE INDEX ... USING btree (col)` (the default). We accept
7397    // any single-column BTree index that covers a parent column;
7398    // composite parent column lists require an index whose `column_position`
7399    // matches the first parent column (multi-column BTree indices
7400    // are not in the v7.x roadmap).
7401    if !is_self_ref {
7402        let parent_table = catalog
7403            .get(&fk.parent_table)
7404            .expect("checked above");
7405        let primary_parent_col = parent_columns[0];
7406        let has_btree = parent_table.schema().columns.get(primary_parent_col).is_some()
7407            && parent_table
7408                .indices()
7409                .iter()
7410                .any(|idx| {
7411                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7412                        && idx.column_position == primary_parent_col
7413                        && idx.partial_predicate.is_none()
7414                });
7415        if !has_btree {
7416            return Err(EngineError::Unsupported(alloc::format!(
7417                "FOREIGN KEY parent column on {:?} is not covered by an unconditional BTree \
7418                 index — create one with `CREATE INDEX ... ON {} ({})` first",
7419                parent_table_str,
7420                parent_table_str,
7421                parent_table.schema().columns[primary_parent_col].name,
7422            )));
7423        }
7424    }
7425    let on_delete = fk_action_sql_to_storage(fk.on_delete);
7426    let on_update = fk_action_sql_to_storage(fk.on_update);
7427    Ok(spg_storage::ForeignKeyConstraint {
7428        name: fk.name,
7429        local_columns,
7430        parent_table: fk.parent_table,
7431        parent_columns,
7432        on_delete,
7433        on_update,
7434    })
7435}
7436
7437/// v7.6.1 — pick a sentinel "primary key" column from the parent
7438/// table when the FK didn't name parent columns. Picks the first
7439/// single-column unconditional BTree index — that's the closest
7440/// thing SPG has to a PRIMARY KEY today. Self-referencing FKs use
7441/// `local_cols` as the column source.
7442fn pick_pk_index_column(
7443    catalog: &Catalog,
7444    parent_name: &str,
7445    is_self_ref: bool,
7446    local_cols: &[ColumnSchema],
7447) -> Option<usize> {
7448    if is_self_ref {
7449        // Self-ref FK omitted parent columns: pick column 0 by
7450        // convention (no catalog entry yet). Engine will widen this
7451        // when v7.6.7 lands; v7.6.1 only handles the explicit form.
7452        let _ = local_cols;
7453        return Some(0);
7454    }
7455    let parent = catalog.get(parent_name)?;
7456    parent.indices().iter().find_map(|idx| {
7457        if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7458            && idx.partial_predicate.is_none()
7459            && idx.included_columns.is_empty()
7460            && idx.expression.is_none()
7461        {
7462            Some(idx.column_position)
7463        } else {
7464            None
7465        }
7466    })
7467}
7468
7469/// v7.9.8 / v7.9.10 — resolve the column positions that
7470/// identify a conflict for ON CONFLICT. Returns a Vec of
7471/// column positions (1 element for single-column form, N for
7472/// composite). When the user wrote bare `ON CONFLICT DO …`,
7473/// falls back to the table's first unconditional BTree index
7474/// (always single-column today).
7475fn resolve_on_conflict_columns(
7476    catalog: &Catalog,
7477    table_name: &str,
7478    target: &[String],
7479) -> Result<Vec<usize>, EngineError> {
7480    let table = catalog.get(table_name).ok_or_else(|| {
7481        EngineError::Storage(StorageError::TableNotFound {
7482            name: table_name.into(),
7483        })
7484    })?;
7485    if target.is_empty() {
7486        let pos = table
7487            .indices()
7488            .iter()
7489            .find_map(|idx| {
7490                if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7491                    && idx.partial_predicate.is_none()
7492                    && idx.included_columns.is_empty()
7493                    && idx.expression.is_none()
7494                {
7495                    Some(idx.column_position)
7496                } else {
7497                    None
7498                }
7499            })
7500            .ok_or_else(|| {
7501                EngineError::Unsupported(alloc::format!(
7502                    "ON CONFLICT without target requires a UNIQUE BTree index on {table_name:?}"
7503                ))
7504            })?;
7505        return Ok(alloc::vec![pos]);
7506    }
7507    let mut out = Vec::with_capacity(target.len());
7508    for name in target {
7509        let pos = table
7510            .schema()
7511            .columns
7512            .iter()
7513            .position(|c| c.name == *name)
7514            .ok_or_else(|| {
7515                EngineError::Unsupported(alloc::format!(
7516                    "ON CONFLICT target column {name:?} not found on {table_name:?}"
7517                ))
7518            })?;
7519        out.push(pos);
7520    }
7521    Ok(out)
7522}
7523
7524/// v7.9.8 — check whether the BTree index on `column_pos` of
7525/// `table_name` already has a row with this key.
7526fn on_conflict_key_exists(
7527    catalog: &Catalog,
7528    table_name: &str,
7529    column_pos: usize,
7530    key: &Value,
7531) -> bool {
7532    let Some(table) = catalog.get(table_name) else {
7533        return false;
7534    };
7535    let Some(idx_key) = spg_storage::IndexKey::from_value(key) else {
7536        return false;
7537    };
7538    table.indices().iter().any(|idx| {
7539        matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7540            && idx.column_position == column_pos
7541            && idx.partial_predicate.is_none()
7542            && !idx.lookup_eq(&idx_key).is_empty()
7543    })
7544}
7545
7546/// v7.9.9 / v7.9.10 — look up an existing row's position by
7547/// matching all `column_positions` against the incoming `key`
7548/// tuple. Single-column shape (one column) reduces to the
7549/// canonical PK lookup; composite shapes scan linearly until
7550/// every position matches.
7551fn lookup_row_position_by_keys(
7552    catalog: &Catalog,
7553    table_name: &str,
7554    column_positions: &[usize],
7555    key: &[&Value],
7556) -> Option<usize> {
7557    let table = catalog.get(table_name)?;
7558    table.rows().iter().position(|r| {
7559        column_positions
7560            .iter()
7561            .enumerate()
7562            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7563    })
7564}
7565
7566/// v7.9.10 — does the table already contain a row whose
7567/// `column_positions` tuple equals `key`? Single-column shape
7568/// uses the existing BTree fast path; composite shapes fall
7569/// back to a row scan.
7570fn on_conflict_keys_exist(
7571    catalog: &Catalog,
7572    table_name: &str,
7573    column_positions: &[usize],
7574    key: &[&Value],
7575) -> bool {
7576    if column_positions.len() == 1 {
7577        return on_conflict_key_exists(
7578            catalog,
7579            table_name,
7580            column_positions[0],
7581            key[0],
7582        );
7583    }
7584    let Some(table) = catalog.get(table_name) else {
7585        return false;
7586    };
7587    table.rows().iter().any(|r| {
7588        column_positions
7589            .iter()
7590            .enumerate()
7591            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7592    })
7593}
7594
7595/// v7.9.9 — apply ON CONFLICT DO UPDATE SET assignments to an
7596/// existing row.
7597///
7598/// `incoming` is the rejected INSERT row (used to resolve
7599/// `EXCLUDED.col` references in the assignment exprs);
7600/// `target_pos` is the position of the existing row in the table.
7601/// Each assignment substitutes `EXCLUDED.col` with the matching
7602/// incoming value, evaluates the resulting expression against
7603/// the existing row, and writes the new value into the
7604/// corresponding column of the returned `Vec<Value>`. If
7605/// `where_` evaluates falsy, returns Ok(None) — PG behaviour:
7606/// the conflicting row is silently kept unchanged.
7607fn apply_on_conflict_assignments(
7608    catalog: &Catalog,
7609    table_name: &str,
7610    target_pos: usize,
7611    incoming: &[Value],
7612    assignments: &[(String, Expr)],
7613    where_: Option<&Expr>,
7614) -> Result<Option<Vec<Value>>, EngineError> {
7615    let table = catalog.get(table_name).ok_or_else(|| {
7616        EngineError::Storage(StorageError::TableNotFound {
7617            name: table_name.into(),
7618        })
7619    })?;
7620    let schema_cols = table.schema().columns.clone();
7621    let existing = table
7622        .rows()
7623        .get(target_pos)
7624        .ok_or_else(|| {
7625            EngineError::Unsupported(alloc::format!(
7626                "ON CONFLICT DO UPDATE: row position {target_pos} out of bounds on {table_name:?}"
7627            ))
7628        })?
7629        .clone();
7630    let ctx = eval::EvalContext::new(&schema_cols, Some(table_name));
7631    // Optional WHERE filter on the conflict row.
7632    if let Some(w) = where_ {
7633        let pred = w.clone();
7634        let pred = substitute_excluded_refs(pred, &schema_cols, incoming);
7635        let v = eval::eval_expr(&pred, &existing, &ctx)?;
7636        if !matches!(v, Value::Bool(true)) {
7637            return Ok(None);
7638        }
7639    }
7640    let mut new_values = existing.values.clone();
7641    for (col_name, expr) in assignments {
7642        let target_idx = schema_cols
7643            .iter()
7644            .position(|c| c.name == *col_name)
7645            .ok_or_else(|| {
7646                EngineError::Eval(EvalError::ColumnNotFound {
7647                    name: col_name.clone(),
7648                })
7649            })?;
7650        let sub = substitute_excluded_refs(expr.clone(), &schema_cols, incoming);
7651        let v = eval::eval_expr(&sub, &existing, &ctx)?;
7652        new_values[target_idx] =
7653            coerce_value(v, schema_cols[target_idx].ty, col_name, target_idx)?;
7654    }
7655    Ok(Some(new_values))
7656}
7657
7658/// v7.9.9 — walk an `Expr` tree replacing any `Column { qualifier:
7659/// "EXCLUDED", name }` reference with a `Literal` of the matching
7660/// value from the incoming-row vec. Resolution against the
7661/// child-table column list (by name).
7662fn substitute_excluded_refs(
7663    expr: Expr,
7664    schema_cols: &[ColumnSchema],
7665    incoming: &[Value],
7666) -> Expr {
7667    use spg_sql::ast::ColumnName;
7668    match expr {
7669        Expr::Column(ColumnName { qualifier, name })
7670            if qualifier
7671                .as_deref()
7672                .is_some_and(|q| q.eq_ignore_ascii_case("excluded")) =>
7673        {
7674            let pos = schema_cols.iter().position(|c| c.name == name);
7675            match pos {
7676                Some(p) => {
7677                    let v = incoming.get(p).cloned().unwrap_or(Value::Null);
7678                    value_to_literal_expr(v).unwrap_or_else(|_| {
7679                        Expr::Literal(spg_sql::ast::Literal::Null)
7680                    })
7681                }
7682                None => Expr::Column(ColumnName { qualifier, name }),
7683            }
7684        }
7685        Expr::Binary { op, lhs, rhs } => Expr::Binary {
7686            op,
7687            lhs: Box::new(substitute_excluded_refs(*lhs, schema_cols, incoming)),
7688            rhs: Box::new(substitute_excluded_refs(*rhs, schema_cols, incoming)),
7689        },
7690        Expr::Unary { op, expr } => Expr::Unary {
7691            op,
7692            expr: Box::new(substitute_excluded_refs(*expr, schema_cols, incoming)),
7693        },
7694        Expr::FunctionCall { name, args } => Expr::FunctionCall {
7695            name,
7696            args: args
7697                .into_iter()
7698                .map(|a| substitute_excluded_refs(a, schema_cols, incoming))
7699                .collect(),
7700        },
7701        other => other,
7702    }
7703}
7704
7705/// v7.6.2 / v7.6.7 — INSERT-side FK enforcement. For every row
7706/// about to be inserted into `child_table`, every FK declared on
7707/// that table is checked: the row's FK columns must either be
7708/// NULL (SQL spec skip) or match an existing parent row via the
7709/// parent's BTree PK / UNIQUE index.
7710///
7711/// Returns `EngineError::Unsupported` with a `FOREIGN KEY violation`
7712/// payload on first failure.
7713///
7714/// **Self-referencing FKs (v7.6.7 widening):** when `fk.parent_table
7715/// == child_table`, the parent rows visible to this check are
7716///  (a) rows already committed to the table, plus
7717///  (b) earlier rows from the *same* `rows` batch.
7718/// This makes `INSERT INTO tree VALUES (1, NULL), (2, 1), (3, 2)`
7719/// work in a single statement — common pattern for bulk-loading
7720/// hierarchies.
7721fn enforce_fk_inserts(
7722    catalog: &Catalog,
7723    child_table: &str,
7724    fks: &[spg_storage::ForeignKeyConstraint],
7725    rows: &[Vec<Value>],
7726) -> Result<(), EngineError> {
7727    for fk in fks {
7728        let parent_is_self = fk.parent_table == child_table;
7729        let parent = if parent_is_self {
7730            // Self-ref: read the current state of the same table.
7731            // The mut borrow on child has been dropped by the caller.
7732            catalog.get(child_table).ok_or_else(|| {
7733                EngineError::Storage(StorageError::TableNotFound {
7734                    name: child_table.into(),
7735                })
7736            })?
7737        } else {
7738            catalog.get(&fk.parent_table).ok_or_else(|| {
7739                EngineError::Storage(StorageError::TableNotFound {
7740                    name: fk.parent_table.clone(),
7741                })
7742            })?
7743        };
7744        for (batch_idx, row_values) in rows.iter().enumerate() {
7745            // Single-column FK fast path: try the parent's BTree
7746            // index for an O(log n) lookup. Composite FKs fall back
7747            // to a parent-row scan.
7748            if fk.local_columns.len() == 1 {
7749                let v = &row_values[fk.local_columns[0]];
7750                if matches!(v, Value::Null) {
7751                    continue;
7752                }
7753                let parent_col = fk.parent_columns[0];
7754                let key = spg_storage::IndexKey::from_value(v).ok_or_else(|| {
7755                    EngineError::Unsupported(alloc::format!(
7756                        "FOREIGN KEY column value of type {:?} is not index-eligible",
7757                        v.data_type()
7758                    ))
7759                })?;
7760                let present_committed = parent.indices().iter().any(|idx| {
7761                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7762                        && idx.column_position == parent_col
7763                        && idx.partial_predicate.is_none()
7764                        && !idx.lookup_eq(&key).is_empty()
7765                });
7766                // v7.6.7 self-ref widening: also accept a match
7767                // against earlier rows in this same batch when the
7768                // FK points at the table being inserted into.
7769                let present_in_batch = parent_is_self
7770                    && rows[..batch_idx].iter().any(|earlier| {
7771                        earlier.get(parent_col) == Some(v)
7772                    });
7773                if !(present_committed || present_in_batch) {
7774                    return Err(EngineError::Unsupported(alloc::format!(
7775                        "FOREIGN KEY violation: no parent row in {:?} where {} = {:?}",
7776                        fk.parent_table,
7777                        parent
7778                            .schema()
7779                            .columns
7780                            .get(parent_col)
7781                            .map_or("?", |c| c.name.as_str()),
7782                        v,
7783                    )));
7784                }
7785            } else {
7786                // Composite FK: scan parent rows. v7.6.7 also
7787                // accepts a match against earlier rows in the same
7788                // batch (self-ref bulk-loading of hierarchies).
7789                if fk.local_columns
7790                    .iter()
7791                    .all(|&i| matches!(row_values.get(i), Some(Value::Null)))
7792                {
7793                    continue;
7794                }
7795                let local: Vec<&Value> = fk.local_columns.iter().map(|&i| &row_values[i]).collect();
7796                let parent_match_committed = parent.rows().iter().any(|prow| {
7797                    fk.parent_columns
7798                        .iter()
7799                        .enumerate()
7800                        .all(|(i, &pi)| prow.values.get(pi) == Some(local[i]))
7801                });
7802                let parent_match_in_batch = parent_is_self
7803                    && rows[..batch_idx].iter().any(|earlier| {
7804                        fk.parent_columns
7805                            .iter()
7806                            .enumerate()
7807                            .all(|(i, &pi)| earlier.get(pi) == Some(local[i]))
7808                    });
7809                if !(parent_match_committed || parent_match_in_batch) {
7810                    return Err(EngineError::Unsupported(alloc::format!(
7811                        "FOREIGN KEY violation: no parent row in {:?} matching composite key",
7812                        fk.parent_table,
7813                    )));
7814                }
7815            }
7816        }
7817    }
7818    Ok(())
7819}
7820
7821/// v7.6.4 / v7.6.5 — one step of the FK action plan computed for a
7822/// DELETE on a parent. The plan is a list of these steps, stacked
7823/// across the FK graph by `plan_fk_parent_deletions`.
7824#[derive(Debug, Clone)]
7825struct FkChildStep {
7826    child_table: String,
7827    action: FkChildAction,
7828}
7829
7830#[derive(Debug, Clone)]
7831enum FkChildAction {
7832    /// CASCADE — remove these rows. Sorted, deduplicated positions.
7833    Delete { positions: Vec<usize> },
7834    /// SET NULL — for each (row, column) in the flat list, write
7835    /// NULL into that child cell. Multiple FKs on the same row may
7836    /// produce overlapping entries (deduped at plan time).
7837    SetNull {
7838        positions: Vec<usize>,
7839        columns: Vec<usize>,
7840    },
7841    /// SET DEFAULT — same shape as SetNull but writes the column's
7842    /// declared DEFAULT value (resolved at plan time). Columns
7843    /// without a DEFAULT raise an error during planning.
7844    SetDefault {
7845        positions: Vec<usize>,
7846        columns: Vec<usize>,
7847        defaults: Vec<Value>,
7848    },
7849}
7850
7851/// v7.6.3 → v7.6.5 — plan FK fallout for a DELETE on a parent table.
7852///
7853/// Walks every table in the catalog looking for FKs whose
7854/// `parent_table` is `parent_table_name`. For each such FK + each
7855/// to-be-deleted parent row:
7856///
7857///   - RESTRICT / NoAction → error, no plan returned
7858///   - CASCADE → child rows get scheduled for deletion; recursive
7859///   - SetNull → child FK column(s) scheduled to be NULL-ed.
7860///     Verified NULL-able at plan time.
7861///   - SetDefault → child FK column(s) scheduled to be reset to
7862///     their declared DEFAULT. Columns without a DEFAULT raise.
7863///
7864/// SET NULL / SET DEFAULT do NOT cascade further — the child row
7865/// stays; only one of its columns mutates.
7866fn plan_fk_parent_deletions(
7867    catalog: &Catalog,
7868    parent_table_name: &str,
7869    to_delete_positions: &[usize],
7870    to_delete_rows: &[Vec<Value>],
7871) -> Result<Vec<FkChildStep>, EngineError> {
7872    use alloc::collections::{BTreeMap, BTreeSet};
7873    if to_delete_rows.is_empty() {
7874        return Ok(Vec::new());
7875    }
7876    let mut delete_plan: BTreeMap<String, BTreeSet<usize>> = BTreeMap::new();
7877    // setnull / setdefault keyed by child_table → (row_idx, col_idx) → optional default
7878    let mut setnull_plan: BTreeMap<String, BTreeSet<(usize, usize)>> = BTreeMap::new();
7879    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
7880        BTreeMap::new();
7881    let mut visited: BTreeSet<(String, usize)> = BTreeSet::new();
7882    for &p in to_delete_positions {
7883        visited.insert((parent_table_name.to_string(), p));
7884    }
7885    let mut work: Vec<(String, Vec<Value>)> = to_delete_rows
7886        .iter()
7887        .map(|r| (parent_table_name.to_string(), r.clone()))
7888        .collect();
7889    while let Some((cur_parent, parent_row)) = work.pop() {
7890        for child_name in catalog.table_names() {
7891            let child = catalog
7892                .get(&child_name)
7893                .expect("table_names → catalog.get round-trip is total");
7894            for fk in &child.schema().foreign_keys {
7895                if fk.parent_table != cur_parent {
7896                    continue;
7897                }
7898                let parent_key: Vec<&Value> = fk
7899                    .parent_columns
7900                    .iter()
7901                    .map(|&pi| &parent_row[pi])
7902                    .collect();
7903                if parent_key.iter().any(|v| matches!(v, Value::Null)) {
7904                    continue;
7905                }
7906                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
7907                    if child_name == cur_parent
7908                        && visited.contains(&(child_name.clone(), child_row_idx))
7909                    {
7910                        continue;
7911                    }
7912                    let matches_key = fk
7913                        .local_columns
7914                        .iter()
7915                        .enumerate()
7916                        .all(|(i, &li)| child_row.values.get(li) == Some(parent_key[i]));
7917                    if !matches_key {
7918                        continue;
7919                    }
7920                    match fk.on_delete {
7921                        spg_storage::FkAction::Restrict
7922                        | spg_storage::FkAction::NoAction => {
7923                            return Err(EngineError::Unsupported(alloc::format!(
7924                                "FOREIGN KEY violation: DELETE on {cur_parent:?} is \
7925                                 restricted by FK from {child_name:?}.{:?}",
7926                                fk.local_columns,
7927                            )));
7928                        }
7929                        spg_storage::FkAction::Cascade => {
7930                            if visited.insert((child_name.clone(), child_row_idx)) {
7931                                delete_plan
7932                                    .entry(child_name.clone())
7933                                    .or_default()
7934                                    .insert(child_row_idx);
7935                                work.push((child_name.clone(), child_row.values.clone()));
7936                            }
7937                        }
7938                        spg_storage::FkAction::SetNull => {
7939                            // Verify every local FK column is NULL-able.
7940                            for &li in &fk.local_columns {
7941                                let col = child.schema().columns.get(li).ok_or_else(|| {
7942                                    EngineError::Unsupported(alloc::format!(
7943                                        "FK local column {li} missing in {child_name:?}"
7944                                    ))
7945                                })?;
7946                                if !col.nullable {
7947                                    return Err(EngineError::Unsupported(alloc::format!(
7948                                        "FOREIGN KEY ON DELETE SET NULL: column \
7949                                         {child_name:?}.{:?} is NOT NULL — cannot SET NULL",
7950                                        col.name,
7951                                    )));
7952                                }
7953                            }
7954                            let entry = setnull_plan.entry(child_name.clone()).or_default();
7955                            for &li in &fk.local_columns {
7956                                entry.insert((child_row_idx, li));
7957                            }
7958                        }
7959                        spg_storage::FkAction::SetDefault => {
7960                            // Resolve the DEFAULT for every local FK col.
7961                            let entry =
7962                                setdefault_plan.entry(child_name.clone()).or_default();
7963                            for &li in &fk.local_columns {
7964                                let col = child.schema().columns.get(li).ok_or_else(|| {
7965                                    EngineError::Unsupported(alloc::format!(
7966                                        "FK local column {li} missing in {child_name:?}"
7967                                    ))
7968                                })?;
7969                                let default = col.default.clone().ok_or_else(|| {
7970                                    EngineError::Unsupported(alloc::format!(
7971                                        "FOREIGN KEY ON DELETE SET DEFAULT: column \
7972                                         {child_name:?}.{:?} has no DEFAULT declared",
7973                                        col.name,
7974                                    ))
7975                                })?;
7976                                entry.insert((child_row_idx, li), default);
7977                            }
7978                        }
7979                    }
7980                }
7981            }
7982        }
7983    }
7984    // Flatten the three plans into the ordered `FkChildStep` list.
7985    // Deletes are applied last per child (after any null/default
7986    // re-writes on the same child) so a child row that's both
7987    // re-written and then cascade-deleted only ends up deleted —
7988    // but in v7.6.5 SetNull/Cascade never overlap on the same row
7989    // (a single FK chooses exactly one action), so the order is
7990    // mostly a precaution.
7991    let mut steps: Vec<FkChildStep> = Vec::new();
7992    for (child_table, entries) in setnull_plan {
7993        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
7994        steps.push(FkChildStep {
7995            child_table,
7996            action: FkChildAction::SetNull { positions, columns },
7997        });
7998    }
7999    for (child_table, entries) in setdefault_plan {
8000        let mut positions = Vec::with_capacity(entries.len());
8001        let mut columns = Vec::with_capacity(entries.len());
8002        let mut defaults = Vec::with_capacity(entries.len());
8003        for ((p, c), v) in entries {
8004            positions.push(p);
8005            columns.push(c);
8006            defaults.push(v);
8007        }
8008        steps.push(FkChildStep {
8009            child_table,
8010            action: FkChildAction::SetDefault {
8011                positions,
8012                columns,
8013                defaults,
8014            },
8015        });
8016    }
8017    for (child_table, positions) in delete_plan {
8018        steps.push(FkChildStep {
8019            child_table,
8020            action: FkChildAction::Delete {
8021                positions: positions.into_iter().collect(),
8022            },
8023        });
8024    }
8025    Ok(steps)
8026}
8027
8028/// v7.6.6 — plan FK fallout for an UPDATE that mutates parent-side
8029/// PK/UNIQUE columns. Walks every other table whose FK references
8030/// `parent_table_name`; for each FK whose parent_columns overlap a
8031/// mutated column, decides the action by `fk.on_update`.
8032///
8033///   - RESTRICT / NoAction → error if any child references the OLD
8034///     value
8035///   - CASCADE → child FK columns get rewritten to the NEW parent
8036///     value (a SetNull-style update step with the new value)
8037///   - SetNull → child FK columns set to NULL
8038///   - SetDefault → child FK columns set to declared default
8039///
8040/// `plan_with_old` is `(row_position, old_values, new_values)` so
8041/// the planner can detect "did this row's parent key actually
8042/// change?" — only rows where at least one referenced parent
8043/// column moved trigger inbound work.
8044fn plan_fk_parent_updates(
8045    catalog: &Catalog,
8046    parent_table_name: &str,
8047    plan_with_old: &[(usize, Vec<Value>, Vec<Value>)],
8048) -> Result<Vec<FkChildStep>, EngineError> {
8049    use alloc::collections::BTreeMap;
8050    if plan_with_old.is_empty() {
8051        return Ok(Vec::new());
8052    }
8053    // For each child table we may touch, build per-child step
8054    // lists. UPDATE never deletes children — `delete_plan` stays
8055    // empty here but is kept structurally aligned with
8056    // `plan_fk_parent_deletions` for future use.
8057    let delete_plan: BTreeMap<String, alloc::collections::BTreeSet<usize>> = BTreeMap::new();
8058    let mut setnull_plan: BTreeMap<
8059        String,
8060        alloc::collections::BTreeSet<(usize, usize)>,
8061    > = BTreeMap::new();
8062    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
8063        BTreeMap::new();
8064    // Cascade-update plan: child_table → row_idx → col_idx → new_value
8065    let mut cascade_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> = BTreeMap::new();
8066
8067    for child_name in catalog.table_names() {
8068        let child = catalog
8069            .get(&child_name)
8070            .expect("table_names → catalog.get total");
8071        for fk in &child.schema().foreign_keys {
8072            if fk.parent_table != parent_table_name {
8073                continue;
8074            }
8075            for (_pos, old_row, new_row) in plan_with_old {
8076                // Did any parent FK column change?
8077                let key_changed = fk
8078                    .parent_columns
8079                    .iter()
8080                    .any(|&pi| old_row.get(pi) != new_row.get(pi));
8081                if !key_changed {
8082                    continue;
8083                }
8084                // The OLD parent key — used to find referring children.
8085                let old_key: Vec<&Value> = fk
8086                    .parent_columns
8087                    .iter()
8088                    .map(|&pi| &old_row[pi])
8089                    .collect();
8090                if old_key.iter().any(|v| matches!(v, Value::Null)) {
8091                    // NULL parent has no children — skip.
8092                    continue;
8093                }
8094                let new_key: Vec<&Value> = fk
8095                    .parent_columns
8096                    .iter()
8097                    .map(|&pi| &new_row[pi])
8098                    .collect();
8099                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
8100                    // Self-ref same-row updates: a row updating its
8101                    // own PK doesn't restrict itself.
8102                    if child_name == parent_table_name
8103                        && plan_with_old
8104                            .iter()
8105                            .any(|(p, _, _)| *p == child_row_idx)
8106                    {
8107                        continue;
8108                    }
8109                    let matches_key = fk
8110                        .local_columns
8111                        .iter()
8112                        .enumerate()
8113                        .all(|(i, &li)| child_row.values.get(li) == Some(old_key[i]));
8114                    if !matches_key {
8115                        continue;
8116                    }
8117                    match fk.on_update {
8118                        spg_storage::FkAction::Restrict
8119                        | spg_storage::FkAction::NoAction => {
8120                            return Err(EngineError::Unsupported(alloc::format!(
8121                                "FOREIGN KEY violation: UPDATE on {parent_table_name:?} PK is \
8122                                 restricted by FK from {child_name:?}.{:?}",
8123                                fk.local_columns,
8124                            )));
8125                        }
8126                        spg_storage::FkAction::Cascade => {
8127                            // Rewrite child FK columns to new key.
8128                            let entry = cascade_plan.entry(child_name.clone()).or_default();
8129                            for (i, &li) in fk.local_columns.iter().enumerate() {
8130                                entry.insert((child_row_idx, li), new_key[i].clone());
8131                            }
8132                        }
8133                        spg_storage::FkAction::SetNull => {
8134                            for &li in &fk.local_columns {
8135                                let col = child.schema().columns.get(li).ok_or_else(|| {
8136                                    EngineError::Unsupported(alloc::format!(
8137                                        "FK local column {li} missing in {child_name:?}"
8138                                    ))
8139                                })?;
8140                                if !col.nullable {
8141                                    return Err(EngineError::Unsupported(alloc::format!(
8142                                        "FOREIGN KEY ON UPDATE SET NULL: column \
8143                                         {child_name:?}.{:?} is NOT NULL",
8144                                        col.name,
8145                                    )));
8146                                }
8147                            }
8148                            let entry = setnull_plan.entry(child_name.clone()).or_default();
8149                            for &li in &fk.local_columns {
8150                                entry.insert((child_row_idx, li));
8151                            }
8152                        }
8153                        spg_storage::FkAction::SetDefault => {
8154                            let entry =
8155                                setdefault_plan.entry(child_name.clone()).or_default();
8156                            for &li in &fk.local_columns {
8157                                let col = child.schema().columns.get(li).ok_or_else(|| {
8158                                    EngineError::Unsupported(alloc::format!(
8159                                        "FK local column {li} missing in {child_name:?}"
8160                                    ))
8161                                })?;
8162                                let default = col.default.clone().ok_or_else(|| {
8163                                    EngineError::Unsupported(alloc::format!(
8164                                        "FOREIGN KEY ON UPDATE SET DEFAULT: column \
8165                                         {child_name:?}.{:?} has no DEFAULT",
8166                                        col.name,
8167                                    ))
8168                                })?;
8169                                entry.insert((child_row_idx, li), default);
8170                            }
8171                        }
8172                    }
8173                }
8174            }
8175        }
8176    }
8177    // Flatten into FkChildStep list. UPDATE doesn't produce
8178    // DeleteSteps (CASCADE on UPDATE just rewrites FK values).
8179    let mut steps: Vec<FkChildStep> = Vec::new();
8180    for (child_table, entries) in cascade_plan {
8181        let mut positions = Vec::with_capacity(entries.len());
8182        let mut columns = Vec::with_capacity(entries.len());
8183        let mut defaults = Vec::with_capacity(entries.len());
8184        for ((p, c), v) in entries {
8185            positions.push(p);
8186            columns.push(c);
8187            defaults.push(v);
8188        }
8189        // We reuse `FkChildAction::SetDefault` for cascade-update:
8190        // both shapes are "write a known value into specific cells"
8191        // — `apply_per_cell_writes` doesn't care whether the value
8192        // came from a DEFAULT declaration or a new parent key.
8193        steps.push(FkChildStep {
8194            child_table,
8195            action: FkChildAction::SetDefault {
8196                positions,
8197                columns,
8198                defaults,
8199            },
8200        });
8201    }
8202    for (child_table, entries) in setnull_plan {
8203        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
8204        steps.push(FkChildStep {
8205            child_table,
8206            action: FkChildAction::SetNull { positions, columns },
8207        });
8208    }
8209    for (child_table, entries) in setdefault_plan {
8210        let mut positions = Vec::with_capacity(entries.len());
8211        let mut columns = Vec::with_capacity(entries.len());
8212        let mut defaults = Vec::with_capacity(entries.len());
8213        for ((p, c), v) in entries {
8214            positions.push(p);
8215            columns.push(c);
8216            defaults.push(v);
8217        }
8218        steps.push(FkChildStep {
8219            child_table,
8220            action: FkChildAction::SetDefault {
8221                positions,
8222                columns,
8223                defaults,
8224            },
8225        });
8226    }
8227    let _ = delete_plan; // UPDATE never deletes children.
8228    Ok(steps)
8229}
8230
8231/// v7.6.5 — apply one FK child step to the catalog. Encapsulates
8232/// the three action variants so the DELETE executor stays a
8233/// simple loop over the planned steps.
8234fn apply_fk_child_step(
8235    catalog: &mut Catalog,
8236    step: &FkChildStep,
8237) -> Result<(), EngineError> {
8238    let child = catalog.get_mut(&step.child_table).ok_or_else(|| {
8239        EngineError::Storage(StorageError::TableNotFound {
8240            name: step.child_table.clone(),
8241        })
8242    })?;
8243    match &step.action {
8244        FkChildAction::Delete { positions } => {
8245            let _ = child.delete_rows(positions);
8246        }
8247        FkChildAction::SetNull { positions, columns } => {
8248            apply_per_cell_writes(child, positions, columns, |_| Value::Null)?;
8249        }
8250        FkChildAction::SetDefault {
8251            positions,
8252            columns,
8253            defaults,
8254        } => {
8255            apply_per_cell_writes(child, positions, columns, |i| defaults[i].clone())?;
8256        }
8257    }
8258    Ok(())
8259}
8260
8261/// v7.6.5 — write new values into selected child cells via
8262/// `Table::update_row` (the catalog's existing UPDATE entry).
8263/// Groups writes by row position so multi-column updates on the
8264/// same row only call `update_row` once. `value_for(i)` produces
8265/// the new value for the i-th (position, column) entry.
8266fn apply_per_cell_writes(
8267    child: &mut spg_storage::Table,
8268    positions: &[usize],
8269    columns: &[usize],
8270    mut value_for: impl FnMut(usize) -> Value,
8271) -> Result<(), EngineError> {
8272    use alloc::collections::BTreeMap;
8273    let mut by_row: BTreeMap<usize, Vec<(usize, Value)>> = BTreeMap::new();
8274    for i in 0..positions.len() {
8275        by_row
8276            .entry(positions[i])
8277            .or_default()
8278            .push((columns[i], value_for(i)));
8279    }
8280    for (pos, mutations) in by_row {
8281        let mut new_values = child.rows()[pos].values.clone();
8282        for (col, v) in mutations {
8283            if let Some(slot) = new_values.get_mut(col) {
8284                *slot = v;
8285            }
8286        }
8287        child
8288            .update_row(pos, new_values)
8289            .map_err(EngineError::Storage)?;
8290    }
8291    Ok(())
8292}
8293
8294fn fk_action_sql_to_storage(a: spg_sql::ast::FkAction) -> spg_storage::FkAction {
8295    match a {
8296        spg_sql::ast::FkAction::Restrict => spg_storage::FkAction::Restrict,
8297        spg_sql::ast::FkAction::Cascade => spg_storage::FkAction::Cascade,
8298        spg_sql::ast::FkAction::SetNull => spg_storage::FkAction::SetNull,
8299        spg_sql::ast::FkAction::SetDefault => spg_storage::FkAction::SetDefault,
8300        spg_sql::ast::FkAction::NoAction => spg_storage::FkAction::NoAction,
8301    }
8302}
8303
8304fn column_def_to_schema(c: ColumnDef) -> Result<ColumnSchema, EngineError> {
8305    let ty = column_type_to_data_type(c.ty);
8306    let mut schema = ColumnSchema::new(c.name.clone(), ty, c.nullable);
8307    if let Some(default_expr) = c.default {
8308        // DEFAULT must be a literal expression — evaluated at CREATE TABLE
8309        // time against an empty row context. Any column ref / aggregate
8310        // surfaces as the corresponding eval error.
8311        let raw = literal_expr_to_value(default_expr)?;
8312        let coerced = coerce_value(raw, ty, &c.name, 0)?;
8313        schema = schema.with_default(coerced);
8314    }
8315    if c.auto_increment {
8316        // AUTO_INCREMENT only makes sense on integer-shaped columns.
8317        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
8318            return Err(EngineError::Unsupported(alloc::format!(
8319                "AUTO_INCREMENT requires an integer column type, got {ty:?}"
8320            )));
8321        }
8322        schema = schema.with_auto_increment();
8323    }
8324    Ok(schema)
8325}
8326
8327const fn column_type_to_data_type(t: ColumnTypeName) -> DataType {
8328    match t {
8329        ColumnTypeName::SmallInt => DataType::SmallInt,
8330        ColumnTypeName::Int => DataType::Int,
8331        ColumnTypeName::BigInt => DataType::BigInt,
8332        ColumnTypeName::Float => DataType::Float,
8333        ColumnTypeName::Text => DataType::Text,
8334        ColumnTypeName::Varchar(n) => DataType::Varchar(n),
8335        ColumnTypeName::Char(n) => DataType::Char(n),
8336        ColumnTypeName::Bool => DataType::Bool,
8337        ColumnTypeName::Vector { dim, encoding } => DataType::Vector {
8338            dim,
8339            encoding: match encoding {
8340                SqlVecEncoding::F32 => VecEncoding::F32,
8341                SqlVecEncoding::Sq8 => VecEncoding::Sq8,
8342                SqlVecEncoding::F16 => VecEncoding::F16,
8343            },
8344        },
8345        ColumnTypeName::Numeric(precision, scale) => DataType::Numeric { precision, scale },
8346        ColumnTypeName::Date => DataType::Date,
8347        ColumnTypeName::Timestamp => DataType::Timestamp,
8348        ColumnTypeName::Timestamptz => DataType::Timestamptz,
8349        ColumnTypeName::Json => DataType::Json,
8350        ColumnTypeName::Jsonb => DataType::Jsonb,
8351    }
8352}
8353
8354/// Convert an INSERT VALUES expression to a storage Value. Supports literal
8355/// expressions, unary-minus over numeric literals, and pgvector-style
8356/// `'[..]'::vector` cast (v1.2). Anything more complex returns `Unsupported`.
8357fn literal_expr_to_value(expr: Expr) -> Result<Value, EngineError> {
8358    match expr {
8359        Expr::Literal(l) => Ok(literal_to_value(l)),
8360        Expr::Cast { expr, target } => {
8361            let inner_value = literal_expr_to_value(*expr)?;
8362            crate::eval::cast_value(inner_value, target).map_err(EngineError::Eval)
8363        }
8364        Expr::Unary {
8365            op: UnOp::Neg,
8366            expr,
8367        } => match *expr {
8368            Expr::Literal(Literal::Integer(n)) => {
8369                // Fold to i32 if it fits, else BigInt. Parser emits Integer(i64)
8370                // — overflow on negate of i64::MIN is the one edge case.
8371                let neg = n.checked_neg().ok_or_else(|| {
8372                    EngineError::Unsupported("integer literal overflow on negation".into())
8373                })?;
8374                Ok(int_value_for(neg))
8375            }
8376            Expr::Literal(Literal::Float(x)) => Ok(Value::Float(-x)),
8377            other => Err(EngineError::Unsupported(alloc::format!(
8378                "unary minus over non-literal expression: {other:?}"
8379            ))),
8380        },
8381        other => Err(EngineError::Unsupported(alloc::format!(
8382            "non-literal INSERT value expression: {other:?}"
8383        ))),
8384    }
8385}
8386
8387fn literal_to_value(l: Literal) -> Value {
8388    match l {
8389        Literal::Integer(n) => int_value_for(n),
8390        Literal::Float(x) => Value::Float(x),
8391        Literal::String(s) => Value::Text(s),
8392        Literal::Bool(b) => Value::Bool(b),
8393        Literal::Null => Value::Null,
8394        Literal::Vector(v) => Value::Vector(v),
8395        Literal::Interval { months, micros, .. } => Value::Interval { months, micros },
8396    }
8397}
8398
8399/// Pick `Int` (`i32`) when the literal fits, else `BigInt`. `INT` vs `BIGINT`
8400/// columns will still enforce the right tag downstream — this is just the
8401/// default we synthesise from an unannotated integer literal.
8402fn int_value_for(n: i64) -> Value {
8403    if let Ok(small) = i32::try_from(n) {
8404        Value::Int(small)
8405    } else {
8406        Value::BigInt(n)
8407    }
8408}
8409
8410/// Widen / narrow `v` to fit `expected`. Numerics permit safe widening
8411/// (`Int → BigInt`, `Int/BigInt → Float`) and best-effort narrowing
8412/// (`BigInt → Int` succeeds only when the value fits in `i32`). Everything
8413/// else returns `TypeMismatch` carrying the column name for caller diagnostics.
8414/// `NULL` is always permitted; the nullability check happens later in storage.
8415#[allow(clippy::too_many_lines)]
8416fn coerce_value(
8417    v: Value,
8418    expected: DataType,
8419    col_name: &str,
8420    position: usize,
8421) -> Result<Value, EngineError> {
8422    if v.is_null() {
8423        return Ok(Value::Null);
8424    }
8425    let actual = v.data_type().expect("non-null");
8426    if actual == expected {
8427        return Ok(v);
8428    }
8429    let coerced =
8430        match (v, expected) {
8431            (Value::Int(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
8432            (Value::Int(n), DataType::Float) => Some(Value::Float(f64::from(n))),
8433            (Value::Int(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
8434            (Value::Int(n), DataType::Numeric { precision, scale }) => Some(numeric_from_integer(
8435                i128::from(n),
8436                precision,
8437                scale,
8438                col_name,
8439            )?),
8440            (Value::SmallInt(n), DataType::Int) => Some(Value::Int(i32::from(n))),
8441            (Value::SmallInt(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
8442            (Value::SmallInt(n), DataType::Float) => Some(Value::Float(f64::from(n))),
8443            (Value::SmallInt(n), DataType::Numeric { precision, scale }) => Some(
8444                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
8445            ),
8446            (Value::BigInt(n), DataType::Int) => i32::try_from(n).ok().map(Value::Int),
8447            (Value::BigInt(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
8448            #[allow(clippy::cast_precision_loss)]
8449            (Value::BigInt(n), DataType::Float) => Some(Value::Float(n as f64)),
8450            (Value::BigInt(n), DataType::Numeric { precision, scale }) => Some(
8451                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
8452            ),
8453            (Value::Float(x), DataType::Numeric { precision, scale }) => {
8454                Some(numeric_from_float(x, precision, scale, col_name)?)
8455            }
8456            // Text → DATE / TIMESTAMP: parse canonical text forms.
8457            (Value::Text(s), DataType::Date) => {
8458                let d = eval::parse_date_literal(&s).ok_or_else(|| {
8459                    EngineError::Eval(EvalError::TypeMismatch {
8460                        detail: alloc::format!(
8461                            "cannot parse {s:?} as DATE for column `{col_name}`"
8462                        ),
8463                    })
8464                })?;
8465                Some(Value::Date(d))
8466            }
8467            // v4.9: Text ↔ JSON coercion. No structural validation —
8468            // any text literal is accepted; the responsibility for
8469            // valid JSON lies with the producer.
8470            (Value::Text(s), DataType::Json | DataType::Jsonb) => Some(Value::Json(s)),
8471            (Value::Json(s), DataType::Text) => Some(Value::Text(s)),
8472            (Value::Text(s), DataType::Timestamp | DataType::Timestamptz) => {
8473                let t = eval::parse_timestamp_literal(&s).ok_or_else(|| {
8474                    EngineError::Eval(EvalError::TypeMismatch {
8475                        detail: alloc::format!(
8476                            "cannot parse {s:?} as TIMESTAMP for column `{col_name}`"
8477                        ),
8478                    })
8479                })?;
8480                Some(Value::Timestamp(t))
8481            }
8482            // DATE ↔ TIMESTAMP convertibility (DATE → midnight,
8483            // TIMESTAMP → day truncation).
8484            (Value::Date(d), DataType::Timestamp | DataType::Timestamptz) => {
8485                Some(Value::Timestamp(i64::from(d) * 86_400_000_000))
8486            }
8487            (Value::Timestamp(t), DataType::Date) => {
8488                let days = t.div_euclid(86_400_000_000);
8489                i32::try_from(days).ok().map(Value::Date)
8490            }
8491            (
8492                Value::Numeric {
8493                    scaled,
8494                    scale: src_scale,
8495                },
8496                DataType::Numeric { precision, scale },
8497            ) => Some(numeric_rescale(
8498                scaled, src_scale, precision, scale, col_name,
8499            )?),
8500            #[allow(clippy::cast_precision_loss)]
8501            (Value::Numeric { scaled, scale }, DataType::Float) => {
8502                let mut div = 1.0_f64;
8503                for _ in 0..scale {
8504                    div *= 10.0;
8505                }
8506                Some(Value::Float((scaled as f64) / div))
8507            }
8508            (Value::Numeric { scaled, scale }, DataType::Int) => {
8509                let truncated = numeric_truncate_to_integer(scaled, scale);
8510                i32::try_from(truncated).ok().map(Value::Int)
8511            }
8512            (Value::Numeric { scaled, scale }, DataType::BigInt) => {
8513                let truncated = numeric_truncate_to_integer(scaled, scale);
8514                i64::try_from(truncated).ok().map(Value::BigInt)
8515            }
8516            (Value::Numeric { scaled, scale }, DataType::SmallInt) => {
8517                let truncated = numeric_truncate_to_integer(scaled, scale);
8518                i16::try_from(truncated).ok().map(Value::SmallInt)
8519            }
8520            // VARCHAR(n) enforces an upper bound on character count.
8521            (Value::Text(s), DataType::Varchar(max)) => {
8522                if u32::try_from(s.chars().count()).unwrap_or(u32::MAX) <= max {
8523                    Some(Value::Text(s))
8524                } else {
8525                    return Err(EngineError::Unsupported(alloc::format!(
8526                        "value for VARCHAR({max}) column `{col_name}` exceeds length: \
8527                     {} chars",
8528                        s.chars().count()
8529                    )));
8530                }
8531            }
8532            // v6.0.1: f32 → SQ8 INSERT-time quantisation. Triggered
8533            // when the column declares `VECTOR(N) USING SQ8` and
8534            // the INSERT VALUES expression yields a raw f32 vector
8535            // (the normal pgvector-shape literal). Dim mismatch
8536            // falls through the `_ => None` arm and surfaces as
8537            // `TypeMismatch` with the expected SQ8 column type —
8538            // matching the F32 path's existing error.
8539            (
8540                Value::Vector(v),
8541                DataType::Vector {
8542                    dim,
8543                    encoding: VecEncoding::Sq8,
8544                },
8545            ) if v.len() == dim as usize => {
8546                Some(Value::Sq8Vector(spg_storage::quantize::quantize(&v)))
8547            }
8548            // v6.0.3: f32 → f16 INSERT-time conversion for HALF
8549            // columns. Bit-exact at the storage layer (modulo
8550            // half-precision rounding); no rerank pass needed at
8551            // search time.
8552            (
8553                Value::Vector(v),
8554                DataType::Vector {
8555                    dim,
8556                    encoding: VecEncoding::F16,
8557                },
8558            ) if v.len() == dim as usize => Some(Value::HalfVector(
8559                spg_storage::halfvec::HalfVector::from_f32_slice(&v),
8560            )),
8561            // CHAR(n) right-pads with U+0020 to exactly n chars; if the input
8562            // is already longer we reject (PG truncates trailing-space-only;
8563            // staying strict for v1).
8564            (Value::Text(s), DataType::Char(size)) => {
8565                let len = u32::try_from(s.chars().count()).unwrap_or(u32::MAX);
8566                if len > size {
8567                    return Err(EngineError::Unsupported(alloc::format!(
8568                        "value for CHAR({size}) column `{col_name}` exceeds length: \
8569                     {len} chars"
8570                    )));
8571                }
8572                let need = (size - len) as usize;
8573                let mut padded = s;
8574                padded.reserve(need);
8575                for _ in 0..need {
8576                    padded.push(' ');
8577                }
8578                Some(Value::Text(padded))
8579            }
8580            _ => None,
8581        };
8582    coerced.ok_or(EngineError::Storage(StorageError::TypeMismatch {
8583        column: col_name.into(),
8584        expected,
8585        actual,
8586        position,
8587    }))
8588}
8589
8590#[cfg(test)]
8591mod tests {
8592    use super::*;
8593    use alloc::vec;
8594
8595    fn unwrap_command_ok(r: &QueryResult) -> usize {
8596        match r {
8597            QueryResult::CommandOk { affected, .. } => *affected,
8598            QueryResult::Rows { .. } => panic!("expected CommandOk, got Rows"),
8599        }
8600    }
8601
8602    #[test]
8603    fn create_table_registers_schema() {
8604        let mut e = Engine::new();
8605        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT)")
8606            .unwrap();
8607        assert_eq!(e.catalog().table_count(), 1);
8608        let t = e.catalog().get("foo").unwrap();
8609        assert_eq!(t.schema().columns.len(), 2);
8610        assert_eq!(t.schema().columns[0].ty, DataType::Int);
8611        assert!(!t.schema().columns[0].nullable);
8612        assert_eq!(t.schema().columns[1].ty, DataType::Text);
8613    }
8614
8615    #[test]
8616    fn create_table_vector_default_is_f32_encoded() {
8617        let mut e = Engine::new();
8618        e.execute("CREATE TABLE t (v VECTOR(8))").unwrap();
8619        let t = e.catalog().get("t").unwrap();
8620        assert_eq!(
8621            t.schema().columns[0].ty,
8622            DataType::Vector {
8623                dim: 8,
8624                encoding: VecEncoding::F32,
8625            },
8626        );
8627    }
8628
8629    #[test]
8630    fn create_table_vector_using_sq8_succeeds() {
8631        // v6.0.1 step 3: the step-1 fence in `column_def_to_schema`
8632        // is lifted. CREATE TABLE persists an SQ8 column type in
8633        // the catalog; INSERT (next test) quantises raw f32 input.
8634        let mut e = Engine::new();
8635        e.execute("CREATE TABLE t (v VECTOR(8) USING SQ8)").unwrap();
8636        let t = e.catalog().get("t").unwrap();
8637        assert_eq!(
8638            t.schema().columns[0].ty,
8639            DataType::Vector {
8640                dim: 8,
8641                encoding: VecEncoding::Sq8,
8642            },
8643        );
8644    }
8645
8646    #[test]
8647    fn insert_into_sq8_column_quantises_f32_payload() {
8648        // v6.0.1 step 3: INSERT-time `coerce_value` rewrites a raw
8649        // `Value::Vector(Vec<f32>)` literal into the column's
8650        // quantised representation. The row that lands in the
8651        // catalog must therefore hold a `Value::Sq8Vector`, not the
8652        // original f32 buffer — that's the bit that delivers the
8653        // 4× compression target.
8654        let mut e = Engine::new();
8655        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
8656        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
8657            .unwrap();
8658        let t = e.catalog().get("t").unwrap();
8659        assert_eq!(t.rows().len(), 1);
8660        match &t.rows()[0].values[0] {
8661            Value::Sq8Vector(q) => {
8662                assert_eq!(q.bytes.len(), 4);
8663                // min/max are derived from the payload: min=0.0, max=1.0.
8664                assert!((q.min - 0.0).abs() < 1e-6);
8665                assert!((q.max - 1.0).abs() < 1e-6);
8666            }
8667            other => panic!("expected Sq8Vector cell, got {other:?}"),
8668        }
8669    }
8670
8671    #[test]
8672    fn create_table_vector_using_half_succeeds_and_insert_converts_to_f16() {
8673        // v6.0.3: CREATE TABLE accepts USING HALF; INSERT path
8674        // converts the incoming `Value::Vector(Vec<f32>)` cell
8675        // into `Value::HalfVector(HalfVector)` via the new
8676        // `coerce_value` arm. The dequantised round-trip is
8677        // bit-exact for f16-representable values, so 0.0 / 0.25
8678        // / 0.5 / 1.0 hit their grid points exactly.
8679        let mut e = Engine::new();
8680        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
8681            .unwrap();
8682        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
8683            .unwrap();
8684        let t = e.catalog().get("t").unwrap();
8685        assert_eq!(t.rows().len(), 1);
8686        match &t.rows()[0].values[0] {
8687            Value::HalfVector(h) => {
8688                assert_eq!(h.dim(), 4);
8689                let back = h.to_f32_vec();
8690                let expected = alloc::vec![0.0_f32, 0.25, 0.5, 1.0];
8691                for (g, e) in back.iter().zip(expected.iter()) {
8692                    assert!(
8693                        (g - e).abs() < 1e-6,
8694                        "{g} vs {e} should be exact on f16 grid"
8695                    );
8696                }
8697            }
8698            other => panic!("expected HalfVector cell, got {other:?}"),
8699        }
8700    }
8701
8702    #[test]
8703    fn alter_index_rebuild_in_place_succeeds() {
8704        // v6.0.4: bare REBUILD (no encoding switch) walks every
8705        // row again to rebuild the NSW graph. Verifies the engine
8706        // dispatch + storage helper plumbing without changing any
8707        // cell encoding.
8708        let mut e = Engine::new();
8709        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
8710            .unwrap();
8711        for i in 0..8_i32 {
8712            #[allow(clippy::cast_precision_loss)]
8713            let base = (i as f32) * 0.1;
8714            e.execute(&alloc::format!(
8715                "INSERT INTO t VALUES ({i}, [{base}, {b1}, {b2}])",
8716                b1 = base + 0.01,
8717                b2 = base + 0.02,
8718            ))
8719            .unwrap();
8720        }
8721        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
8722        e.execute("ALTER INDEX t_idx REBUILD").unwrap();
8723        // Schema encoding stays F32 (no encoding clause).
8724        assert_eq!(
8725            e.catalog().get("t").unwrap().schema().columns[1].ty,
8726            DataType::Vector {
8727                dim: 3,
8728                encoding: VecEncoding::F32,
8729            },
8730        );
8731    }
8732
8733    #[test]
8734    fn alter_index_rebuild_with_encoding_switches_cell_type() {
8735        // v6.0.4: REBUILD WITH (encoding = SQ8) recodes every
8736        // stored cell from F32 → SQ8 + rebuilds the graph atop the
8737        // new encoding. Post-rebuild, cells must be Sq8Vector and
8738        // the schema must report encoding = Sq8.
8739        let mut e = Engine::new();
8740        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(4) NOT NULL)")
8741            .unwrap();
8742        e.execute("INSERT INTO t VALUES (1, [0.0, 0.25, 0.5, 1.0])")
8743            .unwrap();
8744        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
8745        e.execute("ALTER INDEX t_idx REBUILD WITH (encoding = SQ8)")
8746            .unwrap();
8747        let t = e.catalog().get("t").unwrap();
8748        assert_eq!(
8749            t.schema().columns[1].ty,
8750            DataType::Vector {
8751                dim: 4,
8752                encoding: VecEncoding::Sq8,
8753            },
8754        );
8755        assert!(matches!(t.rows()[0].values[1], Value::Sq8Vector(_)));
8756    }
8757
8758    #[test]
8759    fn alter_index_rebuild_unknown_index_errors() {
8760        let mut e = Engine::new();
8761        let err = e.execute("ALTER INDEX nope REBUILD").unwrap_err();
8762        assert!(
8763            matches!(
8764                &err,
8765                EngineError::Storage(StorageError::IndexNotFound { name }) if name == "nope"
8766            ),
8767            "got: {err}"
8768        );
8769    }
8770
8771    #[test]
8772    fn alter_index_rebuild_on_btree_index_errors() {
8773        // REBUILD on a B-tree index has no semantic meaning in
8774        // v6.0.4 — rejected at the storage layer with `Unsupported`.
8775        let mut e = Engine::new();
8776        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
8777        e.execute("INSERT INTO t VALUES (1)").unwrap();
8778        e.execute("CREATE INDEX t_idx ON t (id)").unwrap();
8779        let err = e.execute("ALTER INDEX t_idx REBUILD").unwrap_err();
8780        assert!(
8781            matches!(&err, EngineError::Storage(StorageError::Unsupported(_))),
8782            "got: {err}"
8783        );
8784    }
8785
8786    #[test]
8787    fn prepared_insert_substitutes_placeholders() {
8788        // v6.1.1: prepare() parses once; execute_prepared() walks the
8789        // AST and replaces $1/$2 with the param Values BEFORE the
8790        // dispatch sees them. Same logical result as a simple-query
8791        // INSERT, but parse happens once per *statement*, not per
8792        // execution.
8793        let mut e = Engine::new();
8794        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT NOT NULL)")
8795            .unwrap();
8796        let stmt = e.prepare("INSERT INTO t VALUES ($1, $2)").unwrap();
8797        for (id, name) in [(1, "alice"), (2, "bob"), (3, "carol")] {
8798            e.execute_prepared(
8799                stmt.clone(),
8800                &[Value::Int(id), Value::Text(name.into())],
8801            )
8802            .unwrap();
8803        }
8804        // Read back via simple-query SELECT.
8805        let rows_result = e.execute("SELECT id, name FROM t").unwrap();
8806        let QueryResult::Rows { rows, .. } = rows_result else {
8807            panic!("expected Rows")
8808        };
8809        assert_eq!(rows.len(), 3);
8810    }
8811
8812    #[test]
8813    fn prepared_select_with_placeholder_filters_rows() {
8814        let mut e = Engine::new();
8815        e.execute("CREATE TABLE t (id INT NOT NULL, v INT NOT NULL)")
8816            .unwrap();
8817        for i in 0..10_i32 {
8818            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, {})", i * 7))
8819                .unwrap();
8820        }
8821        let stmt = e
8822            .prepare("SELECT id FROM t WHERE v = $1")
8823            .unwrap();
8824        let QueryResult::Rows { rows, .. } = e
8825            .execute_prepared(stmt, &[Value::Int(35)])
8826            .unwrap()
8827        else {
8828            panic!("expected Rows")
8829        };
8830        // v = 35 means i*7 = 35 → i = 5.
8831        assert_eq!(rows.len(), 1);
8832        assert_eq!(rows[0].values[0], Value::Int(5));
8833    }
8834
8835    #[test]
8836    fn prepared_too_few_params_errors() {
8837        let mut e = Engine::new();
8838        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
8839        let stmt = e.prepare("INSERT INTO t VALUES ($1)").unwrap();
8840        let err = e.execute_prepared(stmt, &[]).unwrap_err();
8841        assert!(
8842            matches!(
8843                &err,
8844                EngineError::Eval(EvalError::PlaceholderOutOfRange { n: 1, bound: 0 })
8845            ),
8846            "got: {err}"
8847        );
8848    }
8849
8850    #[test]
8851    fn insert_into_half_column_dim_mismatch_errors() {
8852        let mut e = Engine::new();
8853        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
8854            .unwrap();
8855        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
8856        assert!(matches!(
8857            &err,
8858            EngineError::Storage(StorageError::TypeMismatch { .. })
8859        ));
8860    }
8861
8862    #[test]
8863    fn insert_into_sq8_column_dim_mismatch_errors() {
8864        // Dim mismatch falls through the `coerce_value` Vector→Sq8
8865        // arm's guard and surfaces as `TypeMismatch` — the same
8866        // error the F32 path produces today, so client error
8867        // handling stays uniform across encodings.
8868        let mut e = Engine::new();
8869        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
8870        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
8871        assert!(
8872            matches!(
8873                &err,
8874                EngineError::Storage(StorageError::TypeMismatch { .. })
8875            ),
8876            "got: {err}",
8877        );
8878    }
8879
8880    #[test]
8881    fn create_table_duplicate_errors() {
8882        let mut e = Engine::new();
8883        e.execute("CREATE TABLE foo (a INT)").unwrap();
8884        let err = e.execute("CREATE TABLE foo (a INT)").unwrap_err();
8885        assert!(matches!(
8886            err,
8887            EngineError::Storage(StorageError::DuplicateTable { ref name }) if name == "foo"
8888        ));
8889    }
8890
8891    #[test]
8892    fn insert_into_unknown_table_errors() {
8893        let mut e = Engine::new();
8894        let err = e.execute("INSERT INTO ghost VALUES (1)").unwrap_err();
8895        assert!(matches!(
8896            err,
8897            EngineError::Storage(StorageError::TableNotFound { ref name }) if name == "ghost"
8898        ));
8899    }
8900
8901    #[test]
8902    fn insert_happy_path_reports_one_affected() {
8903        let mut e = Engine::new();
8904        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
8905        let r = e.execute("INSERT INTO foo VALUES (42)").unwrap();
8906        assert_eq!(unwrap_command_ok(&r), 1);
8907        assert_eq!(e.catalog().get("foo").unwrap().row_count(), 1);
8908    }
8909
8910    #[test]
8911    fn insert_arity_mismatch_propagates() {
8912        let mut e = Engine::new();
8913        e.execute("CREATE TABLE foo (a INT, b TEXT)").unwrap();
8914        let err = e.execute("INSERT INTO foo VALUES (1)").unwrap_err();
8915        assert!(matches!(
8916            err,
8917            EngineError::Storage(StorageError::ArityMismatch { .. })
8918        ));
8919    }
8920
8921    #[test]
8922    fn insert_negative_integer_via_unary_minus() {
8923        let mut e = Engine::new();
8924        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
8925        e.execute("INSERT INTO foo VALUES (-7)").unwrap();
8926        let rows = e.catalog().get("foo").unwrap().rows();
8927        assert_eq!(rows[0].values[0], Value::Int(-7));
8928    }
8929
8930    #[test]
8931    fn insert_non_literal_expr_unsupported() {
8932        let mut e = Engine::new();
8933        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
8934        let err = e.execute("INSERT INTO foo VALUES (1 + 2)").unwrap_err();
8935        assert!(matches!(err, EngineError::Unsupported(_)));
8936    }
8937
8938    #[test]
8939    fn select_star_returns_all_rows_in_insertion_order() {
8940        let mut e = Engine::new();
8941        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT NOT NULL)")
8942            .unwrap();
8943        e.execute("INSERT INTO foo VALUES (1, 'one')").unwrap();
8944        e.execute("INSERT INTO foo VALUES (2, 'two')").unwrap();
8945        e.execute("INSERT INTO foo VALUES (3, 'three')").unwrap();
8946
8947        let r = e.execute("SELECT * FROM foo").unwrap();
8948        let QueryResult::Rows { columns, rows } = r else {
8949            panic!("expected Rows")
8950        };
8951        assert_eq!(columns.len(), 2);
8952        assert_eq!(columns[0].name, "a");
8953        assert_eq!(rows.len(), 3);
8954        assert_eq!(
8955            rows[1].values,
8956            vec![Value::Int(2), Value::Text("two".into())]
8957        );
8958    }
8959
8960    #[test]
8961    fn select_star_on_empty_table_returns_zero_rows() {
8962        let mut e = Engine::new();
8963        e.execute("CREATE TABLE foo (a INT)").unwrap();
8964        let r = e.execute("SELECT * FROM foo").unwrap();
8965        match r {
8966            QueryResult::Rows { rows, .. } => assert!(rows.is_empty()),
8967            QueryResult::CommandOk { .. } => panic!("expected Rows"),
8968        }
8969    }
8970
8971    // --- v0.4: WHERE + projection ------------------------------------------
8972
8973    fn make_three_row_users(e: &mut Engine) {
8974        e.execute("CREATE TABLE users (id INT NOT NULL, name TEXT NOT NULL, score INT)")
8975            .unwrap();
8976        e.execute("INSERT INTO users VALUES (1, 'alice', 90)")
8977            .unwrap();
8978        e.execute("INSERT INTO users VALUES (2, 'bob', NULL)")
8979            .unwrap();
8980        e.execute("INSERT INTO users VALUES (3, 'cara', 70)")
8981            .unwrap();
8982    }
8983
8984    fn unwrap_rows(r: QueryResult) -> (Vec<ColumnSchema>, Vec<Row>) {
8985        match r {
8986            QueryResult::Rows { columns, rows } => (columns, rows),
8987            QueryResult::CommandOk { .. } => panic!("expected Rows"),
8988        }
8989    }
8990
8991    #[test]
8992    fn where_filter_passes_only_true_rows() {
8993        let mut e = Engine::new();
8994        make_three_row_users(&mut e);
8995        let r = e.execute("SELECT * FROM users WHERE id > 1").unwrap();
8996        let (_, rows) = unwrap_rows(r);
8997        assert_eq!(rows.len(), 2);
8998        assert_eq!(rows[0].values[0], Value::Int(2));
8999        assert_eq!(rows[1].values[0], Value::Int(3));
9000    }
9001
9002    #[test]
9003    fn where_with_null_result_filters_out_row() {
9004        let mut e = Engine::new();
9005        make_three_row_users(&mut e);
9006        // score is NULL for bob → score > 80 is NULL → row excluded
9007        let r = e.execute("SELECT * FROM users WHERE score > 80").unwrap();
9008        let (_, rows) = unwrap_rows(r);
9009        assert_eq!(rows.len(), 1);
9010        assert_eq!(rows[0].values[1], Value::Text("alice".into()));
9011    }
9012
9013    #[test]
9014    fn projection_named_columns() {
9015        let mut e = Engine::new();
9016        make_three_row_users(&mut e);
9017        let r = e.execute("SELECT name, score FROM users").unwrap();
9018        let (cols, rows) = unwrap_rows(r);
9019        assert_eq!(cols.len(), 2);
9020        assert_eq!(cols[0].name, "name");
9021        assert_eq!(cols[1].name, "score");
9022        assert_eq!(rows.len(), 3);
9023        assert_eq!(
9024            rows[0].values,
9025            vec![Value::Text("alice".into()), Value::Int(90)]
9026        );
9027    }
9028
9029    #[test]
9030    fn projection_with_column_alias() {
9031        let mut e = Engine::new();
9032        make_three_row_users(&mut e);
9033        let r = e
9034            .execute("SELECT name AS who FROM users WHERE id = 1")
9035            .unwrap();
9036        let (cols, rows) = unwrap_rows(r);
9037        assert_eq!(cols[0].name, "who");
9038        assert_eq!(rows.len(), 1);
9039        assert_eq!(rows[0].values[0], Value::Text("alice".into()));
9040    }
9041
9042    #[test]
9043    fn qualified_column_with_table_alias_resolves() {
9044        let mut e = Engine::new();
9045        make_three_row_users(&mut e);
9046        let r = e
9047            .execute("SELECT u.id, u.name FROM users AS u WHERE u.id < 3")
9048            .unwrap();
9049        let (cols, rows) = unwrap_rows(r);
9050        assert_eq!(cols.len(), 2);
9051        assert_eq!(rows.len(), 2);
9052    }
9053
9054    #[test]
9055    fn qualified_column_with_wrong_alias_errors() {
9056        let mut e = Engine::new();
9057        make_three_row_users(&mut e);
9058        let err = e.execute("SELECT x.id FROM users AS u").unwrap_err();
9059        assert!(matches!(
9060            err,
9061            EngineError::Eval(EvalError::UnknownQualifier { ref qualifier }) if qualifier == "x"
9062        ));
9063    }
9064
9065    #[test]
9066    fn select_unknown_column_errors_in_projection() {
9067        let mut e = Engine::new();
9068        make_three_row_users(&mut e);
9069        let err = e.execute("SELECT ghost FROM users").unwrap_err();
9070        assert!(matches!(
9071            err,
9072            EngineError::Eval(EvalError::ColumnNotFound { ref name }) if name == "ghost"
9073        ));
9074    }
9075
9076    #[test]
9077    fn where_unknown_column_errors() {
9078        let mut e = Engine::new();
9079        make_three_row_users(&mut e);
9080        let err = e
9081            .execute("SELECT * FROM users WHERE ghost = 1")
9082            .unwrap_err();
9083        assert!(matches!(
9084            err,
9085            EngineError::Eval(EvalError::ColumnNotFound { .. })
9086        ));
9087    }
9088
9089    #[test]
9090    fn expression_projection_evaluates_and_renders() {
9091        // Compound expressions in the SELECT list are evaluated per row;
9092        // the output column is typed TEXT, name defaults to the expression.
9093        let mut e = Engine::new();
9094        e.execute("CREATE TABLE t (a INT NOT NULL)").unwrap();
9095        e.execute("INSERT INTO t VALUES (3)").unwrap();
9096        let (_, rows) = unwrap_rows(e.execute("SELECT 1 + 2 FROM t").unwrap());
9097        assert_eq!(rows.len(), 1);
9098        // The expression evaluates to integer 3; rendered as the cell value
9099        // (storage::Value::Int(3) since arithmetic kept ints).
9100        assert_eq!(rows[0].values[0], Value::Int(3));
9101    }
9102
9103    #[test]
9104    fn select_unknown_table_errors() {
9105        let mut e = Engine::new();
9106        let err = e.execute("SELECT * FROM ghost").unwrap_err();
9107        assert!(matches!(
9108            err,
9109            EngineError::Storage(StorageError::TableNotFound { .. })
9110        ));
9111    }
9112
9113    #[test]
9114    fn invalid_sql_returns_parse_error() {
9115        // v4.4: UPDATE is now real SQL, so use a true syntactic
9116        // garbage payload for the parse-error path.
9117        let mut e = Engine::new();
9118        let err = e.execute("THIS_IS_NOT_A_KEYWORD foo bar baz").unwrap_err();
9119        assert!(matches!(err, EngineError::Parse(_)));
9120    }
9121
9122    // --- v0.8 CREATE INDEX + index seek ------------------------------------
9123
9124    #[test]
9125    fn create_index_registers_on_table() {
9126        let mut e = Engine::new();
9127        make_three_row_users(&mut e);
9128        e.execute("CREATE INDEX by_name ON users (name)").unwrap();
9129        let t = e.catalog().get("users").unwrap();
9130        assert_eq!(t.indices().len(), 1);
9131        assert_eq!(t.indices()[0].name, "by_name");
9132    }
9133
9134    #[test]
9135    fn create_index_on_unknown_table_errors() {
9136        let mut e = Engine::new();
9137        let err = e.execute("CREATE INDEX i ON ghost (a)").unwrap_err();
9138        assert!(matches!(
9139            err,
9140            EngineError::Storage(StorageError::TableNotFound { .. })
9141        ));
9142    }
9143
9144    #[test]
9145    fn create_index_on_unknown_column_errors() {
9146        let mut e = Engine::new();
9147        make_three_row_users(&mut e);
9148        let err = e.execute("CREATE INDEX i ON users (ghost)").unwrap_err();
9149        assert!(matches!(
9150            err,
9151            EngineError::Storage(StorageError::ColumnNotFound { .. })
9152        ));
9153    }
9154
9155    #[test]
9156    fn select_eq_uses_index_returns_same_rows_as_scan() {
9157        // Build two engines: one with an index, one without. Same query →
9158        // same row set (index is a planner optimisation, not a semantic
9159        // change).
9160        let mut without = Engine::new();
9161        make_three_row_users(&mut without);
9162        let mut with = Engine::new();
9163        make_three_row_users(&mut with);
9164        with.execute("CREATE INDEX by_id ON users (id)").unwrap();
9165
9166        let q = "SELECT * FROM users WHERE id = 2";
9167        let (_, no_idx_rows) = unwrap_rows(without.execute(q).unwrap());
9168        let (_, idx_rows) = unwrap_rows(with.execute(q).unwrap());
9169        assert_eq!(no_idx_rows, idx_rows);
9170        assert_eq!(idx_rows.len(), 1);
9171    }
9172
9173    #[test]
9174    fn select_eq_with_no_matching_index_value_returns_empty() {
9175        let mut e = Engine::new();
9176        make_three_row_users(&mut e);
9177        e.execute("CREATE INDEX by_id ON users (id)").unwrap();
9178        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM users WHERE id = 999").unwrap());
9179        assert_eq!(rows.len(), 0);
9180    }
9181
9182    // --- v0.9 transactions -------------------------------------------------
9183
9184    #[test]
9185    fn begin_sets_in_transaction_flag() {
9186        let mut e = Engine::new();
9187        assert!(!e.in_transaction());
9188        e.execute("BEGIN").unwrap();
9189        assert!(e.in_transaction());
9190    }
9191
9192    #[test]
9193    fn double_begin_errors() {
9194        let mut e = Engine::new();
9195        e.execute("BEGIN").unwrap();
9196        let err = e.execute("BEGIN").unwrap_err();
9197        assert_eq!(err, EngineError::TransactionAlreadyOpen);
9198    }
9199
9200    #[test]
9201    fn commit_without_begin_errors() {
9202        let mut e = Engine::new();
9203        let err = e.execute("COMMIT").unwrap_err();
9204        assert_eq!(err, EngineError::NoActiveTransaction);
9205    }
9206
9207    #[test]
9208    fn rollback_without_begin_errors() {
9209        let mut e = Engine::new();
9210        let err = e.execute("ROLLBACK").unwrap_err();
9211        assert_eq!(err, EngineError::NoActiveTransaction);
9212    }
9213
9214    #[test]
9215    fn commit_applies_shadow_to_committed_catalog() {
9216        let mut e = Engine::new();
9217        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9218        e.execute("BEGIN").unwrap();
9219        e.execute("INSERT INTO t VALUES (1)").unwrap();
9220        e.execute("INSERT INTO t VALUES (2)").unwrap();
9221        e.execute("COMMIT").unwrap();
9222        assert!(!e.in_transaction());
9223        assert_eq!(e.catalog().get("t").unwrap().row_count(), 2);
9224    }
9225
9226    #[test]
9227    fn rollback_discards_shadow() {
9228        let mut e = Engine::new();
9229        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9230        e.execute("BEGIN").unwrap();
9231        e.execute("INSERT INTO t VALUES (1)").unwrap();
9232        e.execute("INSERT INTO t VALUES (2)").unwrap();
9233        e.execute("ROLLBACK").unwrap();
9234        assert!(!e.in_transaction());
9235        assert_eq!(e.catalog().get("t").unwrap().row_count(), 0);
9236    }
9237
9238    #[test]
9239    fn select_during_tx_sees_uncommitted_writes_own_session() {
9240        // The shadow catalog is read by SELECTs while a TX is open — the
9241        // session can see its own pending writes.
9242        let mut e = Engine::new();
9243        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9244        e.execute("BEGIN").unwrap();
9245        e.execute("INSERT INTO t VALUES (42)").unwrap();
9246        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM t").unwrap());
9247        assert_eq!(rows.len(), 1);
9248        assert_eq!(rows[0].values[0], Value::Int(42));
9249    }
9250
9251    #[test]
9252    fn snapshot_with_no_users_is_bare_catalog_format() {
9253        let mut e = Engine::new();
9254        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9255        let bytes = e.snapshot();
9256        assert_eq!(
9257            &bytes[..8],
9258            b"SPGDB001",
9259            "must be the bare v3.x catalog magic"
9260        );
9261        let e2 = Engine::restore_envelope(&bytes).unwrap();
9262        assert!(e2.users().is_empty());
9263        assert_eq!(e2.catalog().table_count(), 1);
9264    }
9265
9266    #[test]
9267    fn snapshot_with_users_round_trips_both_via_envelope() {
9268        let mut e = Engine::new();
9269        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9270        e.create_user("alice", "pw1", Role::Admin, [9; 16]).unwrap();
9271        e.create_user("bob", "pw2", Role::ReadOnly, [5; 16])
9272            .unwrap();
9273        let bytes = e.snapshot();
9274        assert_eq!(&bytes[..8], b"SPGENV01", "must be the v4.1 envelope magic");
9275        let e2 = Engine::restore_envelope(&bytes).unwrap();
9276        assert_eq!(e2.users().len(), 2);
9277        assert_eq!(e2.verify_user("alice", "pw1"), Some(Role::Admin));
9278        assert_eq!(e2.verify_user("bob", "pw2"), Some(Role::ReadOnly));
9279        assert_eq!(e2.verify_user("alice", "wrong"), None);
9280        assert_eq!(e2.catalog().table_count(), 1);
9281    }
9282
9283    #[test]
9284    fn ddl_inside_tx_also_rolled_back() {
9285        let mut e = Engine::new();
9286        e.execute("BEGIN").unwrap();
9287        e.execute("CREATE TABLE t (v INT)").unwrap();
9288        // Visible inside the TX.
9289        e.execute("SELECT * FROM t").unwrap();
9290        e.execute("ROLLBACK").unwrap();
9291        // Gone after rollback.
9292        let err = e.execute("SELECT * FROM t").unwrap_err();
9293        assert!(matches!(
9294            err,
9295            EngineError::Storage(StorageError::TableNotFound { .. })
9296        ));
9297    }
9298
9299    // ── v6.1.2: CREATE / DROP PUBLICATION (engine-side) ──────
9300
9301    #[test]
9302    fn create_publication_lands_in_catalog() {
9303        let mut e = Engine::new();
9304        assert!(e.publications().is_empty());
9305        e.execute("CREATE PUBLICATION pub_a").unwrap();
9306        assert_eq!(e.publications().len(), 1);
9307        assert!(e.publications().contains("pub_a"));
9308    }
9309
9310    #[test]
9311    fn create_publication_duplicate_errors() {
9312        let mut e = Engine::new();
9313        e.execute("CREATE PUBLICATION pub_a").unwrap();
9314        let err = e.execute("CREATE PUBLICATION pub_a").unwrap_err();
9315        assert!(
9316            alloc::format!("{err:?}").contains("DuplicateName"),
9317            "got {err:?}"
9318        );
9319    }
9320
9321    #[test]
9322    fn drop_publication_silent_when_absent() {
9323        let mut e = Engine::new();
9324        // PG-compatible: DROP a publication that doesn't exist
9325        // succeeds (no-op) but reports zero affected.
9326        let r = e.execute("DROP PUBLICATION nope").unwrap();
9327        match r {
9328            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
9329            other => panic!("expected CommandOk, got {other:?}"),
9330        }
9331    }
9332
9333    #[test]
9334    fn drop_publication_present_reports_one_affected() {
9335        let mut e = Engine::new();
9336        e.execute("CREATE PUBLICATION pub_a").unwrap();
9337        let r = e.execute("DROP PUBLICATION pub_a").unwrap();
9338        match r {
9339            QueryResult::CommandOk {
9340                affected,
9341                modified_catalog,
9342            } => {
9343                assert_eq!(affected, 1);
9344                assert!(modified_catalog);
9345            }
9346            other => panic!("expected CommandOk, got {other:?}"),
9347        }
9348        assert!(e.publications().is_empty());
9349    }
9350
9351    #[test]
9352    fn publications_persist_across_snapshot_restore() {
9353        // The persist-across-restart ship-gate at the engine layer —
9354        // snapshot → restore_envelope round trip must preserve the
9355        // publication catalog. The spg-server e2e covers the
9356        // process-restart variant.
9357        let mut e = Engine::new();
9358        e.execute("CREATE PUBLICATION pub_a").unwrap();
9359        e.execute("CREATE PUBLICATION pub_b FOR ALL TABLES").unwrap();
9360        let snap = e.snapshot();
9361        let e2 = Engine::restore_envelope(&snap).unwrap();
9362        assert_eq!(e2.publications().len(), 2);
9363        assert!(e2.publications().contains("pub_a"));
9364        assert!(e2.publications().contains("pub_b"));
9365    }
9366
9367    #[test]
9368    fn create_publication_allowed_inside_transaction() {
9369        // v6.1.4 dropped the v6.1.2 in-TX guard — PG allows
9370        // CREATE PUBLICATION inside a TX and the auto-commit
9371        // wrap path needs the same allowance.
9372        let mut e = Engine::new();
9373        e.execute("BEGIN").unwrap();
9374        e.execute("CREATE PUBLICATION pub_a").unwrap();
9375        e.execute("COMMIT").unwrap();
9376        assert!(e.publications().contains("pub_a"));
9377    }
9378
9379    // ── v6.1.3: SHOW PUBLICATIONS + FOR-list variants ───────
9380
9381    #[test]
9382    fn create_publication_for_table_list_lands_with_scope() {
9383        let mut e = Engine::new();
9384        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
9385        e.execute("CREATE TABLE t2 (id INT NOT NULL)").unwrap();
9386        e.execute("CREATE PUBLICATION pub_a FOR TABLE t1, t2")
9387            .unwrap();
9388        let scope = e.publications().get("pub_a").cloned();
9389        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = scope else {
9390            panic!("expected ForTables scope, got {scope:?}")
9391        };
9392        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
9393    }
9394
9395    #[test]
9396    fn create_publication_all_tables_except_lands_with_scope() {
9397        let mut e = Engine::new();
9398        e.execute("CREATE PUBLICATION pub_a FOR ALL TABLES EXCEPT t3")
9399            .unwrap();
9400        let scope = e.publications().get("pub_a").cloned();
9401        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = scope else {
9402            panic!("expected AllTablesExcept scope, got {scope:?}")
9403        };
9404        assert_eq!(ts, alloc::vec!["t3".to_string()]);
9405    }
9406
9407    #[test]
9408    fn show_publications_empty_returns_zero_rows() {
9409        let e = Engine::new();
9410        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
9411        let QueryResult::Rows { rows, columns } = r else {
9412            panic!()
9413        };
9414        assert!(rows.is_empty());
9415        assert_eq!(columns.len(), 3);
9416        assert_eq!(columns[0].name, "name");
9417        assert_eq!(columns[1].name, "scope");
9418        assert_eq!(columns[2].name, "table_count");
9419    }
9420
9421    #[test]
9422    fn show_publications_returns_one_row_per_publication_ordered_by_name() {
9423        let mut e = Engine::new();
9424        e.execute("CREATE PUBLICATION z_pub").unwrap();
9425        e.execute("CREATE PUBLICATION a_pub FOR TABLE t1, t2")
9426            .unwrap();
9427        e.execute("CREATE PUBLICATION m_pub FOR ALL TABLES EXCEPT bad")
9428            .unwrap();
9429        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
9430        let QueryResult::Rows { rows, .. } = r else {
9431            panic!()
9432        };
9433        assert_eq!(rows.len(), 3);
9434        // Alphabetical order: a_pub, m_pub, z_pub.
9435        let names: Vec<&str> = rows
9436            .iter()
9437            .map(|r| {
9438                if let Value::Text(s) = &r.values[0] {
9439                    s.as_str()
9440                } else {
9441                    panic!()
9442                }
9443            })
9444            .collect();
9445        assert_eq!(names, alloc::vec!["a_pub", "m_pub", "z_pub"]);
9446        // Row 0 — a_pub scope summary + table_count = 2.
9447        match &rows[0].values[1] {
9448            Value::Text(s) => assert_eq!(s, "FOR TABLE t1, t2"),
9449            other => panic!("expected Text, got {other:?}"),
9450        }
9451        assert_eq!(rows[0].values[2], Value::Int(2));
9452        // Row 1 — m_pub.
9453        match &rows[1].values[1] {
9454            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES EXCEPT bad"),
9455            other => panic!("expected Text, got {other:?}"),
9456        }
9457        assert_eq!(rows[1].values[2], Value::Int(1));
9458        // Row 2 — z_pub (AllTables → NULL count).
9459        match &rows[2].values[1] {
9460            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES"),
9461            other => panic!("expected Text, got {other:?}"),
9462        }
9463        assert_eq!(rows[2].values[2], Value::Null);
9464    }
9465
9466    #[test]
9467    fn for_list_scopes_persist_across_snapshot() {
9468        // The v6.1.2 envelope-v3 round-trip exercised AllTables;
9469        // v6.1.3 needs the scope-1 / scope-2 tags to survive too.
9470        let mut e = Engine::new();
9471        e.execute("CREATE PUBLICATION p1 FOR TABLE t1, t2").unwrap();
9472        e.execute("CREATE PUBLICATION p2 FOR ALL TABLES EXCEPT bad, worse")
9473            .unwrap();
9474        let snap = e.snapshot();
9475        let e2 = Engine::restore_envelope(&snap).unwrap();
9476        assert_eq!(e2.publications().len(), 2);
9477        let p1 = e2.publications().get("p1").cloned();
9478        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = p1 else {
9479            panic!("p1 scope lost: {p1:?}")
9480        };
9481        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
9482        let p2 = e2.publications().get("p2").cloned();
9483        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = p2 else {
9484            panic!("p2 scope lost: {p2:?}")
9485        };
9486        assert_eq!(ts, alloc::vec!["bad".to_string(), "worse".to_string()]);
9487    }
9488
9489    // ── v6.1.4: CREATE / DROP SUBSCRIPTION + SHOW + envelope v4 ─
9490
9491    #[test]
9492    fn create_subscription_lands_in_catalog_with_defaults() {
9493        let mut e = Engine::new();
9494        e.execute(
9495            "CREATE SUBSCRIPTION sub_a CONNECTION 'host=127.0.0.1 port=20002' PUBLICATION pub_a",
9496        )
9497        .unwrap();
9498        let s = e.subscriptions().get("sub_a").cloned().expect("present");
9499        assert_eq!(s.conn_str, "host=127.0.0.1 port=20002");
9500        assert_eq!(s.publications, alloc::vec!["pub_a".to_string()]);
9501        assert!(s.enabled);
9502        assert_eq!(s.last_received_pos, 0);
9503    }
9504
9505    #[test]
9506    fn create_subscription_duplicate_name_errors() {
9507        let mut e = Engine::new();
9508        e.execute("CREATE SUBSCRIPTION s CONNECTION 'host=x' PUBLICATION p")
9509            .unwrap();
9510        let err = e
9511            .execute("CREATE SUBSCRIPTION s CONNECTION 'host=y' PUBLICATION p")
9512            .unwrap_err();
9513        assert!(
9514            alloc::format!("{err:?}").contains("DuplicateName"),
9515            "got {err:?}"
9516        );
9517    }
9518
9519    #[test]
9520    fn drop_subscription_silent_when_absent() {
9521        let mut e = Engine::new();
9522        let r = e.execute("DROP SUBSCRIPTION never").unwrap();
9523        match r {
9524            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
9525            other => panic!("expected CommandOk, got {other:?}"),
9526        }
9527    }
9528
9529    #[test]
9530    fn subscription_advance_updates_last_pos_monotone() {
9531        let mut e = Engine::new();
9532        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
9533            .unwrap();
9534        assert!(e.subscription_advance("s", 100));
9535        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
9536        assert!(e.subscription_advance("s", 50)); // stale → ignored
9537        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
9538        assert!(e.subscription_advance("s", 200));
9539        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 200);
9540        assert!(!e.subscription_advance("missing", 1));
9541    }
9542
9543    #[test]
9544    fn show_subscriptions_returns_rows_ordered_by_name() {
9545        let mut e = Engine::new();
9546        e.execute("CREATE SUBSCRIPTION z_sub CONNECTION 'h=x' PUBLICATION p1, p2")
9547            .unwrap();
9548        e.execute("CREATE SUBSCRIPTION a_sub CONNECTION 'h=y' PUBLICATION p3")
9549            .unwrap();
9550        let r = e.execute_readonly("SHOW SUBSCRIPTIONS").unwrap();
9551        let QueryResult::Rows { rows, columns } = r else {
9552            panic!()
9553        };
9554        assert_eq!(rows.len(), 2);
9555        assert_eq!(columns.len(), 5);
9556        assert_eq!(columns[0].name, "name");
9557        assert_eq!(columns[4].name, "last_received_pos");
9558        // Alphabetical: a_sub, z_sub.
9559        let names: Vec<&str> = rows
9560            .iter()
9561            .map(|r| {
9562                if let Value::Text(s) = &r.values[0] {
9563                    s.as_str()
9564                } else {
9565                    panic!()
9566                }
9567            })
9568            .collect();
9569        assert_eq!(names, alloc::vec!["a_sub", "z_sub"]);
9570        // Row 0: a_sub
9571        assert_eq!(rows[0].values[1], Value::Text("h=y".to_string()));
9572        assert_eq!(rows[0].values[2], Value::Text("p3".to_string()));
9573        assert_eq!(rows[0].values[3], Value::Bool(true));
9574        assert_eq!(rows[0].values[4], Value::BigInt(0));
9575        // Row 1: z_sub — publications join with ", "
9576        assert_eq!(rows[1].values[2], Value::Text("p1, p2".to_string()));
9577    }
9578
9579    #[test]
9580    fn subscriptions_persist_across_snapshot_envelope_v4() {
9581        let mut e = Engine::new();
9582        e.execute("CREATE SUBSCRIPTION s1 CONNECTION 'h=A' PUBLICATION p1, p2")
9583            .unwrap();
9584        e.execute("CREATE SUBSCRIPTION s2 CONNECTION 'h=B' PUBLICATION p3")
9585            .unwrap();
9586        e.subscription_advance("s2", 42);
9587        let snap = e.snapshot();
9588        let e2 = Engine::restore_envelope(&snap).unwrap();
9589        assert_eq!(e2.subscriptions().len(), 2);
9590        let s1 = e2.subscriptions().get("s1").unwrap();
9591        assert_eq!(s1.conn_str, "h=A");
9592        assert_eq!(s1.publications, alloc::vec!["p1".to_string(), "p2".to_string()]);
9593        assert_eq!(s1.last_received_pos, 0);
9594        let s2 = e2.subscriptions().get("s2").unwrap();
9595        assert_eq!(s2.last_received_pos, 42);
9596    }
9597
9598    #[test]
9599    fn v3_envelope_loads_with_empty_subscriptions() {
9600        // v3 snapshot (publications-only). Forge it by hand so we
9601        // verify v6.1.4 readers don't panic — they must surface
9602        // empty subscriptions and a populated publication table.
9603        let mut e = Engine::new();
9604        e.execute("CREATE PUBLICATION pub_legacy").unwrap();
9605        let catalog = e.catalog.serialize();
9606        let users = crate::users::serialize_users(&e.users);
9607        let pubs = e.publications.serialize();
9608        let mut buf = Vec::new();
9609        buf.extend_from_slice(b"SPGENV01");
9610        buf.push(3u8); // v3
9611        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
9612        buf.extend_from_slice(&catalog);
9613        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
9614        buf.extend_from_slice(&users);
9615        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
9616        buf.extend_from_slice(&pubs);
9617        let crc = spg_crypto::crc32::crc32(&buf);
9618        buf.extend_from_slice(&crc.to_le_bytes());
9619
9620        let e2 = Engine::restore_envelope(&buf).expect("v3 envelope restores under v4 reader");
9621        assert!(e2.subscriptions().is_empty());
9622        assert!(e2.publications().contains("pub_legacy"));
9623    }
9624
9625    #[test]
9626    fn create_subscription_allowed_inside_transaction() {
9627        let mut e = Engine::new();
9628        e.execute("BEGIN").unwrap();
9629        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
9630            .unwrap();
9631        e.execute("COMMIT").unwrap();
9632        assert!(e.subscriptions().contains("s"));
9633    }
9634
9635    #[test]
9636    // ── v6.2.0: ANALYZE + spg_statistic + envelope v5 ──────────
9637
9638    #[test]
9639    fn analyze_populates_histogram_bounds() {
9640        let mut e = Engine::new();
9641        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT)").unwrap();
9642        for i in 0..50 {
9643            e.execute(&alloc::format!(
9644                "INSERT INTO t VALUES ({i}, 'name{i}')"
9645            ))
9646            .unwrap();
9647        }
9648        e.execute("ANALYZE t").unwrap();
9649        let stats = e.statistics();
9650        let id_stats = stats.get("t", "id").unwrap();
9651        assert!(id_stats.histogram_bounds.len() >= 2);
9652        assert_eq!(id_stats.histogram_bounds.first().unwrap(), "0");
9653        assert_eq!(id_stats.histogram_bounds.last().unwrap(), "49");
9654        assert!((id_stats.null_frac - 0.0).abs() < 1e-6);
9655        assert_eq!(id_stats.n_distinct, 50);
9656    }
9657
9658    #[test]
9659    fn reanalyze_overwrites_prior_stats() {
9660        let mut e = Engine::new();
9661        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9662        for i in 0..10 {
9663            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9664        }
9665        e.execute("ANALYZE t").unwrap();
9666        let n1 = e.statistics().get("t", "id").unwrap().n_distinct;
9667        assert_eq!(n1, 10);
9668        for i in 10..30 {
9669            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9670        }
9671        e.execute("ANALYZE t").unwrap();
9672        let n2 = e.statistics().get("t", "id").unwrap().n_distinct;
9673        assert_eq!(n2, 30);
9674    }
9675
9676    #[test]
9677    fn analyze_unknown_table_errors() {
9678        let mut e = Engine::new();
9679        let err = e.execute("ANALYZE nonexistent").unwrap_err();
9680        assert!(matches!(err, EngineError::Storage(StorageError::TableNotFound { .. })));
9681    }
9682
9683    #[test]
9684    fn bare_analyze_covers_all_user_tables() {
9685        let mut e = Engine::new();
9686        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
9687        e.execute("CREATE TABLE t2 (name TEXT NOT NULL)").unwrap();
9688        e.execute("INSERT INTO t1 VALUES (1)").unwrap();
9689        e.execute("INSERT INTO t2 VALUES ('alice')").unwrap();
9690        let r = e.execute("ANALYZE").unwrap();
9691        match r {
9692            QueryResult::CommandOk { affected, modified_catalog } => {
9693                assert_eq!(affected, 2);
9694                assert!(modified_catalog);
9695            }
9696            other => panic!("expected CommandOk, got {other:?}"),
9697        }
9698        assert!(e.statistics().get("t1", "id").is_some());
9699        assert!(e.statistics().get("t2", "name").is_some());
9700    }
9701
9702    #[test]
9703    fn select_from_spg_statistic_returns_rows_per_column() {
9704        let mut e = Engine::new();
9705        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
9706            .unwrap();
9707        e.execute("INSERT INTO t VALUES (1, 'a')").unwrap();
9708        e.execute("INSERT INTO t VALUES (2, 'b')").unwrap();
9709        e.execute("ANALYZE t").unwrap();
9710        let r = e.execute_readonly("SELECT * FROM spg_statistic").unwrap();
9711        let QueryResult::Rows { rows, columns } = r else {
9712            panic!()
9713        };
9714        // v6.7.0 — spg_statistic gained a `cold_row_count` column.
9715        assert_eq!(columns.len(), 6);
9716        assert_eq!(columns[0].name, "table_name");
9717        assert_eq!(columns[4].name, "histogram_bounds");
9718        assert_eq!(columns[5].name, "cold_row_count");
9719        assert_eq!(rows.len(), 2, "one row per column of t");
9720        // Sorted by (table_name, column_name).
9721        match (&rows[0].values[0], &rows[0].values[1]) {
9722            (Value::Text(t), Value::Text(c)) => {
9723                assert_eq!(t, "t");
9724                // BTreeMap orders (table, column); columns "id" < "label".
9725                assert_eq!(c, "id");
9726            }
9727            _ => panic!(),
9728        }
9729    }
9730
9731    #[test]
9732    fn analyze_skips_vector_columns() {
9733        // Vector columns have their own stats shape (HNSW graph);
9734        // ANALYZE leaves them out of spg_statistic.
9735        let mut e = Engine::new();
9736        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
9737            .unwrap();
9738        e.execute("INSERT INTO t VALUES (1, [1, 2, 3])").unwrap();
9739        e.execute("ANALYZE t").unwrap();
9740        assert!(e.statistics().get("t", "id").is_some());
9741        assert!(e.statistics().get("t", "v").is_none());
9742    }
9743
9744    #[test]
9745    fn statistics_persist_across_envelope_v5_round_trip() {
9746        let mut e = Engine::new();
9747        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9748        for i in 0..20 {
9749            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9750        }
9751        e.execute("ANALYZE").unwrap();
9752        let snap = e.snapshot();
9753        let e2 = Engine::restore_envelope(&snap).unwrap();
9754        let s = e2.statistics().get("t", "id").unwrap();
9755        assert_eq!(s.n_distinct, 20);
9756    }
9757
9758    // ── v6.2.1 auto-analyze threshold ───────────────────────────
9759
9760    #[test]
9761    fn auto_analyze_threshold_fires_after_10pct_of_min_rows_on_small_table() {
9762        // For a table with 0 rows then 10 inserts → modified=10,
9763        // row_count=10. Threshold = 0.1 × max(10, 100) = 10. So
9764        // after the 10th INSERT the threshold is met.
9765        let mut e = Engine::new();
9766        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9767        for i in 0..9 {
9768            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9769        }
9770        assert!(e.tables_needing_analyze().is_empty(), "9 < threshold");
9771        e.execute("INSERT INTO t VALUES (9)").unwrap();
9772        let needs = e.tables_needing_analyze();
9773        assert_eq!(needs, alloc::vec!["t".to_string()]);
9774    }
9775
9776    #[test]
9777    fn auto_analyze_threshold_uses_10pct_of_row_count_for_large_tables() {
9778        // After ANALYZE on 1000 rows, threshold = 0.1 × row_count.
9779        // Each new INSERT bumps both modified and row_count, so to
9780        // trigger from N=1000 we need modifications ≥ 0.1 × (1000+M),
9781        // i.e. M ≥ 112. The test inserts 50 (no fire), then 150
9782        // more (200 total mods, row_count=1200, threshold=120 → fire).
9783        let mut e = Engine::new();
9784        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9785        for i in 0..1000 {
9786            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9787        }
9788        e.execute("ANALYZE t").unwrap();
9789        assert!(e.tables_needing_analyze().is_empty(), "fresh ANALYZE");
9790        for i in 1000..1050 {
9791            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9792        }
9793        assert!(
9794            e.tables_needing_analyze().is_empty(),
9795            "50 inserts < threshold of ~105"
9796        );
9797        for i in 1050..1200 {
9798            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9799        }
9800        assert_eq!(
9801            e.tables_needing_analyze(),
9802            alloc::vec!["t".to_string()],
9803            "200 inserts > 0.1 × 1200 threshold"
9804        );
9805    }
9806
9807    #[test]
9808    fn auto_analyze_threshold_resets_after_analyze() {
9809        let mut e = Engine::new();
9810        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9811        for i in 0..200 {
9812            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9813        }
9814        assert!(!e.tables_needing_analyze().is_empty());
9815        e.execute("ANALYZE").unwrap();
9816        assert!(
9817            e.tables_needing_analyze().is_empty(),
9818            "ANALYZE must reset the counter"
9819        );
9820    }
9821
9822    #[test]
9823    fn auto_analyze_threshold_tracks_updates_and_deletes() {
9824        let mut e = Engine::new();
9825        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)").unwrap();
9826        for i in 0..50 {
9827            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, 'x')"))
9828                .unwrap();
9829        }
9830        e.execute("ANALYZE t").unwrap();
9831        // UPDATE 20 rows + DELETE 5 → modified=25. Threshold = 0.1
9832        // × max(50, 100) = 10. So 25 >= 10 → trigger.
9833        e.execute("UPDATE t SET label = 'y' WHERE id < 20").unwrap();
9834        e.execute("DELETE FROM t WHERE id >= 45").unwrap();
9835        assert_eq!(
9836            e.tables_needing_analyze(),
9837            alloc::vec!["t".to_string()]
9838        );
9839    }
9840
9841    #[test]
9842    fn v4_envelope_loads_with_empty_statistics() {
9843        // Forge a v4 envelope by hand: catalog + users + pubs +
9844        // subs trailer, no statistics. A v6.2.0 reader must accept
9845        // it and surface an empty Statistics.
9846        let mut e = Engine::new();
9847        e.create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
9848            .unwrap();
9849        let catalog = e.catalog.serialize();
9850        let users = crate::users::serialize_users(&e.users);
9851        let pubs = e.publications.serialize();
9852        let subs = e.subscriptions.serialize();
9853        let mut buf = Vec::new();
9854        buf.extend_from_slice(b"SPGENV01");
9855        buf.push(4u8);
9856        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
9857        buf.extend_from_slice(&catalog);
9858        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
9859        buf.extend_from_slice(&users);
9860        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
9861        buf.extend_from_slice(&pubs);
9862        buf.extend_from_slice(&u32::try_from(subs.len()).unwrap().to_le_bytes());
9863        buf.extend_from_slice(&subs);
9864        let crc = spg_crypto::crc32::crc32(&buf);
9865        buf.extend_from_slice(&crc.to_le_bytes());
9866        let e2 = Engine::restore_envelope(&buf).expect("v4 envelope restores");
9867        assert!(e2.statistics().is_empty());
9868    }
9869
9870    #[test]
9871    fn v1_v2_envelope_loads_with_empty_publications() {
9872        // A snapshot taken before v6.1.2 (no publication trailer,
9873        // envelope v2) must still deserialise — and the resulting
9874        // engine must report zero publications. Use the engine's own
9875        // round-trip with no publications: that emits v3 but with an
9876        // empty pubs block. Then forge a v2 envelope by hand to lock
9877        // the back-compat path.
9878        let mut e = Engine::new();
9879        // Force users to be non-empty so the snapshot takes the
9880        // envelope path rather than the bare-catalog fallback.
9881        e.create_user(
9882            "alice",
9883            "secret",
9884            crate::users::Role::ReadOnly,
9885            [0u8; 16],
9886        )
9887        .unwrap();
9888
9889        // Forge an envelope v2: same shape as v3 but no pubs trailer.
9890        let catalog = e.catalog.serialize();
9891        let users = crate::users::serialize_users(&e.users);
9892        let mut buf = Vec::new();
9893        buf.extend_from_slice(b"SPGENV01");
9894        buf.push(2u8); // v2
9895        buf.extend_from_slice(
9896            &u32::try_from(catalog.len()).unwrap().to_le_bytes(),
9897        );
9898        buf.extend_from_slice(&catalog);
9899        buf.extend_from_slice(
9900            &u32::try_from(users.len()).unwrap().to_le_bytes(),
9901        );
9902        buf.extend_from_slice(&users);
9903        let crc = spg_crypto::crc32::crc32(&buf);
9904        buf.extend_from_slice(&crc.to_le_bytes());
9905
9906        let e2 = Engine::restore_envelope(&buf).expect("v2 envelope restores");
9907        assert!(e2.publications().is_empty());
9908    }
9909}