Skip to main content

spg_engine/
lib.rs

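//! SPG execution engine — wires the SQL front-end (`spg_sql`) to the in-memory
//! storage layer (`spg_storage`).
//!
//! The v0.3 milestone implemented `CREATE TABLE`, single-row `INSERT VALUES`,
//! and `SELECT * FROM <table>` (WHERE was deferred to v0.4, alongside
//! expression evaluation against rows). The crate has grown since then; the
//! versioned notes on the individual items below are the up-to-date reference.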
5#![no_std]
6
7extern crate alloc;
8
9pub mod aggregate;
10pub mod describe;
11pub mod eval;
12pub mod json;
13pub mod memoize;
14pub mod plan_cache;
15pub mod publications;
16pub mod query_stats;
17pub mod reorder;
18pub mod selectivity;
19pub mod statistics;
20pub mod subscriptions;
21pub mod users;
22
23pub use crate::users::{Role, ScramSecrets, UserError, UserStore};
24
25use alloc::borrow::Cow;
26use alloc::boxed::Box;
27use alloc::collections::BTreeMap;
28use alloc::string::{String, ToString};
29use alloc::vec::Vec;
30use core::fmt;
31
32use spg_sql::ast::{
33    BinOp, ColumnDef, ColumnName, ColumnTypeName, CreateIndexStatement,
34    CreatePublicationStatement, CreateSubscriptionStatement, CreateTableStatement,
35    CreateUserStatement, Expr, FrameBound, FrameKind, FromClause, IndexMethod, InsertStatement,
36    JoinKind, Literal, OrderBy, SelectItem, SelectStatement, Statement, UnOp, UnionKind,
37    VecEncoding as SqlVecEncoding, WindowFrame,
38};
39use spg_sql::parser::{self, ParseError};
40use spg_storage::{
41    Catalog, ColumnSchema, CompactReport, DataType, IndexKey, IndexKind, Row, StorageError, Table,
42    TableSchema, Value, VecEncoding,
43};
44
45use crate::eval::{EvalContext, EvalError};
46
47/// Result of executing one statement.
48#[derive(Debug, Clone, PartialEq)]
49#[non_exhaustive]
50pub enum QueryResult {
51    /// DDL or DML succeeded.
52    ///
53    /// `affected` is the row count for `INSERT` and 0 elsewhere.
54    /// `modified_catalog` tells the server whether this statement
55    /// caused the *committed* catalog to change — it's the signal to
56    /// snapshot/audit. False for `BEGIN`/`ROLLBACK`, false for writeful
57    /// statements executed inside a transaction (those only touch the
58    /// shadow), and true for `COMMIT` and for writes outside a TX.
59    CommandOk {
60        affected: usize,
61        modified_catalog: bool,
62    },
63    /// `SELECT` returned a (possibly empty) row set.
64    Rows {
65        columns: Vec<ColumnSchema>,
66        rows: Vec<Row>,
67    },
68}
69
70/// All errors the engine can return.
71///
72/// Marked `#[non_exhaustive]` from v7.5.0 onward: external `match`
73/// must include a `_` arm so new variants in subsequent v7.x releases
74/// are not breaking changes.
75#[derive(Debug, Clone, PartialEq)]
76#[non_exhaustive]
77pub enum EngineError {
78    Parse(ParseError),
79    Storage(StorageError),
80    Eval(EvalError),
81    /// Front-end accepted a construct that the v0.x executor doesn't support.
82    Unsupported(String),
83    /// `BEGIN` while another transaction is already open.
84    TransactionAlreadyOpen,
85    /// `COMMIT` / `ROLLBACK` with no active transaction.
86    NoActiveTransaction,
87    /// v4.0 sentinel: `execute_readonly` got a statement that
88    /// mutates engine state (INSERT / CREATE / BEGIN / COMMIT / …).
89    /// The caller should retake the write lock and dispatch through
90    /// `execute(&mut self)` instead.
91    WriteRequired,
92    /// v4.2: a SELECT would have returned more rows than the
93    /// configured `max_query_rows` cap. Carries the cap.
94    RowLimitExceeded(usize),
95    /// v4.5: cooperative cancellation — the host (server's
96    /// per-query watchdog) set the cancel flag while a long-running
97    /// SELECT / UPDATE / DELETE was scanning rows. The partial work
98    /// is discarded; the caller should surface this as a timeout
99    /// to the client.
100    Cancelled,
101}
102
103impl fmt::Display for EngineError {
104    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105        match self {
106            Self::Parse(e) => write!(f, "parse: {e}"),
107            Self::Storage(e) => write!(f, "storage: {e}"),
108            Self::Eval(e) => write!(f, "eval: {e}"),
109            Self::Unsupported(s) => write!(f, "unsupported: {s}"),
110            Self::TransactionAlreadyOpen => f.write_str("a transaction is already open"),
111            Self::NoActiveTransaction => f.write_str("no active transaction"),
112            Self::WriteRequired => {
113                f.write_str("statement requires a write lock (use execute, not execute_readonly)")
114            }
115            Self::RowLimitExceeded(n) => {
116                write!(f, "query exceeded max_query_rows={n}")
117            }
118            Self::Cancelled => f.write_str("query cancelled (timeout or client request)"),
119        }
120    }
121}
122
123impl From<ParseError> for EngineError {
124    fn from(e: ParseError) -> Self {
125        Self::Parse(e)
126    }
127}
128impl From<StorageError> for EngineError {
129    fn from(e: StorageError) -> Self {
130        Self::Storage(e)
131    }
132}
133impl From<EvalError> for EngineError {
134    fn from(e: EvalError) -> Self {
135        Self::Eval(e)
136    }
137}
138
// The execution engine (`Engine`, declared further down) holds the catalog
// and other server-scope state. `Engine::new()` is intentionally cheap so
// callers can construct one per database, per test.
//
/// Host-injected clock: a plain function pointer that yields "now" as
/// microseconds since the Unix epoch.
///
/// The engine is `no_std` and so cannot call `std::time` itself; the
/// embedding host (`spg-server`, the sqllogictest runner) supplies the
/// concrete implementation. When no clock is installed (`None`),
/// `NOW()` / `CURRENT_*` raise `Unsupported`.
pub type ClockFn = fn() -> i64;
148
/// Host-injected source of 16 cryptographically random bytes.
///
/// As with `ClockFn`, the `no_std` engine cannot read /dev/urandom on
/// its own, so the host (`spg-server`) installs an OS-backed generator.
/// When none is installed (`None`), SQL-driven `CREATE USER` falls back
/// to a deterministic salt derived from the username — acceptable in
/// tests; the server always installs a real RNG, so production paths
/// never take the fallback.
pub type SaltFn = fn() -> [u8; 16];
156
/// v4.5 cooperative cancellation token.
///
/// Long-running SELECT / UPDATE / DELETE loops poll `is_cancelled` at
/// row-loop checkpoints and bail out with `EngineError::Cancelled`.
/// The host (`spg-server`) owns one `AtomicBool` per query, spawns a
/// watchdog thread that sets it after `SPG_QUERY_TIMEOUT_MS`, and hands
/// it to the engine through `execute_with_cancel` /
/// `execute_readonly_with_cancel`.
///
/// `CancelToken::none()` never trips; the legacy `execute` and
/// `execute_readonly` entry points use it so existing callers are
/// unaffected.
#[derive(Clone, Copy, Debug)]
pub struct CancelToken<'a> {
    /// Borrowed cancel flag, or `None` for a token that cannot be tripped.
    flag: Option<&'a core::sync::atomic::AtomicBool>,
}
171
172impl<'a> CancelToken<'a> {
173    #[must_use]
174    pub const fn none() -> Self {
175        Self { flag: None }
176    }
177
178    #[must_use]
179    pub const fn from_flag(f: &'a core::sync::atomic::AtomicBool) -> Self {
180        Self { flag: Some(f) }
181    }
182
183    #[must_use]
184    pub fn is_cancelled(self) -> bool {
185        self.flag
186            .is_some_and(|f| f.load(core::sync::atomic::Ordering::Relaxed))
187    }
188
189    /// Returns `Err(Cancelled)` if the token has been tripped.
190    /// Used at row-loop checkpoints to bail cooperatively without
191    /// scattering raw `is_cancelled` checks across the executor.
192    #[inline]
193    pub fn check(self) -> Result<(), EngineError> {
194        if self.is_cancelled() {
195            Err(EngineError::Cancelled)
196        } else {
197            Ok(())
198        }
199    }
200}
201
// ---- snapshot envelope (v4.1, extended with CRC32 in v4.37,        ----
// ----   publications in v6.1.2 v3, subscriptions in v6.1.4 v4,      ----
// ----   statistics in v6.2.0 v5)                                    ----
204//
205// Wraps a catalog blob + a user blob behind a small header so the
206// server can persist both atomically without inventing a new file.
207// Bare catalog blobs (v3.x) still load via `restore_envelope` since
208// the magic check fails fast and the function falls back to
209// `Catalog::deserialize`.
210//
211// Layout — v1 (v4.1, no CRC):
212//   [8 bytes magic "SPGENV01"]
213//   [u8 version = 1]
214//   [u32 catalog_len][catalog bytes]
215//   [u32 users_len][users bytes]
216//
217// Layout — v2 (v4.37, CRC32 of body):
218//   [8 bytes magic "SPGENV01"]
219//   [u8 version = 2]
220//   [u32 catalog_len][catalog bytes]
221//   [u32 users_len][users bytes]
222//   [u32 crc32]                      ← CRC32 of every byte before it.
223//
224// Layout — v3 (v6.1.2, publications trailer):
225//   [8 bytes magic "SPGENV01"]
226//   [u8 version = 3]
227//   [u32 catalog_len][catalog bytes]
228//   [u32 users_len][users bytes]
229//   [u32 pubs_len][publications bytes]
230//   [u32 crc32]
231//
232// Layout — v4 (v6.1.4, subscriptions trailer):
233//   [8 bytes magic "SPGENV01"]
234//   [u8 version = 4]
235//   [u32 catalog_len][catalog bytes]
236//   [u32 users_len][users bytes]
237//   [u32 pubs_len][publications bytes]
238//   [u32 subs_len][subscriptions bytes]
239//   [u32 crc32]
240//
241// Layout — v5 (v6.2.0, statistics trailer):
242//   [8 bytes magic "SPGENV01"]
243//   [u8 version = 5]
244//   [u32 catalog_len][catalog bytes]
245//   [u32 users_len][users bytes]
246//   [u32 pubs_len][publications bytes]
247//   [u32 subs_len][subscriptions bytes]
248//   [u32 stats_len][statistics bytes]      ← NEW
249//   [u32 crc32]
250//
251// Writers emit v5 from v6.2.0 on. Readers accept all of {v1, v2,
252// v3, v4, v5}: v1/v2 load with empty publications / subscriptions /
253// statistics; v3 loads with empty subscriptions + statistics; v4
254// loads with empty statistics; v5 deserialises all three. Older
255// SPG versions reading a v5 envelope fall through the version
256// match to `EnvelopeParse::Bare` — pre-v6.2.0 binaries cannot
257// open v6.2.0+ snapshots (matches the v6.1.2 / v6.1.4 breaks).
258
/// Eight-byte marker that opens every snapshot envelope.
const ENVELOPE_MAGIC: &[u8; 8] = b"SPGENV01";
/// Version byte: catalog + users, no CRC.
const ENVELOPE_VERSION_V1: u8 = 1;
/// Version byte: v1 layout plus a trailing CRC32.
const ENVELOPE_VERSION_V2: u8 = 2;
/// Version byte: v2 layout plus a publications section.
const ENVELOPE_VERSION_V3: u8 = 3;
/// Version byte: v3 layout plus a subscriptions section.
const ENVELOPE_VERSION_V4: u8 = 4;
/// Version byte: v4 layout plus a statistics section.
const ENVELOPE_VERSION_V5: u8 = 5;
265
266fn build_envelope(
267    catalog: &[u8],
268    users: &[u8],
269    pubs: &[u8],
270    subs: &[u8],
271    stats: &[u8],
272) -> Vec<u8> {
273    let mut out = Vec::with_capacity(
274        8 + 1
275            + 4
276            + catalog.len()
277            + 4
278            + users.len()
279            + 4
280            + pubs.len()
281            + 4
282            + subs.len()
283            + 4
284            + stats.len()
285            + 4,
286    );
287    out.extend_from_slice(ENVELOPE_MAGIC);
288    out.push(ENVELOPE_VERSION_V5);
289    out.extend_from_slice(
290        &u32::try_from(catalog.len())
291            .expect("≤ 4G catalog")
292            .to_le_bytes(),
293    );
294    out.extend_from_slice(catalog);
295    out.extend_from_slice(
296        &u32::try_from(users.len())
297            .expect("≤ 4G users")
298            .to_le_bytes(),
299    );
300    out.extend_from_slice(users);
301    out.extend_from_slice(
302        &u32::try_from(pubs.len())
303            .expect("≤ 4G publications")
304            .to_le_bytes(),
305    );
306    out.extend_from_slice(pubs);
307    out.extend_from_slice(
308        &u32::try_from(subs.len())
309            .expect("≤ 4G subscriptions")
310            .to_le_bytes(),
311    );
312    out.extend_from_slice(subs);
313    out.extend_from_slice(
314        &u32::try_from(stats.len())
315            .expect("≤ 4G statistics")
316            .to_le_bytes(),
317    );
318    out.extend_from_slice(stats);
319    let crc = spg_crypto::crc32::crc32(&out);
320    out.extend_from_slice(&crc.to_le_bytes());
321    out
322}
323
/// What `split_envelope` made of a snapshot buffer.
enum EnvelopeParse<'a> {
    /// The buffer is not a recognisable envelope. The caller falls back
    /// to treating the whole buffer as a bare catalog, which keeps v3.x
    /// snapshots readable.
    Bare,
    /// A well-formed envelope, split into borrowed sections. `catalog`
    /// and `users` are always present. The optional sections are `None`
    /// when the envelope version predates them: `publications` is set
    /// for v3 and later, `subscriptions` for v4 and later, `statistics`
    /// for v5.
    Pair {
        catalog: &'a [u8],
        users: &'a [u8],
        publications: Option<&'a [u8]>,
        subscriptions: Option<&'a [u8]>,
        statistics: Option<&'a [u8]>,
    },
    /// A CRC-carrying envelope (v2 and later) whose stored checksum
    /// (`expected`) differs from the one recomputed over the body
    /// (`computed`).
    CrcMismatch {
        expected: u32,
        computed: u32,
    },
}
344
345/// Returns `EnvelopeParse::Pair` for a valid v1 / v2 / v3 envelope,
346/// `Bare` for a buffer that doesn't look like an envelope (v3.x
347/// bare catalog fallback), and `CrcMismatch` for a v2/v3 envelope
348/// whose trailing CRC32 doesn't match the body.
349fn split_envelope(buf: &[u8]) -> EnvelopeParse<'_> {
350    if buf.len() < 8 + 1 + 4 || &buf[..8] != ENVELOPE_MAGIC {
351        return EnvelopeParse::Bare;
352    }
353    let version = buf[8];
354    if !matches!(
355        version,
356        ENVELOPE_VERSION_V1
357            | ENVELOPE_VERSION_V2
358            | ENVELOPE_VERSION_V3
359            | ENVELOPE_VERSION_V4
360            | ENVELOPE_VERSION_V5
361    ) {
362        return EnvelopeParse::Bare;
363    }
364    let mut p = 9usize;
365    let Some(cat_len_bytes) = buf.get(p..p + 4) else {
366        return EnvelopeParse::Bare;
367    };
368    let Ok(cat_len_arr) = cat_len_bytes.try_into() else {
369        return EnvelopeParse::Bare;
370    };
371    let cat_len = u32::from_le_bytes(cat_len_arr) as usize;
372    p += 4;
373    if p + cat_len + 4 > buf.len() {
374        return EnvelopeParse::Bare;
375    }
376    let catalog = &buf[p..p + cat_len];
377    p += cat_len;
378    let Some(user_len_bytes) = buf.get(p..p + 4) else {
379        return EnvelopeParse::Bare;
380    };
381    let Ok(user_len_arr) = user_len_bytes.try_into() else {
382        return EnvelopeParse::Bare;
383    };
384    let user_len = u32::from_le_bytes(user_len_arr) as usize;
385    p += 4;
386    if p + user_len > buf.len() {
387        return EnvelopeParse::Bare;
388    }
389    let users = &buf[p..p + user_len];
390    p += user_len;
391    let publications = if matches!(
392        version,
393        ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
394    ) {
395        // [u32 pubs_len][publications bytes]
396        let Some(pubs_len_bytes) = buf.get(p..p + 4) else {
397            return EnvelopeParse::Bare;
398        };
399        let Ok(pubs_len_arr) = pubs_len_bytes.try_into() else {
400            return EnvelopeParse::Bare;
401        };
402        let pubs_len = u32::from_le_bytes(pubs_len_arr) as usize;
403        p += 4;
404        if p + pubs_len > buf.len() {
405            return EnvelopeParse::Bare;
406        }
407        let pubs_slice = &buf[p..p + pubs_len];
408        p += pubs_len;
409        Some(pubs_slice)
410    } else {
411        None
412    };
413    let subscriptions = if matches!(version, ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5) {
414        // [u32 subs_len][subscriptions bytes]
415        let Some(subs_len_bytes) = buf.get(p..p + 4) else {
416            return EnvelopeParse::Bare;
417        };
418        let Ok(subs_len_arr) = subs_len_bytes.try_into() else {
419            return EnvelopeParse::Bare;
420        };
421        let subs_len = u32::from_le_bytes(subs_len_arr) as usize;
422        p += 4;
423        if p + subs_len > buf.len() {
424            return EnvelopeParse::Bare;
425        }
426        let subs_slice = &buf[p..p + subs_len];
427        p += subs_len;
428        Some(subs_slice)
429    } else {
430        None
431    };
432    let statistics = if version == ENVELOPE_VERSION_V5 {
433        // [u32 stats_len][statistics bytes]
434        let Some(stats_len_bytes) = buf.get(p..p + 4) else {
435            return EnvelopeParse::Bare;
436        };
437        let Ok(stats_len_arr) = stats_len_bytes.try_into() else {
438            return EnvelopeParse::Bare;
439        };
440        let stats_len = u32::from_le_bytes(stats_len_arr) as usize;
441        p += 4;
442        if p + stats_len > buf.len() {
443            return EnvelopeParse::Bare;
444        }
445        let stats_slice = &buf[p..p + stats_len];
446        p += stats_len;
447        Some(stats_slice)
448    } else {
449        None
450    };
451    if matches!(
452        version,
453        ENVELOPE_VERSION_V2 | ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
454    ) {
455        if p + 4 != buf.len() {
456            return EnvelopeParse::Bare;
457        }
458        let Ok(crc_arr) = buf[p..p + 4].try_into() else {
459            return EnvelopeParse::Bare;
460        };
461        let expected = u32::from_le_bytes(crc_arr);
462        let computed = spg_crypto::crc32::crc32(&buf[..p]);
463        if expected != computed {
464            return EnvelopeParse::CrcMismatch { expected, computed };
465        }
466    } else if p != buf.len() {
467        // v1: must end exactly at the users section.
468        return EnvelopeParse::Bare;
469    }
470    EnvelopeParse::Pair {
471        catalog,
472        users,
473        publications,
474        subscriptions,
475        statistics,
476    }
477}
478
/// v4.41.1 opaque transaction handle.
///
/// `Engine::alloc_tx_id` hands these out and `Engine::execute_in` takes
/// one so dispatch can tell which in-flight transaction a statement
/// belongs to. Every legacy caller — engine self-tests, spg-cli,
/// spg-embedded, startup replay — goes through the unchanged
/// `Engine::execute(sql)` API and thereby uses the reserved
/// `IMPLICIT_TX` slot. v4.41.1 keeps at most one slot active at runtime
/// (dispatch holds `engine.write()` across the wrap, same as v4.34);
/// the map shape exists so v4.42 can enable N in-flight implicit
/// transactions without reshuffling engine internals.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TxId(pub u64);

/// The slot reserved for `Engine::execute(sql)`, i.e. the legacy
/// single-global-shadow path. Handles from `alloc_tx_id` start at 1.
pub const IMPLICIT_TX: TxId = TxId(0);
494
/// v6.7.3 — segment-size threshold that `COMPACT COLD SEGMENTS` applies
/// when the statement supplies no explicit target: 4 MiB. A segment is
/// eligible to merge when its `OwnedSegment::bytes().len()` is
/// **strictly** below this value. spg-server overrides it from
/// `SPG_COMPACTION_TARGET_SEGMENT_BYTES`.
pub const COMPACTION_TARGET_DEFAULT_BYTES: u64 = 4 << 20;
501
502/// Per-slot transaction state. Held inside `tx_catalogs[tx_id]` for the
503/// lifetime of a BEGIN..COMMIT (or BEGIN..ROLLBACK) window. Drops when
504/// the TX commits (its `catalog` is moved over `Engine.catalog`) or
505/// rolls back (slot removed, catalog discarded).
506#[derive(Debug, Default, Clone)]
507struct TxState {
508    /// The TX's shadow copy of the catalog. Started as a clone of
509    /// `Engine.catalog` at BEGIN time; writes flow into it; COMMIT
510    /// installs it over `Engine.catalog`. `Catalog::clone()` is O(1)
511    /// since v4.40 (`PersistentVec` rows + `PersistentBTreeMap` indices).
512    catalog: Catalog,
513    /// Per-TX savepoint stack. Each entry pairs the savepoint name with
514    /// a clone of `catalog` at the moment `SAVEPOINT <name>` fired.
515    /// `ROLLBACK TO <name>` restores from the entry and pops everything
516    /// after it; `RELEASE <name>` discards the entry and everything
517    /// after; COMMIT/ROLLBACK clears the whole stack.
518    savepoints: Vec<(String, Catalog)>,
519}
520
521#[derive(Debug, Default)]
522pub struct Engine {
523    /// Committed catalog — what survives `Engine::snapshot()` and what
524    /// outside-TX `SELECT`s read.
525    catalog: Catalog,
526    /// Active TX slots, keyed by `TxId`. Empty when no TX is in flight.
527    /// v4.41.1 runtime invariant: at most one entry (single-writer
528    /// model unchanged). v4.42 will let dispatch hold multiple entries
529    /// concurrently for group commit + engine MVCC.
530    tx_catalogs: BTreeMap<TxId, TxState>,
531    /// Which slot the next exec_* call should mutate. Set by
532    /// `execute_in(sql, tx_id)` at the entry point; legacy `execute(sql)`
533    /// sets it to `IMPLICIT_TX`. None when no TX is in flight (read /
534    /// write goes straight against `catalog`).
535    current_tx: Option<TxId>,
536    /// Monotonic counter for `alloc_tx_id`. Starts at 1 — slot 0 is
537    /// reserved for `IMPLICIT_TX`.
538    next_tx_id: u64,
539    /// Optional wall clock used to satisfy `NOW()` / `CURRENT_TIMESTAMP`
540    /// / `CURRENT_DATE`. Set by the host environment.
541    clock: Option<ClockFn>,
542    /// v4.1 cryptographic RNG for per-user password salt. Set by the
543    /// host. `None` means SQL-driven `CREATE USER` uses a
544    /// deterministic fallback — see `SaltFn`.
545    salt_fn: Option<SaltFn>,
546    /// v4.2 per-query row cap. `None` = unlimited. When set, a
547    /// SELECT that materialises more than `n` rows returns
548    /// `EngineError::RowLimitExceeded`. Enforced before the result
549    /// is shaped into wire frames so a runaway scan can't blow the
550    /// server's heap.
551    max_query_rows: Option<usize>,
552    /// v4.1 RBAC user table. Empty means "no RBAC configured yet" —
553    /// the server decides what that means at the auth boundary
554    /// (open mode vs legacy single-password mode). User CRUD goes
555    /// through `create_user`/`drop_user`/`verify_user`; persistence
556    /// rides the snapshot envelope alongside the catalog.
557    users: UserStore,
558    /// v6.1.2 logical-replication publication catalog. Empty until
559    /// `CREATE PUBLICATION` runs. Persistence rides the v3 envelope
560    /// trailer (see `build_envelope`).
561    publications: publications::Publications,
562    /// v6.1.4 logical-replication subscription catalog. Empty until
563    /// `CREATE SUBSCRIPTION` runs. Persistence rides the v4 envelope
564    /// trailer.
565    subscriptions: subscriptions::Subscriptions,
566    /// v6.2.0 — per-column statistics for the cost-based optimizer.
567    /// Populated by `ANALYZE`; queried via `spg_statistic` virtual
568    /// table. Persistence rides the v5 envelope trailer.
569    statistics: statistics::Statistics,
570    /// v6.3.0 — engine-level plan cache. Caches the post-`prepare()`
571    /// `Statement` keyed on SQL text. In-memory only — does NOT ride
572    /// the snapshot envelope (rebuilt on demand after restart).
573    plan_cache: plan_cache::PlanCache,
574    /// v6.5.1 — per-distinct-SQL execution stats. In-memory only,
575    /// surfaced via `spg_stat_query` virtual table. Updated by the
576    /// `execute_*` paths after a successful execute.
577    query_stats: query_stats::QueryStats,
578    /// v6.5.2 — connection-state provider callback. spg-server
579    /// registers a function at startup that snapshots its
580    /// per-pgwire-connection registry into `ActivityRow`s; engine
581    /// reads through it on every `SELECT * FROM spg_stat_activity`.
582    /// `None` ⇒ no-data (returns empty rows; matches the no_std
583    /// embedded callers that don't run pgwire).
584    activity_provider: Option<ActivityProvider>,
585    /// v6.5.3 — audit-chain provider + verifier. Same pattern as
586    /// activity_provider: spg-server registers both at startup;
587    /// engine reads through on `SELECT * FROM spg_audit_chain` and
588    /// `SELECT * FROM spg_audit_verify`. `None` ⇒ no-data.
589    audit_chain_provider: Option<AuditChainProvider>,
590    audit_verifier: Option<AuditVerifier>,
591    /// v6.5.6 — slow-query log threshold in microseconds. When set,
592    /// every successful execute whose elapsed exceeds the threshold
593    /// gets fed to the registered slow-query log callback (so
594    /// spg-server can emit a structured log line). Default `None`
595    /// = no slow-query logging.
596    slow_query_threshold_us: Option<u64>,
597    slow_query_logger: Option<SlowQueryLogger>,
598}
599
/// v6.5.6 — signature of the slow-query log callback. It is invoked as
/// `(sql, elapsed_us)`, once for each successful execute that crosses
/// the configured threshold.
pub type SlowQueryLogger = fn(&str, u64);
604
605/// v6.5.4 — synthesise a `CREATE TABLE` statement from catalog
606/// state. Round-trips through `Engine::execute` to recreate the
607/// same schema (sans data + indexes — indexes are emitted as a
608/// separate `CREATE INDEX` chain in `spg_database_ddl`).
609fn render_create_table(name: &str, columns: &[ColumnSchema]) -> String {
610    let mut out = alloc::format!("CREATE TABLE {name} (");
611    for (i, col) in columns.iter().enumerate() {
612        if i > 0 {
613            out.push_str(", ");
614        }
615        out.push_str(&col.name);
616        out.push(' ');
617        out.push_str(&render_data_type(col.ty));
618        if !col.nullable {
619            out.push_str(" NOT NULL");
620        }
621        if col.auto_increment {
622            out.push_str(" AUTO_INCREMENT");
623        }
624    }
625    out.push(')');
626    out
627}
628
629fn render_data_type(ty: DataType) -> String {
630    match ty {
631        DataType::SmallInt => "SMALLINT".into(),
632        DataType::Int => "INT".into(),
633        DataType::BigInt => "BIGINT".into(),
634        DataType::Float => "FLOAT".into(),
635        DataType::Text => "TEXT".into(),
636        DataType::Varchar(n) => alloc::format!("VARCHAR({n})"),
637        DataType::Char(n) => alloc::format!("CHAR({n})"),
638        DataType::Bool => "BOOL".into(),
639        DataType::Vector { dim, encoding } => match encoding {
640            spg_storage::VecEncoding::F32 => alloc::format!("VECTOR({dim})"),
641            spg_storage::VecEncoding::Sq8 => alloc::format!("VECTOR({dim}) USING SQ8"),
642            spg_storage::VecEncoding::F16 => alloc::format!("VECTOR({dim}) USING HALF"),
643        },
644        DataType::Numeric { precision, scale } => {
645            alloc::format!("NUMERIC({precision},{scale})")
646        }
647        DataType::Date => "DATE".into(),
648        DataType::Timestamp => "TIMESTAMP".into(),
649        DataType::Interval => "INTERVAL".into(),
650        DataType::Json => "JSON".into(),
651    }
652}
653
/// v6.5.2 — a single row of `spg_stat_activity`. Public at the engine
/// level so spg-server can build rows without re-exporting internal
/// dispatch types.
#[derive(Clone, Debug)]
pub struct ActivityRow {
    pub pid: u32,
    pub user: String,
    pub started_at_us: i64,
    pub current_sql: String,
    pub wait_event: String,
    pub elapsed_us: i64,
    pub in_transaction: bool,
}

/// v6.5.2 — type of the activity provider callback. Every call returns
/// a fresh snapshot; the engine does not cache the result.
pub type ActivityProvider = fn() -> Vec<ActivityRow>;
671
/// v6.5.3 — a single row of `spg_audit_chain`. Public at the engine
/// level so spg-server can build rows straight from `AuditEntry`.
#[derive(Clone, Debug)]
pub struct AuditRow {
    pub seq: i64,
    pub ts_ms: i64,
    pub prev_hash_hex: String,
    pub entry_hash_hex: String,
    pub sql: String,
}

/// v6.5.3 — callback that snapshots the audit log as chain-table rows.
/// spg-server registers it as a plain fn pointer.
pub type AuditChainProvider = fn() -> Vec<AuditRow>;

/// v6.5.3 — callback that verifies the audit log. It returns
/// `(verified_count, broken_at_seq)`, where `broken_at_seq` is `-1` on
/// a clean chain.
pub type AuditVerifier = fn() -> (i64, i64);
689
690impl Engine {
691    pub fn new() -> Self {
692        Self {
693            catalog: Catalog::new(),
694            tx_catalogs: BTreeMap::new(),
695            current_tx: None,
696            next_tx_id: 1,
697            clock: None,
698            salt_fn: None,
699            max_query_rows: None,
700            users: UserStore::new(),
701            publications: publications::Publications::new(),
702            subscriptions: subscriptions::Subscriptions::new(),
703            statistics: statistics::Statistics::new(),
704            plan_cache: plan_cache::PlanCache::new(),
705            query_stats: query_stats::QueryStats::new(),
706            activity_provider: None,
707            audit_chain_provider: None,
708            audit_verifier: None,
709            slow_query_threshold_us: None,
710            slow_query_logger: None,
711        }
712    }
713
714    /// Construct an engine restored from a previously-snapshotted catalog
715    /// (see `snapshot()`).
716    pub fn restore(catalog: Catalog) -> Self {
717        Self {
718            catalog,
719            tx_catalogs: BTreeMap::new(),
720            current_tx: None,
721            next_tx_id: 1,
722            clock: None,
723            salt_fn: None,
724            max_query_rows: None,
725            users: UserStore::new(),
726            publications: publications::Publications::new(),
727            subscriptions: subscriptions::Subscriptions::new(),
728            statistics: statistics::Statistics::new(),
729            plan_cache: plan_cache::PlanCache::new(),
730            query_stats: query_stats::QueryStats::new(),
731            activity_provider: None,
732            audit_chain_provider: None,
733            audit_verifier: None,
734            slow_query_threshold_us: None,
735            slow_query_logger: None,
736        }
737    }
738
739    /// Restore an engine + user table from a v4.1 envelope produced
740    /// by `snapshot_with_users()`. Falls back to plain catalog-only
741    /// restore if the envelope magic isn't present (so v3.x snapshot
742    /// files still load). v6.1.2 adds the optional publications
743    /// trailer (envelope v3); a v1/v2 envelope deserialises to an
744    /// empty publication table.
745    pub fn restore_envelope(buf: &[u8]) -> Result<Self, EngineError> {
746        match split_envelope(buf) {
747            EnvelopeParse::Pair {
748                catalog: catalog_bytes,
749                users: user_bytes,
750                publications: pub_bytes,
751                subscriptions: sub_bytes,
752                statistics: stats_bytes,
753            } => {
754                let catalog = Catalog::deserialize(catalog_bytes).map_err(EngineError::Storage)?;
755                let users = users::deserialize_users(user_bytes)
756                    .map_err(|e| EngineError::Unsupported(alloc::format!("users restore: {e}")))?;
757                let publications = match pub_bytes {
758                    Some(b) => publications::Publications::deserialize(b).map_err(|e| {
759                        EngineError::Unsupported(alloc::format!("publications restore: {e:?}"))
760                    })?,
761                    None => publications::Publications::new(),
762                };
763                let subscriptions = match sub_bytes {
764                    Some(b) => subscriptions::Subscriptions::deserialize(b).map_err(|e| {
765                        EngineError::Unsupported(alloc::format!("subscriptions restore: {e:?}"))
766                    })?,
767                    None => subscriptions::Subscriptions::new(),
768                };
769                let statistics = match stats_bytes {
770                    Some(b) => statistics::Statistics::deserialize(b).map_err(|e| {
771                        EngineError::Unsupported(alloc::format!("statistics restore: {e:?}"))
772                    })?,
773                    None => statistics::Statistics::new(),
774                };
775                Ok(Self {
776                    catalog,
777                    tx_catalogs: BTreeMap::new(),
778                    current_tx: None,
779                    next_tx_id: 1,
780                    clock: None,
781                    salt_fn: None,
782                    max_query_rows: None,
783                    users,
784                    publications,
785                    subscriptions,
786                    statistics,
787                    plan_cache: plan_cache::PlanCache::new(),
788                    query_stats: query_stats::QueryStats::new(),
789                    activity_provider: None,
790                    audit_chain_provider: None,
791                    audit_verifier: None,
792                    slow_query_threshold_us: None,
793                    slow_query_logger: None,
794                })
795            }
796            EnvelopeParse::CrcMismatch { expected, computed } => {
797                Err(EngineError::Storage(StorageError::Corrupt(alloc::format!(
798                    "snapshot envelope CRC32 mismatch (expected={expected:#010x}, computed={computed:#010x})"
799                ))))
800            }
801            EnvelopeParse::Bare => {
802                let catalog = Catalog::deserialize(buf).map_err(EngineError::Storage)?;
803                Ok(Self::restore(catalog))
804            }
805        }
806    }
807
808    pub const fn users(&self) -> &UserStore {
809        &self.users
810    }
811
812    /// `salt` is supplied by the caller (the host has a random
813    /// source; the engine is `no_std`). Caller should pass a fresh
814    /// 16-byte random value per user.
815    pub fn create_user(
816        &mut self,
817        name: &str,
818        password: &str,
819        role: Role,
820        salt: [u8; 16],
821    ) -> Result<(), UserError> {
822        self.users.create(name, password, role, salt)?;
823        // v4.8: also derive SCRAM-SHA-256 secrets so PG-wire SASL
824        // auth can verify without re-running PBKDF2 per attempt.
825        // Uses a fresh salt from the host RNG (falls back to a
826        // deterministic per-username salt when no RNG is wired, same
827        // as the legacy hash path).
828        let scram_salt = self.salt_fn.map_or_else(
829            || {
830                let mut s = [0u8; users::SCRAM_SALT_LEN];
831                let digest = spg_crypto::hash(name.as_bytes());
832                // Use bytes 16..32 of BLAKE3 so we don't reuse the
833                // exact same fallback salt as the BLAKE3 hash path.
834                s.copy_from_slice(&digest[16..32]);
835                s
836            },
837            |f| f(),
838        );
839        self.users
840            .enable_scram(name, password, scram_salt, users::SCRAM_DEFAULT_ITERS)?;
841        Ok(())
842    }
843
844    pub fn drop_user(&mut self, name: &str) -> Result<(), UserError> {
845        self.users.drop(name)
846    }
847
848    pub fn verify_user(&self, name: &str, password: &str) -> Option<Role> {
849        self.users.verify(name, password)
850    }
851
852    /// Builder: attach a wall clock so `NOW()` / `CURRENT_TIMESTAMP` /
853    /// `CURRENT_DATE` evaluate to a real value instead of erroring out.
854    #[must_use]
855    pub const fn with_clock(mut self, clock: ClockFn) -> Self {
856        self.clock = Some(clock);
857        self
858    }
859
860    /// Builder: attach an OS-backed RNG for per-user password salts.
861    /// The host (`spg-server`) typically wires this to `/dev/urandom`.
862    #[must_use]
863    pub const fn with_salt_fn(mut self, f: SaltFn) -> Self {
864        self.salt_fn = Some(f);
865        self
866    }
867
868    /// Builder: cap the number of rows a single SELECT may return.
869    /// Exceeding the cap raises `EngineError::RowLimitExceeded` —
870    /// the bound is checked inside the executor so a runaway
871    /// catalog scan can't allocate millions of rows before the
872    /// server gets a chance to reject the result.
873    #[must_use]
874    pub const fn with_max_query_rows(mut self, n: usize) -> Self {
875        self.max_query_rows = Some(n);
876        self
877    }
878
879    /// The *committed* catalog. Note: during a transaction this returns the
880    /// pre-TX state — `SELECT` inside a TX goes through `execute()` and reads
881    /// the shadow. Tests that inspect outside-TX state should use this.
882    pub const fn catalog(&self) -> &Catalog {
883        &self.catalog
884    }
885
886    /// Serialize the *committed* catalog to bytes. v0.6 was full-snapshot; v0.9
887    /// adds the rule that an open TX's shadow is never snapshotted — only the
888    /// post-COMMIT state is persisted. v4.1 wraps the catalog in an envelope
889    /// when there are users to persist; an empty user table snapshots as the
890    /// bare catalog format (backwards-compat with v3.x readers). v6.1.2
891    /// adds publications to the envelope condition: either non-empty
892    /// users OR non-empty publications now triggers the envelope path.
893    pub fn snapshot(&self) -> Vec<u8> {
894        if self.users.is_empty()
895            && self.publications.is_empty()
896            && self.subscriptions.is_empty()
897            && self.statistics.is_empty()
898        {
899            self.catalog.serialize()
900        } else {
901            build_envelope(
902                &self.catalog.serialize(),
903                &users::serialize_users(&self.users),
904                &self.publications.serialize(),
905                &self.subscriptions.serialize(),
906                &self.statistics.serialize(),
907            )
908        }
909    }
910
911    /// True when at least one TX slot is in flight. v4.41.1 runtime
912    /// invariant: at most one slot active at a time (dispatch holds
913    /// `engine.write()` across the entire wrap). v4.42 will let this
914    /// return true with multiple slots concurrently.
915    pub fn in_transaction(&self) -> bool {
916        !self.tx_catalogs.is_empty()
917    }
918
919    /// v4.41.1 allocate a fresh TX handle. Used by spg-server dispatch
920    /// to scope each implicit-wrap BEGIN..stmt..COMMIT to its own slot
921    /// in `tx_catalogs`. v4.42 — the commit-barrier leader allocates
922    /// one of these per task in its group, runs `BEGIN`+sql+`COMMIT`
923    /// sequentially under a single `engine.write()` so each task's
924    /// mutations accumulate into shared state, then either keeps the
925    /// accumulated state (fsync OK) or restores the pre-image via
926    /// `replace_catalog` (fsync err).
927    pub fn alloc_tx_id(&mut self) -> TxId {
928        let id = TxId(self.next_tx_id);
929        self.next_tx_id = self.next_tx_id.saturating_add(1);
930        id
931    }
932
933    /// v4.42 — atomically replace the live catalog. Used by the
934    /// commit-barrier leader to roll back a group whose batched
935    /// fsync failed: the leader snapshots `engine.catalog().clone()`
936    /// (O(1) Arc bump after the v4.39/v4.40 persistent migration)
937    /// at group start, sequentially applies each task's BEGIN+sql+
938    /// COMMIT under the same write lock to accumulate mutations
939    /// into shared state, batches the WAL bytes, fsyncs once, and
940    /// on failure calls this with the pre-image to undo every
941    /// task in the group at once.
942    ///
943    /// **Does NOT touch `tx_catalogs` / `current_tx`.** Any
944    /// explicit-TX slot from a concurrent client (created via the
945    /// legacy `IMPLICIT_TX`-less dispatch path or via the future
946    /// MVCC-readers v5+ work) has its own snapshot baked into the
947    /// slot — restoring `self.catalog` to the pre-image leaves
948    /// those slots untouched, exactly as they were when the leader
949    /// took the lock. The leader's own implicit-TX slots are all
950    /// already discarded (`exec_commit` removed them as each
951    /// task's COMMIT ran) by the time this is reached.
952    pub fn replace_catalog(&mut self, catalog: Catalog) {
953        self.catalog = catalog;
954    }
955
956    /// v6.7.0 — public shim around `Catalog::freeze_oldest_to_cold`
957    /// so tests + the spg-server freezer can drive a freeze without
958    /// reaching into the private `active_catalog_mut`. v6.7.4
959    /// parallel freezer will build on this surface.
960    ///
961    /// Marks the table's cached `cold_row_count` stale because the
962    /// freeze added cold locators that ANALYZE hasn't yet refreshed.
963    pub fn freeze_oldest_to_cold(
964        &mut self,
965        table_name: &str,
966        index_name: &str,
967        max_rows: usize,
968    ) -> Result<spg_storage::FreezeReport, EngineError> {
969        let report = self
970            .active_catalog_mut()
971            .freeze_oldest_to_cold(table_name, index_name, max_rows)
972            .map_err(EngineError::Storage)?;
973        if let Some(t) = self.active_catalog_mut().get_mut(table_name) {
974            t.mark_cold_row_count_stale();
975        }
976        Ok(report)
977    }
978
979    /// v6.7.5 — public shim used by the spg-server follower's
980    /// segment-forwarding receiver. Registers a cold-tier segment
981    /// at a specific id (the master's id, as transmitted on the
982    /// wire) so the follower's BTree-Cold locators stay byte-
983    /// identical with the master's. Wraps
984    /// `Catalog::load_segment_bytes_at` under the standard
985    /// clone-mutate-replace pattern.
986    ///
987    /// Returns `Ok(())` on success **and** on the "slot already
988    /// occupied" case — a follower mid-reconnect may receive a
989    /// segment chunk for a segment_id it already has on disk
990    /// (forwarded last session); the caller should treat that
991    /// path as a no-op rather than a fatal error.
992    pub fn receive_cold_segment(
993        &mut self,
994        segment_id: u32,
995        bytes: Vec<u8>,
996    ) -> Result<(), EngineError> {
997        let mut new_cat = self.catalog.clone();
998        match new_cat.load_segment_bytes_at(segment_id, bytes) {
999            Ok(()) => {
1000                self.replace_catalog(new_cat);
1001                Ok(())
1002            }
1003            Err(StorageError::Corrupt(msg)) if msg.contains("already occupied") => Ok(()),
1004            Err(e) => Err(EngineError::Storage(e)),
1005        }
1006    }
1007
1008    /// v6.7.3 — public shim around `Catalog::compact_cold_segments`
1009    /// driving every BTree index on every user table. Returns one
1010    /// `(table, index, report)` triple for each merge that
1011    /// actually happened (no-op (table, index) pairs are filtered
1012    /// out so callers can size persist-side work to the live
1013    /// merges). Caller is responsible for persisting each
1014    /// `report.merged_segment_bytes` and updating the on-disk
1015    /// segment registry; engine layer is no_std and never
1016    /// touches disk.
1017    ///
1018    /// Marks every touched table's cached `cold_row_count` stale
1019    /// — compaction GC'd some shadowed rows, so the count must be
1020    /// re-derived on the next ANALYZE.
1021    pub fn compact_cold_segments_with_target(
1022        &mut self,
1023        target_segment_bytes: u64,
1024    ) -> Result<Vec<(String, String, CompactReport)>, EngineError> {
1025        let table_names = self.active_catalog().table_names();
1026        let mut reports: Vec<(String, String, CompactReport)> = Vec::new();
1027        for tname in table_names {
1028            if is_internal_table_name(&tname) {
1029                continue;
1030            }
1031            let idx_names: Vec<String> = {
1032                let Some(t) = self.active_catalog().get(&tname) else {
1033                    continue;
1034                };
1035                t.indices()
1036                    .iter()
1037                    .filter(|i| matches!(i.kind, IndexKind::BTree(_)))
1038                    .map(|i| i.name.clone())
1039                    .collect()
1040            };
1041            for iname in idx_names {
1042                let report = self
1043                    .active_catalog_mut()
1044                    .compact_cold_segments(&tname, &iname, target_segment_bytes)
1045                    .map_err(EngineError::Storage)?;
1046                if report.merged_segment_id.is_some() {
1047                    if let Some(t) = self.active_catalog_mut().get_mut(&tname) {
1048                        t.mark_cold_row_count_stale();
1049                    }
1050                    reports.push((tname.clone(), iname, report));
1051                }
1052            }
1053        }
1054        Ok(reports)
1055    }
1056
1057    fn active_catalog(&self) -> &Catalog {
1058        match self.current_tx {
1059            Some(t) => self
1060                .tx_catalogs
1061                .get(&t)
1062                .map_or(&self.catalog, |s| &s.catalog),
1063            None => &self.catalog,
1064        }
1065    }
1066
1067    fn active_catalog_mut(&mut self) -> &mut Catalog {
1068        let tx = self.current_tx;
1069        match tx {
1070            Some(t) => match self.tx_catalogs.get_mut(&t) {
1071                Some(s) => &mut s.catalog,
1072                None => &mut self.catalog,
1073            },
1074            None => &mut self.catalog,
1075        }
1076    }
1077
1078    /// Read-only execute path. Succeeds for `SELECT` / `SHOW TABLES`
1079    /// / `SHOW COLUMNS`; returns `EngineError::WriteRequired` for
1080    /// every other statement, so the caller can fall through to the
1081    /// `&mut self` `execute` path under a write lock. Engine state is
1082    /// not mutated even on the success path (`rewrite_clock_calls`
1083    /// and `resolve_order_by_position` both mutate the locally-owned
1084    /// AST, not `self`).
1085    ///
1086    /// **v4.0 concurrency**: this is the entry point the server takes
1087    /// under an `RwLock::read()` so multiple `SELECT` clients run in
1088    /// parallel without serialising on a single mutex.
1089    pub fn execute_readonly(&self, sql: &str) -> Result<QueryResult, EngineError> {
1090        self.execute_readonly_with_cancel(sql, CancelToken::none())
1091    }
1092
1093    /// v4.5 — read path with cooperative cancellation. Token's
1094    /// `is_cancelled` is checked at the start (so a watchdog that
1095    /// already fired returns Cancelled immediately) and at row-loop
1096    /// checkpoints inside `exec_select`. SHOW paths are O(small) and
1097    /// don't bother checking.
1098    pub fn execute_readonly_with_cancel(
1099        &self,
1100        sql: &str,
1101        cancel: CancelToken<'_>,
1102    ) -> Result<QueryResult, EngineError> {
1103        cancel.check()?;
1104        let mut stmt = parser::parse_statement(sql)?;
1105        let now_micros = self.clock.map(|f| f());
1106        rewrite_clock_calls(&mut stmt, now_micros);
1107        if let Statement::Select(s) = &mut stmt {
1108            resolve_order_by_position(s);
1109            // v6.2.3 — cost-based JOIN reorder (read path).
1110            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1111        }
1112        let result = match stmt {
1113            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1114            Statement::ShowTables => Ok(self.exec_show_tables()),
1115            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1116            Statement::ShowUsers => Ok(self.exec_show_users()),
1117            Statement::ShowPublications => Ok(self.exec_show_publications()),
1118            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1119            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1120                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1121            )),
1122            Statement::Explain(e) => self.exec_explain(&e, cancel),
1123            _ => Err(EngineError::WriteRequired),
1124        };
1125        self.enforce_row_limit(result)
1126    }
1127
1128    /// v4.2: cap result-set size. Applied after the executor
1129    /// materialises rows but before they leave the engine — wrapping
1130    /// every Rows-returning exec_* function would scatter the check.
1131    fn enforce_row_limit(
1132        &self,
1133        result: Result<QueryResult, EngineError>,
1134    ) -> Result<QueryResult, EngineError> {
1135        if let (Ok(QueryResult::Rows { rows, .. }), Some(cap)) = (&result, self.max_query_rows)
1136            && rows.len() > cap
1137        {
1138            return Err(EngineError::RowLimitExceeded(cap));
1139        }
1140        result
1141    }
1142
1143    pub fn execute(&mut self, sql: &str) -> Result<QueryResult, EngineError> {
1144        self.execute_in_with_cancel(sql, IMPLICIT_TX, CancelToken::none())
1145    }
1146
1147    /// v4.5 — write path with cooperative cancellation. Same dispatch
1148    /// as `execute_in_with_cancel(sql, IMPLICIT_TX, cancel)`. Kept as
1149    /// a separate entry point for backward-compat with the v4.5
1150    /// public API.
1151    pub fn execute_with_cancel(
1152        &mut self,
1153        sql: &str,
1154        cancel: CancelToken<'_>,
1155    ) -> Result<QueryResult, EngineError> {
1156        self.execute_in_with_cancel(sql, IMPLICIT_TX, cancel)
1157    }
1158
1159    /// v4.41.1 multi-slot write entry. Routes `sql` through the TX
1160    /// slot identified by `tx_id` so spg-server dispatch can scope
1161    /// each implicit-wrap BEGIN..stmt..COMMIT to its own slot in
1162    /// `tx_catalogs`. `IMPLICIT_TX` is the legacy single-slot path
1163    /// every other caller (engine self-tests, replay, spg-embedded)
1164    /// implicitly takes via `execute()` / `execute_with_cancel()`.
1165    pub fn execute_in(&mut self, sql: &str, tx_id: TxId) -> Result<QueryResult, EngineError> {
1166        self.execute_in_with_cancel(sql, tx_id, CancelToken::none())
1167    }
1168
1169    /// v4.41.1 write path with cooperative cancellation + explicit TX
1170    /// scope. Sets `self.current_tx` for the duration of the call so
1171    /// every `exec_*` helper transparently sees its TX's shadow
1172    /// catalog and savepoint stack; restores on exit so the field is
1173    /// only valid mid-call (no leakage across calls).
1174    pub fn execute_in_with_cancel(
1175        &mut self,
1176        sql: &str,
1177        tx_id: TxId,
1178        cancel: CancelToken<'_>,
1179    ) -> Result<QueryResult, EngineError> {
1180        let saved = self.current_tx;
1181        self.current_tx = Some(tx_id);
1182        let result = self.execute_inner_with_cancel(sql, cancel);
1183        self.current_tx = saved;
1184        result
1185    }
1186
1187    /// v6.1.1 — parse and pre-process a SQL string ONCE so the
1188    /// resulting [`Statement`] can be cached and re-executed via
1189    /// [`Engine::execute_prepared`]. Returns the same `Statement`
1190    /// the simple-query path would synthesise internally (clock
1191    /// rewrites + ORDER BY position-ref resolution applied at
1192    /// prepare time, since both are session-independent). The
1193    /// `$N` placeholders in the SQL stay as `Expr::Placeholder(n)`
1194    /// nodes; they're resolved to concrete values per-call by
1195    /// `execute_prepared`'s substitution walk.
1196    ///
1197    /// Pgwire's `Parse` (P) message lands here.
1198    pub fn prepare(&self, sql: &str) -> Result<Statement, ParseError> {
1199        let mut stmt = parser::parse_statement(sql)?;
1200        let now_micros = self.clock.map(|f| f());
1201        rewrite_clock_calls(&mut stmt, now_micros);
1202        if let Statement::Select(s) = &mut stmt {
1203            // v6.4.1 — expand `GROUP BY ALL` to every non-aggregate
1204            // SELECT-list item BEFORE position / alias resolution so
1205            // downstream passes see the explicit list.
1206            expand_group_by_all(s);
1207            resolve_order_by_position(s);
1208            // v6.2.3 — cost-based JOIN reorder. No-op for
1209            // single-table FROMs or any non-INNER join shape.
1210            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1211        }
1212        Ok(stmt)
1213    }
1214
1215    /// v6.3.0 — cached prepare. Returns a cloned `Statement` from
1216    /// the plan cache on hit, runs the full `prepare()` path on miss
1217    /// and inserts the resulting plan before returning. Skipping the
1218    /// parse + JOIN-reorder pipeline on hit is the dominant win for
1219    /// JDBC / sqlx / pgx clients that reuse the same SQL string.
1220    ///
1221    /// Returns a cloned `Statement` (not a borrow) because the
1222    /// pgwire layer owns its `PreparedStmt` map per-session and the
1223    /// engine-level cache must stay available for other sessions.
1224    /// Clone cost on a 5-table JOIN AST is well under the parse cost
1225    /// it replaces.
1226    pub fn prepare_cached(&mut self, sql: &str) -> Result<Statement, ParseError> {
1227        // v6.3.1 — version-aware lookup. If the cached plan was
1228        // prepared before the most recent ANALYZE, evict and replan.
1229        let current_version = self.statistics.version();
1230        if let Some(plan) = self.plan_cache.get(sql) {
1231            if plan.statistics_version == current_version {
1232                return Ok(plan.stmt.clone());
1233            }
1234            // Stale entry — fall through to evict + re-prepare.
1235        }
1236        self.plan_cache.evict(sql);
1237        let stmt = self.prepare(sql)?;
1238        let source_tables = plan_cache::collect_source_tables(&stmt);
1239        let plan = plan_cache::PreparedPlan {
1240            stmt: stmt.clone(),
1241            statistics_version: current_version,
1242            source_tables,
1243            describe_columns: alloc::vec::Vec::new(),
1244        };
1245        self.plan_cache.insert(String::from(sql), plan);
1246        Ok(stmt)
1247    }
1248
1249    /// v6.3.0 — read-only accessor for tests and v6.3.1 invalidation.
1250    pub fn plan_cache(&self) -> &plan_cache::PlanCache {
1251        &self.plan_cache
1252    }
1253
1254    /// v6.3.0 — mutable accessor for v6.3.1 invalidation hooks.
1255    pub fn plan_cache_mut(&mut self) -> &mut plan_cache::PlanCache {
1256        &mut self.plan_cache
1257    }
1258
1259    /// v6.3.3 — Describe a prepared `Statement` without executing.
1260    /// Returns `(parameter_oids, output_columns)`. Empty
1261    /// `output_columns` means the statement has no row-producing
1262    /// shape we could resolve here (JOIN, subquery, non-SELECT, …)
1263    /// — pgwire layer maps that to a `NoData` reply.
1264    pub fn describe_prepared(
1265        &self,
1266        stmt: &Statement,
1267    ) -> (Vec<u32>, Vec<ColumnSchema>) {
1268        describe::describe_prepared(stmt, self.active_catalog())
1269    }
1270
1271    /// v6.1.1 — execute a [`Statement`] previously returned by
1272    /// [`Engine::prepare`], substituting `Expr::Placeholder(n)`
1273    /// nodes for the corresponding [`Value`] in `params` (1-based
1274    /// per PG: `$1` → `params[0]`). Bind-time string parameters
1275    /// are decoded into typed `Value`s by the pgwire layer before
1276    /// this call so the resulting AST hits the same execution
1277    /// path as a simple query — no SQL re-parse.
1278    ///
1279    /// Pgwire's `Execute` (E) message after a `Bind` (B) lands here.
1280    pub fn execute_prepared(
1281        &mut self,
1282        mut stmt: Statement,
1283        params: &[Value],
1284    ) -> Result<QueryResult, EngineError> {
1285        substitute_placeholders(&mut stmt, params)?;
1286        self.execute_stmt_with_cancel(stmt, CancelToken::none())
1287    }
1288
1289    fn execute_inner_with_cancel(
1290        &mut self,
1291        sql: &str,
1292        cancel: CancelToken<'_>,
1293    ) -> Result<QueryResult, EngineError> {
1294        cancel.check()?;
1295        let stmt = self.prepare(sql)?;
1296        // v6.5.1 — wrap the executor with a wall-clock window so we
1297        // can record into spg_stat_query. Skip when the engine has
1298        // no clock attached (no_std embedded callers).
1299        let start_us = self.clock.map(|f| f());
1300        let result = self.execute_stmt_with_cancel(stmt, cancel);
1301        if let (Some(t0), Ok(_)) = (start_us, &result) {
1302            let now = self.clock.map_or(t0, |f| f());
1303            let elapsed = now.saturating_sub(t0).max(0) as u64;
1304            self.query_stats.record(sql, elapsed, now as u64);
1305            // v6.5.6 — slow-query log: fire callback when elapsed
1306            // exceeds the configured floor.
1307            if let (Some(threshold), Some(logger)) =
1308                (self.slow_query_threshold_us, self.slow_query_logger)
1309                && elapsed >= threshold
1310            {
1311                logger(sql, elapsed);
1312            }
1313        }
1314        result
1315    }
1316
1317    fn execute_stmt_with_cancel(
1318        &mut self,
1319        stmt: Statement,
1320        cancel: CancelToken<'_>,
1321    ) -> Result<QueryResult, EngineError> {
1322        cancel.check()?;
1323        let result = match stmt {
1324            Statement::CreateTable(s) => self.exec_create_table(s),
1325            Statement::CreateIndex(s) => self.exec_create_index(s),
1326            Statement::Insert(s) => self.exec_insert(s),
1327            Statement::Update(s) => self.exec_update_cancel(&s, cancel),
1328            Statement::Delete(s) => self.exec_delete_cancel(&s, cancel),
1329            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1330            Statement::Begin => self.exec_begin(),
1331            Statement::Commit => self.exec_commit(),
1332            Statement::Rollback => self.exec_rollback(),
1333            Statement::Savepoint(name) => self.exec_savepoint(name),
1334            Statement::RollbackToSavepoint(name) => self.exec_rollback_to_savepoint(&name),
1335            Statement::ReleaseSavepoint(name) => self.exec_release_savepoint(&name),
1336            Statement::ShowTables => Ok(self.exec_show_tables()),
1337            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1338            Statement::ShowUsers => Ok(self.exec_show_users()),
1339            Statement::ShowPublications => Ok(self.exec_show_publications()),
1340            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1341            Statement::CreateUser(s) => self.exec_create_user(&s),
1342            Statement::DropUser(name) => self.exec_drop_user(&name),
1343            Statement::Explain(e) => self.exec_explain(&e, cancel),
1344            Statement::AlterIndex(s) => self.exec_alter_index(s),
1345            Statement::AlterTable(s) => self.exec_alter_table(s),
1346            Statement::CreatePublication(s) => self.exec_create_publication(s),
1347            Statement::DropPublication(name) => self.exec_drop_publication(&name),
1348            Statement::CreateSubscription(s) => self.exec_create_subscription(s),
1349            Statement::DropSubscription(name) => self.exec_drop_subscription(&name),
1350            // v6.1.7 — WAIT FOR WAL POSITION needs `lag_state`,
1351            // which lives in spg-server's ServerState. The engine
1352            // surfaces a clear error; the server-layer dispatch
1353            // intercepts the SQL before it reaches the engine on
1354            // a server build, so this arm only fires for
1355            // engine-only callers (spg-embedded, lib tests).
1356            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1357                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1358            )),
1359            // v6.2.0 — ANALYZE recomputes per-column histograms.
1360            Statement::Analyze(target) => self.exec_analyze(target.as_deref()),
1361            // v6.7.3 — COMPACT COLD SEGMENTS.
1362            Statement::CompactColdSegments => self.exec_compact_cold_segments(),
1363        };
1364        self.enforce_row_limit(result)
1365    }
1366
1367    /// v6.1.2 — `CREATE PUBLICATION` runtime path. Duplicate names
1368    /// surface as `EngineError::Unsupported` so the existing PG-wire
1369    /// error mapping stays uniform; the message carries the name so
1370    /// operators can grep replication-log noise. Inside-transaction
1371    /// invocation is rejected (matches `CREATE USER` / `DROP USER`
1372    /// stance) — replication-catalog mutation is a connection-level
1373    /// administrative op, not a transactional one.
1374    fn exec_create_publication(
1375        &mut self,
1376        s: CreatePublicationStatement,
1377    ) -> Result<QueryResult, EngineError> {
1378        // v6.1.4 — the v6.1.2 "no DDL inside a transaction" guard
1379        // was over-cautious: it also blocked the auto-commit wrap
1380        // path (which begins an internal TX around every WAL-
1381        // logged statement). PG itself allows CREATE PUBLICATION
1382        // inside a transaction (it rolls back with the TX).
1383        self.publications
1384            .create(s.name, s.scope)
1385            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE PUBLICATION: {e:?}")))?;
1386        Ok(QueryResult::CommandOk {
1387            affected: 1,
1388            modified_catalog: true,
1389        })
1390    }
1391
1392    /// v6.1.2 — `DROP PUBLICATION` runtime path. PG-compatible silent
1393    /// no-op when the publication doesn't exist (returns `affected=0`
1394    /// in that case so the wire-level command tag distinguishes
1395    /// "dropped" from "no-op", though both succeed).
1396    fn exec_drop_publication(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1397        let removed = self.publications.drop(name);
1398        Ok(QueryResult::CommandOk {
1399            affected: usize::from(removed),
1400            modified_catalog: removed,
1401        })
1402    }
1403
1404    /// v6.1.2 — read access to the publication catalog. Used by
1405    /// the v6.1.5 publisher-side WAL filter, by `SHOW PUBLICATIONS`
1406    /// (v6.1.3+), and by e2e tests that need to assert state without
1407    /// going through the wire.
1408    pub const fn publications(&self) -> &publications::Publications {
1409        &self.publications
1410    }
1411
1412    /// v6.1.4 — `CREATE SUBSCRIPTION` runtime path. Defaults
1413    /// `enabled = true` and `last_received_pos = 0` for a freshly-
1414    /// created subscription. The actual worker thread is spawned
1415    /// by spg-server once the engine returns success.
1416    fn exec_create_subscription(
1417        &mut self,
1418        s: CreateSubscriptionStatement,
1419    ) -> Result<QueryResult, EngineError> {
1420        // See exec_create_publication — the in_transaction gate
1421        // was over-cautious; the auto-commit wrap path holds an
1422        // internal TX that this check was incorrectly blocking.
1423        let sub = subscriptions::Subscription {
1424            conn_str: s.conn_str,
1425            publications: s.publications,
1426            enabled: true,
1427            last_received_pos: 0,
1428        };
1429        self.subscriptions
1430            .create(s.name, sub)
1431            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE SUBSCRIPTION: {e:?}")))?;
1432        Ok(QueryResult::CommandOk {
1433            affected: 1,
1434            modified_catalog: true,
1435        })
1436    }
1437
1438    /// v6.1.4 — `DROP SUBSCRIPTION`. Silent no-op when the name
1439    /// doesn't exist (PG-compatible). The associated worker is
1440    /// torn down by spg-server when it observes the catalog
1441    /// change at the next snapshot or via the engine's
1442    /// subscriptions accessor (the worker polls the catalog on
1443    /// reconnect; v6.1.5's filter-side will tighten this to an
1444    /// explicit signal).
1445    fn exec_drop_subscription(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1446        let removed = self.subscriptions.drop(name);
1447        Ok(QueryResult::CommandOk {
1448            affected: usize::from(removed),
1449            modified_catalog: removed,
1450        })
1451    }
1452
    /// v6.1.4 — read access to the subscription catalog.
    ///
    /// Returns a shared borrow of the engine's `subscriptions` field;
    /// nothing is cloned. Used by the subscription worker (to read its
    /// own row: publications + last applied position), by
    /// SHOW SUBSCRIPTIONS, and by e2e tests asserting state directly.
    pub const fn subscriptions(&self) -> &subscriptions::Subscriptions {
        &self.subscriptions
    }
1460
    /// v6.1.4 — write access to a subscription's `last_received_pos`.
    ///
    /// Forwards `name` and `pos` to
    /// `Subscriptions::update_last_received_pos` and returns its
    /// result unchanged. The worker calls this after each apply batch
    /// (under the engine's write-lock). Per the v6.1.4 contract the
    /// result is `false` when the subscription was dropped between
    /// the worker receiving the record and this call landing.
    pub fn subscription_advance(&mut self, name: &str, pos: u64) -> bool {
        self.subscriptions.update_last_received_pos(name, pos)
    }
1469
1470    /// v6.1.4 — `SHOW SUBSCRIPTIONS` row materialisation. Returns
1471    /// `(name, conn_str, publications, enabled, last_received_pos)`
1472    /// ordered by subscription name. The `publications` column is
1473    /// the comma-joined list ("p1, p2") for ergonomic SHOW output;
1474    /// callers wanting structured access read `Engine::subscriptions`.
1475    fn exec_show_subscriptions(&self) -> QueryResult {
1476        let columns = alloc::vec![
1477            ColumnSchema::new("name", DataType::Text, false),
1478            ColumnSchema::new("conn_str", DataType::Text, false),
1479            ColumnSchema::new("publications", DataType::Text, false),
1480            ColumnSchema::new("enabled", DataType::Bool, false),
1481            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1482        ];
1483        let rows: Vec<Row> = self
1484            .subscriptions
1485            .iter()
1486            .map(|(name, sub)| {
1487                Row::new(alloc::vec![
1488                    Value::Text(name.clone()),
1489                    Value::Text(sub.conn_str.clone()),
1490                    Value::Text(sub.publications.join(", ")),
1491                    Value::Bool(sub.enabled),
1492                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1493                ])
1494            })
1495            .collect();
1496        QueryResult::Rows { columns, rows }
1497    }
1498
1499    /// v6.2.0 — materialise `spg_statistic` rows. One row per
1500    /// `(table, column)` pair tracked in `Statistics`, with
1501    /// `histogram_bounds` rendered as a `[v0, v1, ...]` string —
1502    /// the same canonical form vector literals use for round-trip.
1503    fn exec_spg_statistic(&self) -> QueryResult {
1504        let columns = alloc::vec![
1505            ColumnSchema::new("table_name", DataType::Text, false),
1506            ColumnSchema::new("column_name", DataType::Text, false),
1507            ColumnSchema::new("null_frac", DataType::Float, false),
1508            ColumnSchema::new("n_distinct", DataType::BigInt, false),
1509            ColumnSchema::new("histogram_bounds", DataType::Text, false),
1510            // v6.7.0 — appended column (v6.2.0 stability contract
1511            // allows APPEND to spg_statistic, not reorder/rename).
1512            // Reports the cached per-table cold-row count; same
1513            // value across every column row of the same table.
1514            ColumnSchema::new("cold_row_count", DataType::BigInt, false),
1515        ];
1516        let rows: Vec<Row> = self
1517            .statistics
1518            .iter()
1519            .map(|((t, c), s)| {
1520                let cold = self
1521                    .catalog
1522                    .get(t)
1523                    .map_or(0, |table| table.cold_row_count());
1524                Row::new(alloc::vec![
1525                    Value::Text(t.clone()),
1526                    Value::Text(c.clone()),
1527                    Value::Float(f64::from(s.null_frac)),
1528                    Value::BigInt(i64::try_from(s.n_distinct).unwrap_or(i64::MAX)),
1529                    Value::Text(render_histogram_bounds(&s.histogram_bounds)),
1530                    Value::BigInt(i64::try_from(cold).unwrap_or(i64::MAX)),
1531                ])
1532            })
1533            .collect();
1534        QueryResult::Rows { columns, rows }
1535    }
1536
1537    /// v6.5.0 — materialise `spg_stat_replication` rows. One row
1538    /// per subscription with `(name, conn_str, publications,
1539    /// last_received_pos, enabled)`. Surface mirrors
1540    /// `SHOW SUBSCRIPTIONS` but follows the virtual-table dispatch
1541    /// shape so it composes with SELECT clauses (WHERE, projection
1542    /// onto specific columns, etc).
1543    fn exec_spg_stat_replication(&self) -> QueryResult {
1544        let columns = alloc::vec![
1545            ColumnSchema::new("name", DataType::Text, false),
1546            ColumnSchema::new("conn_str", DataType::Text, false),
1547            ColumnSchema::new("publications", DataType::Text, false),
1548            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1549            ColumnSchema::new("enabled", DataType::Bool, false),
1550        ];
1551        let rows: Vec<Row> = self
1552            .subscriptions
1553            .iter()
1554            .map(|(name, sub)| {
1555                Row::new(alloc::vec![
1556                    Value::Text(name.clone()),
1557                    Value::Text(sub.conn_str.clone()),
1558                    Value::Text(sub.publications.join(",")),
1559                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1560                    Value::Bool(sub.enabled),
1561                ])
1562            })
1563            .collect();
1564        QueryResult::Rows { columns, rows }
1565    }
1566
1567    /// v6.5.0 — materialise `spg_stat_segment` rows. One row per
1568    /// cold-tier segment with `(segment_id, num_rows, num_pages,
1569    /// total_bytes)`.
1570    ///
1571    /// v6.7.0 — appended `table_name` column resolves the v6.5.0
1572    /// carve-out. Walks every user table's BTree indices to find
1573    /// which table's Cold locators point at each segment. Empty
1574    /// string for orphan segments (loaded via SPG_PRELOAD_COLD_SEGMENT
1575    /// before any index registered a locator). The walk is
1576    /// O(tables × indices × keys); cached per call, not across
1577    /// calls — re-walked on every `SELECT * FROM spg_stat_segment`.
1578    fn exec_spg_stat_segment(&self) -> QueryResult {
1579        let columns = alloc::vec![
1580            ColumnSchema::new("segment_id", DataType::BigInt, false),
1581            ColumnSchema::new("table_name", DataType::Text, false),
1582            ColumnSchema::new("num_rows", DataType::BigInt, false),
1583            ColumnSchema::new("num_pages", DataType::BigInt, false),
1584            ColumnSchema::new("total_bytes", DataType::BigInt, false),
1585        ];
1586        // v6.7.0 — build a segment_id → table_name map by walking
1587        // every user table's BTree indices once. O(tables × indices
1588        // × keys) for the v6.5.0 carve-out resolution; acceptable
1589        // because spg_stat_segment is operator-facing (not on a
1590        // hot-loop path).
1591        let mut segment_owners: alloc::collections::BTreeMap<u32, String> = BTreeMap::new();
1592        for tname in self.catalog.table_names() {
1593            if is_internal_table_name(&tname) {
1594                continue;
1595            }
1596            let Some(t) = self.catalog.get(&tname) else {
1597                continue;
1598            };
1599            for idx in t.indices() {
1600                if let spg_storage::IndexKind::BTree(map) = &idx.kind {
1601                    for (_, locs) in map.iter() {
1602                        for loc in locs {
1603                            if let spg_storage::RowLocator::Cold { segment_id, .. } = loc {
1604                                segment_owners.entry(*segment_id).or_insert_with(|| tname.clone());
1605                            }
1606                        }
1607                    }
1608                }
1609            }
1610        }
1611        let rows: Vec<Row> = self
1612            .catalog
1613            .cold_segment_ids_global()
1614            .iter()
1615            .filter_map(|&id| {
1616                let seg = self.catalog.cold_segment(id)?;
1617                let meta = seg.meta();
1618                let owner = segment_owners
1619                    .get(&id)
1620                    .cloned()
1621                    .unwrap_or_default();
1622                Some(Row::new(alloc::vec![
1623                    Value::BigInt(i64::from(id)),
1624                    Value::Text(owner),
1625                    Value::BigInt(i64::try_from(meta.num_rows).unwrap_or(i64::MAX)),
1626                    Value::BigInt(i64::from(meta.num_pages)),
1627                    Value::BigInt(i64::try_from(meta.total_bytes).unwrap_or(i64::MAX)),
1628                ]))
1629            })
1630            .collect();
1631        QueryResult::Rows { columns, rows }
1632    }
1633
1634    /// v6.5.1 — materialise `spg_stat_query` rows. One row per
1635    /// distinct SQL text recorded since the engine booted, capped
1636    /// at `QUERY_STATS_MAX` (1024). Columns:
1637    ///   sql, exec_count, total_us, mean_us, max_us, last_seen_us
1638    /// mean_us = total_us / exec_count (saturating).
1639    fn exec_spg_stat_query(&self) -> QueryResult {
1640        let columns = alloc::vec![
1641            ColumnSchema::new("sql", DataType::Text, false),
1642            ColumnSchema::new("exec_count", DataType::BigInt, false),
1643            ColumnSchema::new("total_us", DataType::BigInt, false),
1644            ColumnSchema::new("mean_us", DataType::BigInt, false),
1645            ColumnSchema::new("max_us", DataType::BigInt, false),
1646            ColumnSchema::new("last_seen_us", DataType::BigInt, false),
1647        ];
1648        let rows: Vec<Row> = self
1649            .query_stats
1650            .snapshot()
1651            .into_iter()
1652            .map(|(sql, s)| {
1653                let mean = if s.exec_count == 0 {
1654                    0
1655                } else {
1656                    s.total_us / s.exec_count
1657                };
1658                Row::new(alloc::vec![
1659                    Value::Text(sql),
1660                    Value::BigInt(i64::try_from(s.exec_count).unwrap_or(i64::MAX)),
1661                    Value::BigInt(i64::try_from(s.total_us).unwrap_or(i64::MAX)),
1662                    Value::BigInt(i64::try_from(mean).unwrap_or(i64::MAX)),
1663                    Value::BigInt(i64::try_from(s.max_us).unwrap_or(i64::MAX)),
1664                    Value::BigInt(i64::try_from(s.last_seen_us).unwrap_or(i64::MAX)),
1665                ])
1666            })
1667            .collect();
1668        QueryResult::Rows { columns, rows }
1669    }
1670
    /// v6.5.2 — register a connection-state provider.
    ///
    /// Builder-style: consumes the engine, stores `f` as the activity
    /// provider (replacing any previously registered one) and returns
    /// the engine. spg-server calls this at startup with a function
    /// that snapshots its per-pgwire-connection registry; the engine
    /// reads through the callback on
    /// `SELECT * FROM spg_stat_activity`.
    #[must_use]
    pub const fn with_activity_provider(mut self, f: ActivityProvider) -> Self {
        self.activity_provider = Some(f);
        self
    }
1680
    /// v6.5.3 — register the audit chain provider and the audit
    /// verifier.
    ///
    /// Builder-style: consumes the engine, stores `chain` and `verify`
    /// (replacing any previously registered pair) and returns the
    /// engine. Both are always set together by this method.
    #[must_use]
    pub const fn with_audit_providers(
        mut self,
        chain: AuditChainProvider,
        verify: AuditVerifier,
    ) -> Self {
        self.audit_chain_provider = Some(chain);
        self.audit_verifier = Some(verify);
        self
    }
1692
    /// v6.5.6 — register a slow-query log callback.
    ///
    /// Builder-style: consumes the engine, stores both the threshold
    /// and the logger, and returns the engine. `threshold_us` is the
    /// floor in microseconds; per the v6.5.6 design only executions
    /// that exceed the floor fire the callback. spg-server wires this
    /// from `SPG_SLOW_QUERY_THRESHOLD_MS` (default 100 ms).
    #[must_use]
    pub const fn with_slow_query_log(
        mut self,
        threshold_us: u64,
        logger: SlowQueryLogger,
    ) -> Self {
        self.slow_query_threshold_us = Some(threshold_us);
        self.slow_query_logger = Some(logger);
        self
    }
1707
    /// v6.5.6 — operator knob for the plan cache cap.
    ///
    /// Forwards `n` to the plan cache's `set_max_entries`. spg-server
    /// reads the `SPG_PLAN_CACHE_MAX` env var at startup and uses this
    /// to override the compile-time default (documented as 256).
    pub fn set_plan_cache_max(&mut self, n: usize) {
        self.plan_cache.set_max_entries(n);
    }
1714
1715    /// v6.5.2 — materialise `spg_stat_activity` rows. Pulls a fresh
1716    /// snapshot from the registered `ActivityProvider`. Returns an
1717    /// empty result set when no provider is registered (the no_std
1718    /// embedded path with no pgwire layer).
1719    fn exec_spg_stat_activity(&self) -> QueryResult {
1720        let columns = alloc::vec![
1721            ColumnSchema::new("pid", DataType::Int, false),
1722            ColumnSchema::new("user", DataType::Text, false),
1723            ColumnSchema::new("started_at_us", DataType::BigInt, false),
1724            ColumnSchema::new("current_sql", DataType::Text, false),
1725            ColumnSchema::new("wait_event", DataType::Text, false),
1726            ColumnSchema::new("elapsed_us", DataType::BigInt, false),
1727            ColumnSchema::new("in_transaction", DataType::Bool, false),
1728        ];
1729        let rows: Vec<Row> = self
1730            .activity_provider
1731            .map(|f| f())
1732            .unwrap_or_default()
1733            .into_iter()
1734            .map(|r| {
1735                Row::new(alloc::vec![
1736                    Value::Int(i32::try_from(r.pid).unwrap_or(i32::MAX)),
1737                    Value::Text(r.user),
1738                    Value::BigInt(r.started_at_us),
1739                    Value::Text(r.current_sql),
1740                    Value::Text(r.wait_event),
1741                    Value::BigInt(r.elapsed_us),
1742                    Value::Bool(r.in_transaction),
1743                ])
1744            })
1745            .collect();
1746        QueryResult::Rows { columns, rows }
1747    }
1748
1749    /// v6.5.4 — materialise `spg_table_ddl` rows. One row per user
1750    /// table with `(table_name, ddl)`. Reconstructed from catalog
1751    /// state on demand.
1752    fn exec_spg_table_ddl(&self) -> QueryResult {
1753        let columns = alloc::vec![
1754            ColumnSchema::new("table_name", DataType::Text, false),
1755            ColumnSchema::new("ddl", DataType::Text, false),
1756        ];
1757        let rows: Vec<Row> = self
1758            .catalog
1759            .table_names()
1760            .into_iter()
1761            .filter(|n| !is_internal_table_name(n))
1762            .filter_map(|name| {
1763                let table = self.catalog.get(&name)?;
1764                let ddl = render_create_table(&name, &table.schema().columns);
1765                Some(Row::new(alloc::vec![
1766                    Value::Text(name),
1767                    Value::Text(ddl),
1768                ]))
1769            })
1770            .collect();
1771        QueryResult::Rows { columns, rows }
1772    }
1773
1774    /// v6.5.4 — materialise `spg_role_ddl` rows. One row per user
1775    /// with `(role_name, ddl)`. Password is redacted (matches the
1776    /// `Statement::CreateUser` Display which prints `'<redacted>'`).
1777    fn exec_spg_role_ddl(&self) -> QueryResult {
1778        let columns = alloc::vec![
1779            ColumnSchema::new("role_name", DataType::Text, false),
1780            ColumnSchema::new("ddl", DataType::Text, false),
1781        ];
1782        let rows: Vec<Row> = self
1783            .users
1784            .iter()
1785            .map(|(name, rec)| {
1786                let ddl = alloc::format!(
1787                    "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}'",
1788                    rec.role.as_str(),
1789                );
1790                Row::new(alloc::vec![Value::Text(String::from(name)), Value::Text(ddl)])
1791            })
1792            .collect();
1793        QueryResult::Rows { columns, rows }
1794    }
1795
1796    /// v6.5.4 — materialise `spg_database_ddl`: single row whose
1797    /// `ddl` column concatenates every user table's CREATE +
1798    /// every role's CREATE in deterministic catalog order. Suitable
1799    /// for piping back through `Engine::execute` to recreate a
1800    /// schema-equivalent database.
1801    fn exec_spg_database_ddl(&self) -> QueryResult {
1802        let columns = alloc::vec![ColumnSchema::new("ddl", DataType::Text, false)];
1803        let mut out = String::new();
1804        for (name, rec) in self.users.iter() {
1805            out.push_str(&alloc::format!(
1806                "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}';\n",
1807                rec.role.as_str(),
1808            ));
1809        }
1810        for name in self.catalog.table_names() {
1811            if is_internal_table_name(&name) {
1812                continue;
1813            }
1814            if let Some(table) = self.catalog.get(&name) {
1815                out.push_str(&render_create_table(&name, &table.schema().columns));
1816                out.push_str(";\n");
1817            }
1818        }
1819        QueryResult::Rows {
1820            columns,
1821            rows: alloc::vec![Row::new(alloc::vec![Value::Text(out)])],
1822        }
1823    }
1824
1825    /// v6.5.3 — materialise `spg_audit_chain` rows. Pulls a fresh
1826    /// snapshot from the registered provider; empty when no
1827    /// provider is set.
1828    fn exec_spg_audit_chain(&self) -> QueryResult {
1829        let columns = alloc::vec![
1830            ColumnSchema::new("seq", DataType::BigInt, false),
1831            ColumnSchema::new("ts_ms", DataType::BigInt, false),
1832            ColumnSchema::new("prev_hash", DataType::Text, false),
1833            ColumnSchema::new("entry_hash", DataType::Text, false),
1834            ColumnSchema::new("sql", DataType::Text, false),
1835        ];
1836        let rows: Vec<Row> = self
1837            .audit_chain_provider
1838            .map(|f| f())
1839            .unwrap_or_default()
1840            .into_iter()
1841            .map(|r| {
1842                Row::new(alloc::vec![
1843                    Value::BigInt(r.seq),
1844                    Value::BigInt(r.ts_ms),
1845                    Value::Text(r.prev_hash_hex),
1846                    Value::Text(r.entry_hash_hex),
1847                    Value::Text(r.sql),
1848                ])
1849            })
1850            .collect();
1851        QueryResult::Rows { columns, rows }
1852    }
1853
1854    /// v6.5.3 — materialise `spg_audit_verify` single-row result.
1855    /// `(verified_count, broken_at_seq)` — broken_at_seq is `-1`
1856    /// on a clean chain. Returns one row with both values 0 when
1857    /// no verifier is registered (no-data fallback for embedded
1858    /// callers).
1859    fn exec_spg_audit_verify(&self) -> QueryResult {
1860        let columns = alloc::vec![
1861            ColumnSchema::new("verified_count", DataType::BigInt, false),
1862            ColumnSchema::new("broken_at_seq", DataType::BigInt, false),
1863        ];
1864        let (verified, broken) = self.audit_verifier.map(|f| f()).unwrap_or((0, -1));
1865        let row = Row::new(alloc::vec![
1866            Value::BigInt(verified),
1867            Value::BigInt(broken),
1868        ]);
1869        QueryResult::Rows {
1870            columns,
1871            rows: alloc::vec![row],
1872        }
1873    }
1874
    /// v6.5.1 — read-only accessor for the per-query statistics
    /// store. Returns a shared borrow of the `query_stats` field;
    /// used by tests and the v6.5.6 ops resets.
    pub fn query_stats(&self) -> &query_stats::QueryStats {
        &self.query_stats
    }
1879
    /// v6.5.1 — mutable accessor for the per-query statistics store
    /// (clear, etc). Returns an exclusive borrow of the `query_stats`
    /// field, so the caller needs `&mut` access to the engine.
    pub fn query_stats_mut(&mut self) -> &mut query_stats::QueryStats {
        &mut self.query_stats
    }
1884
    /// v6.2.0 — read access to the per-column statistics table.
    ///
    /// Returns a shared borrow of the `statistics` field. Used by the
    /// planner (v6.2.2 selectivity functions read this), by
    /// `SELECT * FROM spg_statistic`, and by e2e tests.
    pub const fn statistics(&self) -> &statistics::Statistics {
        &self.statistics
    }
1891
1892    /// v6.2.1 — return tables whose modified-row count crossed the
1893    /// auto-analyze threshold since the last ANALYZE on that table.
1894    /// The threshold is `0.1 × max(row_count, MIN_ROWS_FOR_AUTO_
1895    /// ANALYZE)` — combines PG-style fractional + absolute lower
1896    /// bound so a fresh / tiny table doesn't get hammered on every
1897    /// INSERT.
1898    ///
1899    /// Designed to be cheap: walks every user table's
1900    /// `Catalog::table_names()` + reads `statistics::modified_
1901    /// since_last_analyze()` (BTreeMap lookup). The background
1902    /// worker calls this under `engine.read()` then drops the lock
1903    /// before re-acquiring `engine.write()` for the actual ANALYZE.
1904    pub fn tables_needing_analyze(&self) -> Vec<String> {
1905        const MIN_ROWS: u64 = 100;
1906        let mut out = Vec::new();
1907        for name in self.catalog.table_names() {
1908            if is_internal_table_name(&name) {
1909                continue;
1910            }
1911            let Some(table) = self.catalog.get(&name) else {
1912                continue;
1913            };
1914            let row_count = table.rows().len() as u64;
1915            let modified = self.statistics.modified_since_last_analyze(&name);
1916            // Threshold: ceil(0.1 × max(row_count, MIN_ROWS)),
1917            // computed in integer arithmetic so spg-engine stays
1918            // no_std without pulling in libm. `(n + 9) / 10` is
1919            // `ceil(n / 10)` for non-negative `n`.
1920            let base = row_count.max(MIN_ROWS);
1921            let threshold = base.saturating_add(9) / 10;
1922            if modified >= threshold {
1923                out.push(name);
1924            }
1925        }
1926        out
1927    }
1928
1929    /// v6.2.0 — `ANALYZE [<table>]` runtime. Bare `ANALYZE` walks
1930    /// every user table; `ANALYZE <name>` re-stats one. For each
1931    /// target table, single-pass scan + per-column histogram +
1932    /// `null_frac` + `n_distinct`. Replaces the table's prior
1933    /// stats; resets the modified-row counter.
1934    ///
1935    /// v6.2.0 doesn't sample — it scans the full table. v6.2.x
1936    /// can add reservoir sampling at the > 100 K-row mark; not a
1937    /// scope blocker for the current commit since rows ≤ 100 K
1938    /// analyse in milliseconds.
1939    fn exec_analyze(&mut self, target: Option<&str>) -> Result<QueryResult, EngineError> {
1940        let names: Vec<String> = if let Some(name) = target {
1941            // Verify the table exists; surface a clear error if not.
1942            if self.catalog.get(name).is_none() {
1943                return Err(EngineError::Storage(StorageError::TableNotFound {
1944                    name: name.to_string(),
1945                }));
1946            }
1947            alloc::vec![name.to_string()]
1948        } else {
1949            self.catalog
1950                .table_names()
1951                .into_iter()
1952                .filter(|n| !is_internal_table_name(n))
1953                .collect()
1954        };
1955        let mut analysed = 0usize;
1956        for table_name in &names {
1957            self.analyze_one_table(table_name)?;
1958            analysed += 1;
1959        }
1960        // v6.3.1 — plan cache invalidation. Bump stats version so
1961        // future lookups see the new generation, and selectively
1962        // evict every plan whose `source_tables` overlap with the
1963        // ANALYZE target set. Bare ANALYZE (all tables) clears the
1964        // whole cache.
1965        if analysed > 0 {
1966            self.statistics.bump_version();
1967            if target.is_some() {
1968                for t in &names {
1969                    self.plan_cache.evict_referencing(t);
1970                }
1971            } else {
1972                self.plan_cache.clear();
1973            }
1974        }
1975        Ok(QueryResult::CommandOk {
1976            affected: analysed,
1977            modified_catalog: true,
1978        })
1979    }
1980
1981    /// v6.7.3 — `COMPACT COLD SEGMENTS` runtime path. Drives the
1982    /// engine-layer compaction shim with the default
1983    /// 4 MiB segment-size threshold. spg-server intercepts the
1984    /// SQL before it reaches the engine on a server build —
1985    /// it reads `SPG_COMPACTION_TARGET_SEGMENT_BYTES`, calls
1986    /// `Engine::compact_cold_segments_with_target` directly with
1987    /// the env value, and persists every merged segment to
1988    /// `<db>.spg/segments/`. This arm only fires for engine-only
1989    /// callers (spg-embedded, lib tests); in that mode merged
1990    /// segments live in memory and are dropped at process exit.
1991    fn exec_compact_cold_segments(&mut self) -> Result<QueryResult, EngineError> {
1992        let target = COMPACTION_TARGET_DEFAULT_BYTES;
1993        let reports = self.compact_cold_segments_with_target(target)?;
1994        let columns = alloc::vec![
1995            ColumnSchema::new("table_name", DataType::Text, false),
1996            ColumnSchema::new("index_name", DataType::Text, false),
1997            ColumnSchema::new("sources_merged", DataType::BigInt, false),
1998            ColumnSchema::new("merged_segment_id", DataType::BigInt, false),
1999            ColumnSchema::new("merged_rows", DataType::BigInt, false),
2000            ColumnSchema::new("deleted_rows_pruned", DataType::BigInt, false),
2001            ColumnSchema::new("bytes_reclaimed_estimate", DataType::BigInt, false),
2002        ];
2003        let rows: Vec<Row> = reports
2004            .into_iter()
2005            .map(|(tname, iname, report)| {
2006                Row::new(alloc::vec![
2007                    Value::Text(tname),
2008                    Value::Text(iname),
2009                    Value::BigInt(i64::try_from(report.sources.len()).unwrap_or(i64::MAX)),
2010                    Value::BigInt(i64::from(report.merged_segment_id.unwrap_or(0))),
2011                    Value::BigInt(i64::try_from(report.merged_rows).unwrap_or(i64::MAX)),
2012                    Value::BigInt(
2013                        i64::try_from(report.deleted_rows_pruned).unwrap_or(i64::MAX),
2014                    ),
2015                    Value::BigInt(
2016                        i64::try_from(report.bytes_reclaimed_estimate).unwrap_or(i64::MAX),
2017                    ),
2018                ])
2019            })
2020            .collect();
2021        Ok(QueryResult::Rows { columns, rows })
2022    }
2023
2024    /// Walk a single table's rows once and (re-)populate per-column
2025    /// stats. Drops the existing stats for `table` first so columns
2026    /// that have been DROP-ed between ANALYZEs don't leave stale
2027    /// rows.
2028    fn analyze_one_table(&mut self, table_name: &str) -> Result<(), EngineError> {
2029        let table = self.catalog.get(table_name).ok_or_else(|| {
2030            EngineError::Storage(StorageError::TableNotFound {
2031                name: table_name.to_string(),
2032            })
2033        })?;
2034        let schema = table.schema().clone();
2035        let row_count = table.rows().len();
2036        // For each column, collect (sorted) non-NULL textual values
2037        // + count NULLs; then ask `statistics::build_histogram` to
2038        // produce the 101 bounds and `estimate_n_distinct` the
2039        // distinct count.
2040        self.statistics.clear_table(table_name);
2041        for (col_pos, col_schema) in schema.columns.iter().enumerate() {
2042            // v6.2.0 skip: vector columns have their own stats
2043            // shape (HNSW graph topology). v6.2 deliberation #1.
2044            if matches!(col_schema.ty, DataType::Vector { .. }) {
2045                continue;
2046            }
2047            let mut non_null_values: Vec<Value> = Vec::with_capacity(row_count);
2048            let mut nulls: u64 = 0;
2049            for row in table.rows() {
2050                match row.values.get(col_pos) {
2051                    Some(Value::Null) | None => nulls += 1,
2052                    Some(v) => non_null_values.push(v.clone()),
2053                }
2054            }
2055            // Sort by type-aware ordering (Int as int, Text as
2056            // lex, etc.) so histogram bounds reflect the column's
2057            // natural order — not lexicographic on the string
2058            // representation, which would put "9" after "49".
2059            non_null_values.sort_by(|a, b| sort_values_for_histogram(a, b));
2060            let non_null: Vec<String> = non_null_values
2061                .iter()
2062                .map(canonical_value_repr)
2063                .collect();
2064            let null_frac = if row_count == 0 {
2065                0.0
2066            } else {
2067                #[allow(clippy::cast_precision_loss)]
2068                let f = nulls as f32 / row_count as f32;
2069                f
2070            };
2071            let n_distinct = statistics::estimate_n_distinct(&non_null);
2072            let histogram_bounds = statistics::build_histogram(&non_null);
2073            self.statistics.set(
2074                table_name.to_string(),
2075                col_schema.name.clone(),
2076                statistics::ColumnStats {
2077                    null_frac,
2078                    n_distinct,
2079                    histogram_bounds,
2080                },
2081            );
2082        }
2083        self.statistics.reset_modified(table_name);
2084        // v6.7.0 — refresh the per-table cold_rows cache. Walk the
2085        // BTree indices and count Cold locators (MAX across
2086        // indices); store the result on the table. Surfaced via
2087        // `spg_statistic.cold_row_count` (new column) and
2088        // `spg_stat_segment.table_name` (new column).
2089        let cold_count = {
2090            let table = self
2091                .active_catalog()
2092                .get(table_name)
2093                .expect("table still present");
2094            table.count_cold_locators()
2095        };
2096        let table_mut = self
2097            .active_catalog_mut()
2098            .get_mut(table_name)
2099            .expect("table still present");
2100        table_mut.set_cold_row_count(cold_count);
2101        Ok(())
2102    }
2103
2104    /// v6.1.3 — `SHOW PUBLICATIONS` row materialisation. Returns
2105    /// `(name, scope, table_count)` ordered by publication name.
2106    ///   - `scope` is the human-readable string:
2107    ///       `"FOR ALL TABLES"` /
2108    ///       `"FOR TABLE t1, t2"` /
2109    ///       `"FOR ALL TABLES EXCEPT t1, t2"`.
2110    ///   - `table_count` is NULL for `AllTables`, the list length
2111    ///     otherwise. NULLability lets clients distinguish "publish
2112    ///     everything" from "publish exactly 0 tables" (the v6.1.3
2113    ///     parser forbids the empty list, but the column shape is
2114    ///     ready for the v6.1.5 publisher-side semantics).
2115    fn exec_show_publications(&self) -> QueryResult {
2116        let columns = alloc::vec![
2117            ColumnSchema::new("name", DataType::Text, false),
2118            ColumnSchema::new("scope", DataType::Text, false),
2119            ColumnSchema::new("table_count", DataType::Int, true),
2120        ];
2121        let rows: Vec<Row> = self
2122            .publications
2123            .iter()
2124            .map(|(name, scope)| {
2125                let (scope_str, count_val) = match scope {
2126                    spg_sql::ast::PublicationScope::AllTables => {
2127                        ("FOR ALL TABLES".to_string(), Value::Null)
2128                    }
2129                    spg_sql::ast::PublicationScope::ForTables(ts) => (
2130                        alloc::format!("FOR TABLE {}", ts.join(", ")),
2131                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2132                    ),
2133                    spg_sql::ast::PublicationScope::AllTablesExcept(ts) => (
2134                        alloc::format!("FOR ALL TABLES EXCEPT {}", ts.join(", ")),
2135                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2136                    ),
2137                };
2138                Row::new(alloc::vec![
2139                    Value::Text(name.clone()),
2140                    Value::Text(scope_str),
2141                    count_val,
2142                ])
2143            })
2144            .collect();
2145        QueryResult::Rows { columns, rows }
2146    }
2147
2148    /// v4.1 `SHOW USERS` — `(name, role)` per row, ordered by name.
2149    fn exec_show_users(&self) -> QueryResult {
2150        let columns = alloc::vec![
2151            ColumnSchema::new("name", DataType::Text, false),
2152            ColumnSchema::new("role", DataType::Text, false),
2153        ];
2154        let rows: Vec<Row> = self
2155            .users
2156            .iter()
2157            .map(|(name, rec)| {
2158                Row::new(alloc::vec![
2159                    Value::Text(name.to_string()),
2160                    Value::Text(rec.role.as_str().to_string()),
2161                ])
2162            })
2163            .collect();
2164        QueryResult::Rows { columns, rows }
2165    }
2166
2167    fn exec_create_user(&mut self, s: &CreateUserStatement) -> Result<QueryResult, EngineError> {
2168        if self.in_transaction() {
2169            return Err(EngineError::Unsupported(
2170                "CREATE USER is not allowed inside a transaction".into(),
2171            ));
2172        }
2173        let role = users::Role::parse(&s.role).ok_or_else(|| {
2174            EngineError::Unsupported(alloc::format!("invalid role: {:?}", s.role))
2175        })?;
2176        // Prefer the host-injected RNG. Falls back to a deterministic
2177        // salt derived from the username only when no RNG is wired —
2178        // acceptable for tests; the server always installs one.
2179        let salt = self.salt_fn.map_or_else(
2180            || {
2181                let mut s_bytes = [0u8; 16];
2182                let digest = spg_crypto::hash(s.name.as_bytes());
2183                s_bytes.copy_from_slice(&digest[..16]);
2184                s_bytes
2185            },
2186            |f| f(),
2187        );
2188        self.users
2189            .create(&s.name, &s.password, role, salt)
2190            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE USER: {e}")))?;
2191        Ok(QueryResult::CommandOk {
2192            affected: 1,
2193            modified_catalog: true,
2194        })
2195    }
2196
2197    fn exec_drop_user(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2198        if self.in_transaction() {
2199            return Err(EngineError::Unsupported(
2200                "DROP USER is not allowed inside a transaction".into(),
2201            ));
2202        }
2203        self.users
2204            .drop(name)
2205            .map_err(|e| EngineError::Unsupported(alloc::format!("DROP USER: {e}")))?;
2206        Ok(QueryResult::CommandOk {
2207            affected: 1,
2208            modified_catalog: true,
2209        })
2210    }
2211
    /// v4.4 `UPDATE <table> SET col = expr [, ...] [WHERE cond]`.
    ///
    /// Pipeline:
    ///   0. (v5.2.3) If the WHERE is a PK equality on an indexed
    ///      column, try to promote a matching cold-tier row into the
    ///      hot tier so the row walk below can see it.
    ///   1. Walk the hot rows, evaluate WHERE and then every SET
    ///      expression against the *old* row, and collect the planned
    ///      `(position, new_values)` pairs without mutating anything.
    ///   2. (v7.6.6) Foreign-key checks on the planned changes:
    ///      outbound (new values must exist in the parent) and inbound
    ///      (child tables referencing this one react per `on_update`).
    ///   3. Apply the child-side FK steps, then call
    ///      `Table::update_row` for every planned row. Per the original
    ///      design note, `update_row` rebuilds indices after the cell
    ///      rewrite, so indexed columns stay consistent.
    ///
    /// `cancel` is polled every 256 rows during the walk.
    ///
    /// Returns `CommandOk` with `affected` = number of matched rows and
    /// `modified_catalog` = "not inside a transaction".
    ///
    /// # Errors
    /// `StorageError::TableNotFound` for an unknown table,
    /// `EvalError::ColumnNotFound` for an unknown SET target, plus any
    /// error from expression evaluation, coercion, cancellation, FK
    /// enforcement, or `update_row`.
    ///
    /// NOTE(review): stage 3 mutates the catalog step by step; if a
    /// child step or an `update_row` call fails part-way, the earlier
    /// writes are not undone here — confirm the caller restores the
    /// catalog on error.
    fn exec_update_cancel(
        &mut self,
        stmt: &spg_sql::ast::UpdateStatement,
        cancel: CancelToken<'_>,
    ) -> Result<QueryResult, EngineError> {
        // v5.2.3: if the WHERE is a PK equality and matches a cold-
        // tier row, promote it back to the hot tier *before* the
        // hot-row walk. The promote pushes the row to the end of
        // `table.rows`, where the upcoming SET-evaluation loop will
        // pick it up and apply the assignments. Lookups for the key
        // never observe a gap because `promote_cold_row` inserts the
        // hot row before retiring the cold locator.
        if let Some(w) = &stmt.where_ {
            // Owned copy of the column list so the catalog borrow is
            // released before the mutable promote call below.
            let schema_cols = self
                .active_catalog()
                .get(&stmt.table)
                .ok_or_else(|| {
                    EngineError::Storage(StorageError::TableNotFound {
                        name: stmt.table.clone(),
                    })
                })?
                .schema()
                .columns
                .clone();
            // Only proceed when the predicate is recognised as a PK
            // lookup AND an index exists on that column.
            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
                && let Some(idx_name) = self
                    .active_catalog()
                    .get(&stmt.table)
                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
            {
                // Promote may be a no-op (key is hot-only or absent);
                // we don't care about the return value here — the
                // subsequent hot walk will either match or not.
                let _ = self
                    .active_catalog_mut()
                    .promote_cold_row(&stmt.table, &idx_name, &key);
            }
        }

        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
        // Resolve each SET target to a column position once, validate
        // up front so a typo'd column doesn't leave a partial mutation
        // behind.
        let mut targets: Vec<(usize, &Expr)> = Vec::with_capacity(stmt.assignments.len());
        for (col, expr) in &stmt.assignments {
            let pos = schema_cols
                .iter()
                .position(|c| c.name == *col)
                .ok_or_else(|| {
                    EngineError::Eval(EvalError::ColumnNotFound { name: col.clone() })
                })?;
            targets.push((pos, expr));
        }
        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
        // Walk every row, evaluate WHERE then SET expressions. We
        // gather (position, new_values) tuples first and apply them
        // afterwards so the WHERE/RHS evaluation reads the original
        // row state — matches PG semantics (UPDATE doesn't see its
        // own writes).
        let mut planned: Vec<(usize, Vec<Value>)> = Vec::new();
        for (i, row) in table.rows().iter().enumerate() {
            // v4.5: cooperative cancel checkpoint every 256 rows so
            // a runaway UPDATE without WHERE doesn't drag past the
            // server's query-timeout watchdog.
            if i.is_multiple_of(256) {
                cancel.check()?;
            }
            if let Some(w) = &stmt.where_ {
                let cond = eval::eval_expr(w, row, &ctx)?;
                // Only an exact `true` matches; any other result
                // (false, NULL, non-boolean) skips the row.
                if !matches!(cond, Value::Bool(true)) {
                    continue;
                }
            }
            // Start from the old values and overwrite only the SET
            // targets, coercing each result to the column's type.
            let mut new_vals = row.values.clone();
            for (pos, expr) in &targets {
                let v = eval::eval_expr(expr, row, &ctx)?;
                new_vals[*pos] =
                    coerce_value(v, schema_cols[*pos].ty, &schema_cols[*pos].name, *pos)?;
            }
            planned.push((i, new_vals));
        }
        // v7.6.6 — capture pre-update row values for the FK
        // enforcement passes below. `planned` carries new values
        // only; pair them with the old row. Tuple layout is
        // (position, old_values, new_values).
        let plan_with_old: Vec<(usize, Vec<Value>, Vec<Value>)> = planned
            .iter()
            .map(|(pos, new_vals)| (*pos, table.rows()[*pos].values.clone(), new_vals.clone()))
            .collect();
        let self_fks = table.schema().foreign_keys.clone();
        let affected = planned.len();
        // `table` is not used past this point, so its mutable borrow
        // of the catalog ends here and the FK passes can borrow the
        // catalog again. (`let _ = …` neither moves nor drops the
        // reference; it only marks the hand-off for readers.)
        let _ = table;
        // v7.6.6 — Stage 2a: outbound FK check. For every row whose
        // local FK columns changed, the new value must exist in the
        // parent.
        if !self_fks.is_empty() {
            let new_rows: Vec<Vec<Value>> = planned
                .iter()
                .map(|(_pos, new_vals)| new_vals.clone())
                .collect();
            enforce_fk_inserts(self.active_catalog(), &stmt.table, &self_fks, &new_rows)?;
        }
        // v7.6.6 — Stage 2b: inbound FK check. For every row that
        // changed value in a column that *some other table* uses as
        // a FK parent column, react per `on_update` action.
        let child_plan = plan_fk_parent_updates(self.active_catalog(), &stmt.table, &plan_with_old)?;
        // Stage 3a — apply each child-side action.
        for step in &child_plan {
            apply_fk_child_step(self.active_catalog_mut(), step)?;
        }
        // Stage 3b — apply the original UPDATE. The table is looked
        // up again because the earlier mutable borrow was given up
        // for the FK passes.
        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        for (pos, vals) in planned {
            table.update_row(pos, vals)?;
        }
        // v6.2.1 — auto-analyze modified-row tracking for UPDATE.
        // Skipped inside a transaction.
        if !self.in_transaction() && affected > 0 {
            self.statistics
                .record_modifications(&stmt.table, affected as u64);
        }
        Ok(QueryResult::CommandOk {
            affected,
            modified_catalog: !self.in_transaction(),
        })
    }
2358
    /// v4.4 `DELETE FROM <table> [WHERE cond]`.
    ///
    /// Pipeline:
    ///   0. (v5.2.3) For a PK-equality WHERE on an indexed column,
    ///      retire any cold-tier locator for the key first.
    ///   1. Walk the hot rows and collect the positions (and full
    ///      value tuples) of every row the WHERE selects; with no
    ///      WHERE every row is selected.
    ///   2. (v7.6.3 / v7.6.4) Plan foreign-key reactions in child
    ///      tables via `plan_fk_parent_deletions`.
    ///   3. Apply the child-side steps, then delegate the primary
    ///      delete to `Table::delete_rows` (single index rebuild for
    ///      the batch, per the original design note).
    ///
    /// `cancel` is polled every 256 rows during the walk.
    ///
    /// Returns `CommandOk` with `affected` = hot rows deleted plus the
    /// cold-shadow count, and `modified_catalog` = "not inside a
    /// transaction".
    ///
    /// # Errors
    /// `StorageError::TableNotFound` for an unknown table, plus any
    /// error from WHERE evaluation, cancellation, FK planning, or
    /// applying a child step.
    ///
    /// NOTE(review): the cold locator is retired in step 0, before FK
    /// enforcement and before the cancellable walk. If a later step
    /// returns an error, that cold-tier change has already happened —
    /// confirm the caller restores the catalog on error.
    fn exec_delete_cancel(
        &mut self,
        stmt: &spg_sql::ast::DeleteStatement,
        cancel: CancelToken<'_>,
    ) -> Result<QueryResult, EngineError> {
        // v5.2.3: PK-targeted DELETE → first retire any cold-tier
        // locator for the key. The cold row body stays in the
        // segment (becoming shadowed garbage that a future
        // compaction pass reclaims) but the index no longer
        // resolves it. The shadow count contributes to the
        // affected total; the subsequent hot walk handles any hot
        // rows for the same key.
        let mut cold_shadow_count: usize = 0;
        if let Some(w) = &stmt.where_ {
            // Owned copy of the column list so the catalog borrow is
            // released before the mutable shadow call below.
            let schema_cols = self
                .active_catalog()
                .get(&stmt.table)
                .ok_or_else(|| {
                    EngineError::Storage(StorageError::TableNotFound {
                        name: stmt.table.clone(),
                    })
                })?
                .schema()
                .columns
                .clone();
            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
                && let Some(idx_name) = self
                    .active_catalog()
                    .get(&stmt.table)
                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
            {
                // NOTE(review): a failure from `shadow_cold_row` is
                // treated as "nothing shadowed" (count 0) rather than
                // surfaced — confirm this best-effort is intended.
                cold_shadow_count = self
                    .active_catalog_mut()
                    .shadow_cold_row(&stmt.table, &idx_name, &key)
                    .unwrap_or(0);
            }
        }

        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
        let mut positions: Vec<usize> = Vec::new();
        // v7.6.3 — collect every to-delete row's full Value tuple
        // alongside its position, so the FK enforcement pass can
        // run after the mut borrow drops.
        let mut to_delete_rows: Vec<Vec<Value>> = Vec::new();
        for (i, row) in table.rows().iter().enumerate() {
            // Cooperative cancel checkpoint every 256 rows.
            if i.is_multiple_of(256) {
                cancel.check()?;
            }
            // A row is kept unless WHERE evaluates to exactly `true`;
            // without a WHERE clause nothing is kept.
            let keep = if let Some(w) = &stmt.where_ {
                let cond = eval::eval_expr(w, row, &ctx)?;
                !matches!(cond, Value::Bool(true))
            } else {
                false
            };
            if !keep {
                positions.push(i);
                to_delete_rows.push(row.values.clone());
            }
        }
        // v7.6.3 / v7.6.4 — Stage 2: FK enforcement on the immutable
        // catalog. Release the mut borrow and run reverse-scan
        // against every child table whose FK targets this table.
        // RESTRICT / NoAction raise an error; CASCADE returns a
        // cascade plan that stage 3 applies after the primary delete.
        // SET NULL / SET DEFAULT remain Unsupported until v7.6.5.
        // NOTE(review): the stage 3a comment below says SET NULL /
        // SET DEFAULT steps are applied and that child steps run
        // *before* the parent delete — this paragraph looks stale;
        // confirm against `plan_fk_parent_deletions`.
        // (`let _ = table;` neither moves nor drops the reference;
        // the borrow ends because `table` is not used again.)
        let _ = table;
        let cascade_plan = plan_fk_parent_deletions(
            self.active_catalog(),
            &stmt.table,
            &positions,
            &to_delete_rows,
        )?;
        // Stage 3a — apply each FK child step (SET NULL / SET
        // DEFAULT / CASCADE delete) before deleting the parent.
        // The plan is already ordered: nulls/defaults first, then
        // cascade deletes (so a row mutated and later deleted
        // surfaces as deleted — though v7.6.5 doesn't produce
        // that overlap today).
        for step in &cascade_plan {
            apply_fk_child_step(self.active_catalog_mut(), step)?;
        }
        // Stage 3b — actually delete the original target rows. The
        // table is looked up again because the earlier mutable borrow
        // was given up for the FK passes.
        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        let affected = table.delete_rows(&positions) + cold_shadow_count;
        // v6.2.1 — auto-analyze modified-row tracking for DELETE.
        // Skipped inside a transaction.
        if !self.in_transaction() && affected > 0 {
            self.statistics
                .record_modifications(&stmt.table, affected as u64);
        }
        Ok(QueryResult::CommandOk {
            affected,
            modified_catalog: !self.in_transaction(),
        })
    }
2472
2473    /// `SHOW TABLES` — one row per table in the active catalog.
2474    /// Column name is `name` so result-set consumers can downstream
2475    /// `SELECT name FROM ...` style logic if needed.
2476    /// v4.26: `EXPLAIN [ANALYZE] <select>`. Returns a single-column
2477    /// `QUERY PLAN` text table — first line names the top operator
2478    /// (Scan / Aggregate / Window / etc.), indented children list
2479    /// FROM joins, WHERE filters, ORDER BY / LIMIT, projection
2480    /// shape, and any active index hits. `ANALYZE` execs the inner
2481    /// SELECT and appends actual-row + elapsed-micros annotations.
2482    #[allow(clippy::format_push_string)]
2483    fn exec_explain(
2484        &self,
2485        e: &spg_sql::ast::ExplainStatement,
2486        cancel: CancelToken<'_>,
2487    ) -> Result<QueryResult, EngineError> {
2488        let mut lines = Vec::<String>::new();
2489        explain_select(&e.inner, self, 0, &mut lines);
2490        if e.suggest {
2491            // v6.8.3 — index advisor. Walks the SELECT's FROM
2492            // tables + WHERE column refs; for each (table, column)
2493            // pair that lacks an index, append a SUGGEST line with
2494            // a copy-pastable `CREATE INDEX` statement. This is a
2495            // pure-syntax heuristic — no cardinality estimation —
2496            // matching the v6.8.3 design intent of "tell the
2497            // operator where indexes are missing", not "give the
2498            // mathematically optimal index set".
2499            let suggestions = build_index_suggestions(&e.inner, self);
2500            for s in suggestions {
2501                lines.push(s);
2502            }
2503        } else if e.analyze {
2504            // v6.2.4 — EXPLAIN ANALYZE annotates each operator line
2505            // with `(rows=N)` where the row count is computable
2506            // without re-executing the full query:
2507            //   - Top-level operator (first non-indented line):
2508            //     rows = final result.len()
2509            //   - "From: <table> [full scan]" lines: rows =
2510            //     table.rows().len() (catalog read; no execution)
2511            //   - "From: <table> [index seek]": indeterminate —
2512            //     the index step would need re-execution; v6.2.5
2513            //     adds per-operator wall-clock + hot/cold rows
2514            //     instrumentation that makes this concrete.
2515            //   - Everything else: marked `(—)` so the surface
2516            //     stays well-defined without silently dropping
2517            //     stats. v6.2.5 fills in via inline executor
2518            //     instrumentation.
2519            // Total elapsed lands on a trailing `Total: …` line.
2520            let started = self.clock.map(|f| f());
2521            let exec = self.exec_select_cancel(&e.inner, cancel)?;
2522            let elapsed_micros = match (self.clock, started) {
2523                (Some(f), Some(s)) => Some(f().saturating_sub(s)),
2524                _ => None,
2525            };
2526            let row_count = if let QueryResult::Rows { rows, .. } = &exec {
2527                rows.len()
2528            } else {
2529                0
2530            };
2531            annotate_explain_lines(&mut lines, row_count, self);
2532            let mut total = alloc::format!("Total: rows={row_count}");
2533            if let Some(us) = elapsed_micros {
2534                total.push_str(&alloc::format!(" elapsed={us}us"));
2535            }
2536            lines.push(total);
2537        }
2538        let columns = alloc::vec![ColumnSchema::new("QUERY PLAN", DataType::Text, false)];
2539        let rows: Vec<Row> = lines
2540            .into_iter()
2541            .map(|l| Row::new(alloc::vec![Value::Text(l)]))
2542            .collect();
2543        Ok(QueryResult::Rows { columns, rows })
2544    }
2545
2546    fn exec_show_tables(&self) -> QueryResult {
2547        let columns = alloc::vec![ColumnSchema::new("name", DataType::Text, false)];
2548        let rows: Vec<Row> = self
2549            .active_catalog()
2550            .table_names()
2551            .into_iter()
2552            .map(|n| Row::new(alloc::vec![Value::Text(n)]))
2553            .collect();
2554        QueryResult::Rows { columns, rows }
2555    }
2556
2557    /// `SHOW COLUMNS FROM <table>` — one row per column with the
2558    /// declared name, SQL type rendering, and nullability flag.
2559    fn exec_show_columns(&self, table_name: &str) -> Result<QueryResult, EngineError> {
2560        let table =
2561            self.active_catalog()
2562                .get(table_name)
2563                .ok_or_else(|| StorageError::TableNotFound {
2564                    name: table_name.into(),
2565                })?;
2566        let columns = alloc::vec![
2567            ColumnSchema::new("name", DataType::Text, false),
2568            ColumnSchema::new("type", DataType::Text, false),
2569            ColumnSchema::new("nullable", DataType::Bool, false),
2570        ];
2571        let rows: Vec<Row> = table
2572            .schema()
2573            .columns
2574            .iter()
2575            .map(|c| {
2576                Row::new(alloc::vec![
2577                    Value::Text(c.name.clone()),
2578                    Value::Text(alloc::format!("{}", c.ty)),
2579                    Value::Bool(c.nullable),
2580                ])
2581            })
2582            .collect();
2583        Ok(QueryResult::Rows { columns, rows })
2584    }
2585
2586    fn exec_begin(&mut self) -> Result<QueryResult, EngineError> {
2587        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2588        if self.tx_catalogs.contains_key(&tx_id) {
2589            return Err(EngineError::TransactionAlreadyOpen);
2590        }
2591        self.tx_catalogs.insert(
2592            tx_id,
2593            TxState {
2594                catalog: self.catalog.clone(),
2595                savepoints: Vec::new(),
2596            },
2597        );
2598        Ok(QueryResult::CommandOk {
2599            affected: 0,
2600            modified_catalog: false,
2601        })
2602    }
2603
2604    fn exec_commit(&mut self) -> Result<QueryResult, EngineError> {
2605        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2606        let state = self
2607            .tx_catalogs
2608            .remove(&tx_id)
2609            .ok_or(EngineError::NoActiveTransaction)?;
2610        self.catalog = state.catalog;
2611        // All savepoints become permanent at COMMIT and the stack
2612        // resets for the next TX (`state.savepoints` is discarded with
2613        // `state`).
2614        Ok(QueryResult::CommandOk {
2615            affected: 0,
2616            modified_catalog: true,
2617        })
2618    }
2619
2620    fn exec_rollback(&mut self) -> Result<QueryResult, EngineError> {
2621        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2622        if self.tx_catalogs.remove(&tx_id).is_none() {
2623            return Err(EngineError::NoActiveTransaction);
2624        }
2625        // savepoints discarded with the TxState
2626        Ok(QueryResult::CommandOk {
2627            affected: 0,
2628            modified_catalog: false,
2629        })
2630    }
2631
2632    fn exec_savepoint(&mut self, name: String) -> Result<QueryResult, EngineError> {
2633        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2634        let state = self
2635            .tx_catalogs
2636            .get_mut(&tx_id)
2637            .ok_or(EngineError::NoActiveTransaction)?;
2638        // PG re-uses an existing savepoint name by dropping the older
2639        // entry and pushing a fresh one — match that behaviour so
2640        // application code can `SAVEPOINT sp; ...; SAVEPOINT sp` freely.
2641        state.savepoints.retain(|(n, _)| n != &name);
2642        let snapshot = state.catalog.clone();
2643        state.savepoints.push((name, snapshot));
2644        Ok(QueryResult::CommandOk {
2645            affected: 0,
2646            modified_catalog: false,
2647        })
2648    }
2649
2650    fn exec_rollback_to_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2651        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2652        let state = self
2653            .tx_catalogs
2654            .get_mut(&tx_id)
2655            .ok_or(EngineError::NoActiveTransaction)?;
2656        let pos = state
2657            .savepoints
2658            .iter()
2659            .rposition(|(n, _)| n == name)
2660            .ok_or_else(|| {
2661                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2662            })?;
2663        // The savepoint stays on the stack (PG semantics): a later
2664        // `RELEASE` or further `ROLLBACK TO` is still allowed. Everything
2665        // after it is discarded.
2666        let snapshot = state.savepoints[pos].1.clone();
2667        state.savepoints.truncate(pos + 1);
2668        state.catalog = snapshot;
2669        Ok(QueryResult::CommandOk {
2670            affected: 0,
2671            modified_catalog: false,
2672        })
2673    }
2674
2675    fn exec_release_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2676        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2677        let state = self
2678            .tx_catalogs
2679            .get_mut(&tx_id)
2680            .ok_or(EngineError::NoActiveTransaction)?;
2681        let pos = state
2682            .savepoints
2683            .iter()
2684            .rposition(|(n, _)| n == name)
2685            .ok_or_else(|| {
2686                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2687            })?;
2688        // RELEASE keeps the work since the savepoint, just discards the
2689        // bookmark plus everything nested under it.
2690        state.savepoints.truncate(pos);
2691        Ok(QueryResult::CommandOk {
2692            affected: 0,
2693            modified_catalog: false,
2694        })
2695    }
2696
2697    /// v6.0.4 — synchronous `ALTER INDEX <name> REBUILD [WITH
2698    /// (encoding = …)]`. Walks every table in the active catalog
2699    /// looking for an index matching `stmt.name`, then delegates the
2700    /// rebuild (including any encoding switch) to
2701    /// `Table::rebuild_nsw_index`. The "live" non-blocking
2702    /// optimisation is v6.0.4.1 / v6.1.x territory.
2703    /// v6.7.2 — `ALTER TABLE t SET hot_tier_bytes = X`. Dispatch
2704    /// arm. Currently the only setting is `hot_tier_bytes`; later
2705    /// v6.7.x can extend `AlterTableTarget` without touching this
2706    /// arm structure.
2707    fn exec_alter_table(
2708        &mut self,
2709        s: spg_sql::ast::AlterTableStatement,
2710    ) -> Result<QueryResult, EngineError> {
2711        match s.target {
2712            spg_sql::ast::AlterTableTarget::SetHotTierBytes(n) => {
2713                let table = self
2714                    .active_catalog_mut()
2715                    .get_mut(&s.name)
2716                    .ok_or_else(|| {
2717                        EngineError::Storage(StorageError::TableNotFound {
2718                            name: s.name.clone(),
2719                        })
2720                    })?;
2721                table.schema_mut().hot_tier_bytes = Some(n);
2722            }
2723            spg_sql::ast::AlterTableTarget::AddForeignKey(fk) => {
2724                // v7.6.8 — resolve FK against the live catalog first
2725                // (validates parent table, columns, indices). Then
2726                // verify every existing row in the child table
2727                // satisfies the new constraint. Then install it.
2728                let cols_snapshot = self
2729                    .active_catalog()
2730                    .get(&s.name)
2731                    .ok_or_else(|| {
2732                        EngineError::Storage(StorageError::TableNotFound {
2733                            name: s.name.clone(),
2734                        })
2735                    })?
2736                    .schema()
2737                    .columns
2738                    .clone();
2739                let storage_fk = resolve_foreign_key(
2740                    &s.name,
2741                    &cols_snapshot,
2742                    fk,
2743                    self.active_catalog(),
2744                )?;
2745                // Verify existing rows. Treat them as a virtual
2746                // INSERT batch — reusing the v7.6.2 enforce helper.
2747                let existing_rows: Vec<Vec<Value>> = self
2748                    .active_catalog()
2749                    .get(&s.name)
2750                    .expect("checked above")
2751                    .rows()
2752                    .iter()
2753                    .map(|r| r.values.clone())
2754                    .collect();
2755                enforce_fk_inserts(
2756                    self.active_catalog(),
2757                    &s.name,
2758                    core::slice::from_ref(&storage_fk),
2759                    &existing_rows,
2760                )?;
2761                // Reject duplicate constraint name.
2762                let table = self
2763                    .active_catalog_mut()
2764                    .get_mut(&s.name)
2765                    .expect("checked above");
2766                if let Some(name) = &storage_fk.name
2767                    && table
2768                        .schema()
2769                        .foreign_keys
2770                        .iter()
2771                        .any(|f| f.name.as_ref() == Some(name))
2772                {
2773                    return Err(EngineError::Unsupported(alloc::format!(
2774                        "ALTER TABLE ADD CONSTRAINT: a constraint named {name:?} already exists"
2775                    )));
2776                }
2777                table.schema_mut().foreign_keys.push(storage_fk);
2778            }
2779            spg_sql::ast::AlterTableTarget::DropForeignKey(name) => {
2780                let table = self
2781                    .active_catalog_mut()
2782                    .get_mut(&s.name)
2783                    .ok_or_else(|| {
2784                        EngineError::Storage(StorageError::TableNotFound {
2785                            name: s.name.clone(),
2786                        })
2787                    })?;
2788                let fks = &mut table.schema_mut().foreign_keys;
2789                let before = fks.len();
2790                fks.retain(|f| f.name.as_ref() != Some(&name));
2791                if fks.len() == before {
2792                    return Err(EngineError::Unsupported(alloc::format!(
2793                        "ALTER TABLE DROP CONSTRAINT: no FK named {name:?} on {:?}",
2794                        s.name
2795                    )));
2796                }
2797            }
2798        }
2799        Ok(QueryResult::CommandOk {
2800            affected: 0,
2801            modified_catalog: !self.in_transaction(),
2802        })
2803    }
2804
2805    fn exec_alter_index(
2806        &mut self,
2807        stmt: spg_sql::ast::AlterIndexStatement,
2808    ) -> Result<QueryResult, EngineError> {
2809        // Translate the optional SQL-side encoding choice into the
2810        // storage-side enum; the same SqlVecEncoding -> VecEncoding
2811        // bridge `column_type_to_data_type` uses.
2812        let spg_sql::ast::AlterIndexStatement {
2813            name: idx_name,
2814            target,
2815        } = stmt;
2816        let spg_sql::ast::AlterIndexTarget::Rebuild { encoding } = target;
2817        let target = encoding.map(|e| match e {
2818            SqlVecEncoding::F32 => VecEncoding::F32,
2819            SqlVecEncoding::Sq8 => VecEncoding::Sq8,
2820            SqlVecEncoding::F16 => VecEncoding::F16,
2821        });
2822        // Linear scan: index names are globally unique within a
2823        // catalog (enforced by add_nsw_index_inner) so the first
2824        // match is the only one. Save the table name to avoid
2825        // borrowing while we then take a mut borrow.
2826        let table_name = {
2827            let cat = self.active_catalog();
2828            let mut found: Option<String> = None;
2829            for tname in cat.table_names() {
2830                if let Some(t) = cat.get(&tname)
2831                    && t.indices().iter().any(|i| i.name == idx_name)
2832                {
2833                    found = Some(tname);
2834                    break;
2835                }
2836            }
2837            found.ok_or_else(|| {
2838                EngineError::Storage(StorageError::IndexNotFound {
2839                    name: idx_name.clone(),
2840                })
2841            })?
2842        };
2843        let table = self
2844            .active_catalog_mut()
2845            .get_mut(&table_name)
2846            .expect("table found above");
2847        table.rebuild_nsw_index(&idx_name, target)?;
2848        // v6.3.1 — ALTER INDEX REBUILD potentially with new encoding
2849        // changes cost characteristics; evict any cached plans.
2850        self.plan_cache.evict_referencing(&table_name);
2851        Ok(QueryResult::CommandOk {
2852            affected: 0,
2853            modified_catalog: !self.in_transaction(),
2854        })
2855    }
2856
2857    fn exec_create_index(
2858        &mut self,
2859        stmt: CreateIndexStatement,
2860    ) -> Result<QueryResult, EngineError> {
2861        let table = self
2862            .active_catalog_mut()
2863            .get_mut(&stmt.table)
2864            .ok_or_else(|| {
2865                EngineError::Storage(StorageError::TableNotFound {
2866                    name: stmt.table.clone(),
2867                })
2868            })?;
2869        // `IF NOT EXISTS` reduces DuplicateIndex to a no-op CommandOk.
2870        if stmt.if_not_exists && table.indices().iter().any(|i| i.name == stmt.name) {
2871            return Ok(QueryResult::CommandOk {
2872                affected: 0,
2873                modified_catalog: false,
2874            });
2875        }
2876        let table_name = stmt.table.clone();
2877        // v6.8.0 — resolve INCLUDE column names to positions. Done
2878        // before `add_index` so a typo error surfaces before any
2879        // catalog mutation lands.
2880        let included_positions: Vec<usize> = if stmt.included_columns.is_empty() {
2881            Vec::new()
2882        } else {
2883            let schema = table.schema();
2884            stmt.included_columns
2885                .iter()
2886                .map(|c| {
2887                    schema.column_position(c).ok_or_else(|| {
2888                        EngineError::Storage(StorageError::ColumnNotFound {
2889                            column: c.clone(),
2890                        })
2891                    })
2892                })
2893                .collect::<Result<Vec<_>, _>>()?
2894        };
2895        match stmt.method {
2896            IndexMethod::BTree => table.add_index(stmt.name.clone(), &stmt.column)?,
2897            IndexMethod::Hnsw => {
2898                if !included_positions.is_empty() {
2899                    return Err(EngineError::Unsupported(
2900                        "INCLUDE columns are not supported on HNSW indexes".into(),
2901                    ));
2902                }
2903                table.add_nsw_index(stmt.name.clone(), &stmt.column, spg_storage::NSW_DEFAULT_M)?;
2904            }
2905            // v6.7.1 — BRIN. Pure metadata; no in-memory data.
2906            IndexMethod::Brin => {
2907                if !included_positions.is_empty() {
2908                    return Err(EngineError::Unsupported(
2909                        "INCLUDE columns are not supported on BRIN indexes".into(),
2910                    ));
2911                }
2912                table.add_brin_index(stmt.name.clone(), &stmt.column)?;
2913            }
2914        }
2915        if !included_positions.is_empty()
2916            && let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name)
2917        {
2918            idx.included_columns = included_positions;
2919        }
2920        // v6.8.1 — persist partial-index predicate. Stored as the
2921        // expression's Display form so the catalog snapshot stays
2922        // pure (storage has no spg-sql dependency). The runtime
2923        // maintenance path treats partial indexes identically to
2924        // full indexes for v6.8.1 (over-maintenance is safe; the
2925        // planner-side "use partial when query WHERE implies the
2926        // predicate" pass is STABILITY carve-out).
2927        if let Some(pred_expr) = &stmt.partial_predicate {
2928            let canonical = pred_expr.to_string();
2929            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2930                return Err(EngineError::Unsupported(
2931                    "WHERE predicates are not supported on HNSW or BRIN indexes".into(),
2932                ));
2933            }
2934            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
2935                idx.partial_predicate = Some(canonical);
2936            }
2937        }
2938        // v6.8.2 — persist expression index key. Same Display-form
2939        // storage; the runtime maintenance pass evaluates each
2940        // row's expression to derive the index key, but for v6.8.2
2941        // the engine falls through to the bare-column-reference
2942        // path and the expression is preserved for format-layer
2943        // round-trip + future planner work. Carved-out in
2944        // STABILITY § "Out of v6.8".
2945        if let Some(key_expr) = &stmt.expression {
2946            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2947                return Err(EngineError::Unsupported(
2948                    "Expression keys are not supported on HNSW or BRIN indexes".into(),
2949                ));
2950            }
2951            let canonical = key_expr.to_string();
2952            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
2953                idx.expression = Some(canonical);
2954            }
2955        }
2956        // v6.3.1 — adding an index can change the optimal plan for
2957        // any cached query that references this table.
2958        self.plan_cache.evict_referencing(&table_name);
2959        Ok(QueryResult::CommandOk {
2960            affected: 0,
2961            modified_catalog: !self.in_transaction(),
2962        })
2963    }
2964
2965    fn exec_create_table(
2966        &mut self,
2967        stmt: CreateTableStatement,
2968    ) -> Result<QueryResult, EngineError> {
2969        if stmt.if_not_exists && self.active_catalog().get(&stmt.name).is_some() {
2970            return Ok(QueryResult::CommandOk {
2971                affected: 0,
2972                modified_catalog: false,
2973            });
2974        }
2975        let table_name = stmt.name.clone();
2976        let cols = stmt
2977            .columns
2978            .into_iter()
2979            .map(column_def_to_schema)
2980            .collect::<Result<Vec<_>, _>>()?;
2981        // v7.6.1 — resolve every FK in the statement against the
2982        // already-known catalog. Validates: parent table exists,
2983        // parent column names exist, arity matches, parent columns
2984        // have a PK / UNIQUE index. Self-referencing FKs (parent
2985        // table == this table) resolve against the column list we
2986        // just built — they don't need the catalog yet.
2987        let mut fks: Vec<spg_storage::ForeignKeyConstraint> =
2988            Vec::with_capacity(stmt.foreign_keys.len());
2989        for fk in stmt.foreign_keys {
2990            fks.push(resolve_foreign_key(
2991                &table_name,
2992                &cols,
2993                fk,
2994                self.active_catalog(),
2995            )?);
2996        }
2997        let mut schema = TableSchema::new(table_name, cols);
2998        schema.foreign_keys = fks;
2999        self.active_catalog_mut().create_table(schema)?;
3000        Ok(QueryResult::CommandOk {
3001            affected: 0,
3002            modified_catalog: !self.in_transaction(),
3003        })
3004    }
3005
3006    fn exec_insert(&mut self, stmt: InsertStatement) -> Result<QueryResult, EngineError> {
3007        let table = self
3008            .active_catalog_mut()
3009            .get_mut(&stmt.table)
3010            .ok_or_else(|| {
3011                EngineError::Storage(StorageError::TableNotFound {
3012                    name: stmt.table.clone(),
3013                })
3014            })?;
3015        // v3.1.5: clone the columns vector only (not the whole
3016        // TableSchema — saves one String alloc for the table name).
3017        // We need an owned snapshot because we'll call `table.insert`
3018        // (mutable borrow on `table`) inside the row loop while
3019        // reading schema fields.
3020        let column_meta: Vec<ColumnSchema> = table.schema().columns.clone();
3021        let schema_cols_len = column_meta.len();
3022        // Build a permutation `tuple_pos[c] = Some(j)` meaning schema
3023        // column `c` is filled from the `j`-th tuple slot; `None` means
3024        // "fill with NULL". Validated once and reused for every row.
3025        let tuple_pos: Option<Vec<Option<usize>>> = match &stmt.columns {
3026            None => None, // 1-1 mapping, fast path
3027            Some(cols) => {
3028                let mut map = alloc::vec![None; schema_cols_len];
3029                for (j, name) in cols.iter().enumerate() {
3030                    let idx = column_meta
3031                        .iter()
3032                        .position(|c| c.name == *name)
3033                        .ok_or_else(|| {
3034                            EngineError::Eval(EvalError::ColumnNotFound { name: name.clone() })
3035                        })?;
3036                    if map[idx].is_some() {
3037                        return Err(EngineError::Storage(StorageError::ArityMismatch {
3038                            expected: schema_cols_len,
3039                            actual: cols.len(),
3040                        }));
3041                    }
3042                    map[idx] = Some(j);
3043                }
3044                // Omitted columns must either be nullable, carry a
3045                // DEFAULT, or be AUTO_INCREMENT. Catch NOT NULL
3046                // omissions up front so the WAL stays clean.
3047                for (i, col) in column_meta.iter().enumerate() {
3048                    if map[i].is_none()
3049                        && !col.nullable
3050                        && col.default.is_none()
3051                        && !col.auto_increment
3052                    {
3053                        return Err(EngineError::Storage(StorageError::NullInNotNull {
3054                            column: col.name.clone(),
3055                        }));
3056                    }
3057                }
3058                Some(map)
3059            }
3060        };
3061        let expected_tuple_len = stmt.columns.as_ref().map_or(schema_cols_len, Vec::len);
3062        // v7.6.2 — snapshot this table's FK list before the
3063        // mutable-borrow window so we can run parent lookups
3064        // against the immutable catalog after parsing. Empty vec is
3065        // the no-FK fast path; clone cost is O(fks * arity) which
3066        // is < 100 ns for typical schemas.
3067        let fks = table.schema().foreign_keys.clone();
3068        let mut affected = 0usize;
3069        // Stage 1 — parse + AUTO_INC + coerce all rows under the
3070        // single mutable borrow.
3071        let mut all_values: Vec<Vec<Value>> = Vec::with_capacity(stmt.rows.len());
3072        for tuple in stmt.rows {
3073            if tuple.len() != expected_tuple_len {
3074                return Err(EngineError::Storage(StorageError::ArityMismatch {
3075                    expected: expected_tuple_len,
3076                    actual: tuple.len(),
3077                }));
3078            }
3079            // Fast path: no column-list permutation → tuple slot j
3080            // maps to schema column j. We can zip schema with tuple
3081            // and skip the `raw_tuple` staging allocation entirely.
3082            let values: Vec<Value> = if let Some(map) = &tuple_pos {
3083                // Permuted path: still need raw_tuple to index by `map[i]`.
3084                let raw_tuple: Vec<Value> = tuple
3085                    .into_iter()
3086                    .map(literal_expr_to_value)
3087                    .collect::<Result<_, _>>()?;
3088                let mut out = Vec::with_capacity(schema_cols_len);
3089                for (i, col) in column_meta.iter().enumerate() {
3090                    let mut raw = match map[i] {
3091                        Some(j) => raw_tuple[j].clone(),
3092                        None => col.default.clone().unwrap_or(Value::Null),
3093                    };
3094                    if col.auto_increment && raw.is_null() {
3095                        let next = table.next_auto_value(i).ok_or_else(|| {
3096                            EngineError::Unsupported(alloc::format!(
3097                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3098                                col.name
3099                            ))
3100                        })?;
3101                        raw = Value::BigInt(next);
3102                    }
3103                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3104                }
3105                out
3106            } else {
3107                // 1-1 mapping fast path: single Vec alloc, no raw_tuple.
3108                let mut out = Vec::with_capacity(schema_cols_len);
3109                for (i, (col, expr)) in column_meta.iter().zip(tuple).enumerate() {
3110                    let mut raw = literal_expr_to_value(expr)?;
3111                    if col.auto_increment && raw.is_null() {
3112                        let next = table.next_auto_value(i).ok_or_else(|| {
3113                            EngineError::Unsupported(alloc::format!(
3114                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3115                                col.name
3116                            ))
3117                        })?;
3118                        raw = Value::BigInt(next);
3119                    }
3120                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3121                }
3122                out
3123            };
3124            all_values.push(values);
3125        }
3126        // Stage 2 — FK enforcement on the immutable catalog.
3127        // Non-lexical lifetimes release the mutable borrow on
3128        // `table` here since stage 1 was the last use. The
3129        // parent-table lookup runs before any row is committed.
3130        let _ = table;
3131        if !fks.is_empty() {
3132            enforce_fk_inserts(self.active_catalog(), &stmt.table, &fks, &all_values)?;
3133        }
3134        // Stage 3 — insert all rows under a fresh mutable borrow.
3135        let table = self
3136            .active_catalog_mut()
3137            .get_mut(&stmt.table)
3138            .ok_or_else(|| {
3139                EngineError::Storage(StorageError::TableNotFound {
3140                    name: stmt.table.clone(),
3141                })
3142            })?;
3143        for values in all_values {
3144            table.insert(Row::new(values))?;
3145            affected += 1;
3146        }
3147        // v6.2.1 — auto-analyze: track per-table modified-row
3148        // counter so the background sweep can decide when to
3149        // re-ANALYZE. Cheap path on the autocommit-wrap hot loop
3150        // — one BTreeMap entry update per INSERT batch.
3151        if !self.in_transaction() && affected > 0 {
3152            self.statistics
3153                .record_modifications(&stmt.table, affected as u64);
3154        }
3155        Ok(QueryResult::CommandOk {
3156            affected,
3157            modified_catalog: !self.in_transaction(),
3158        })
3159    }
3160
3161    /// v4.5: SELECT with cooperative cancellation. The token is
3162    /// honoured between UNION peers and inside the bare-SELECT row
3163    /// loop; HNSW kNN graph walks and the aggregate executor don't
3164    /// honour it yet (deferred — those paths bound their work
3165    /// internally by `LIMIT k` and `GROUP BY` cardinality).
3166    /// v6.10.2 — cold-tier time-travel scan. Resolves the segment
3167    /// by id, decodes each row body against the table's current
3168    /// schema, applies the SELECT's projection + optional WHERE +
3169    /// optional LIMIT, returns a `Rows` result. JOINs / aggregates
3170    /// / ORDER BY are unsupported on this path (STABILITY carve-
3171    /// out); operators wanting them should restore the segment
3172    /// into a regular table first.
3173    fn exec_select_as_of_segment(
3174        &self,
3175        stmt: &SelectStatement,
3176        from: &spg_sql::ast::FromClause,
3177        segment_id: u32,
3178    ) -> Result<QueryResult, EngineError> {
3179        // v6.10.2 scope: no joins, no aggregates, no ORDER BY,
3180        // no GROUP BY / HAVING / UNION / OFFSET / DISTINCT.
3181        if !from.joins.is_empty()
3182            || stmt.group_by.is_some()
3183            || stmt.having.is_some()
3184            || !stmt.unions.is_empty()
3185            || !stmt.order_by.is_empty()
3186            || stmt.offset.is_some()
3187            || stmt.distinct
3188            || aggregate::uses_aggregate(stmt)
3189        {
3190            return Err(EngineError::Unsupported(
3191                "AS OF SEGMENT supports SELECT projection + WHERE + LIMIT only \
3192                 (joins / aggregates / ORDER BY are STABILITY § \"Out of v6.10\")"
3193                    .into(),
3194            ));
3195        }
3196        let table = self
3197            .active_catalog()
3198            .get(&from.primary.name)
3199            .ok_or_else(|| StorageError::TableNotFound {
3200                name: from.primary.name.clone(),
3201            })?;
3202        let schema = table.schema().clone();
3203        let schema_cols = &schema.columns;
3204        let alias = from
3205            .primary
3206            .alias
3207            .as_deref()
3208            .unwrap_or(from.primary.name.as_str());
3209        let ctx = EvalContext::new(schema_cols, Some(alias));
3210        let seg = self
3211            .active_catalog()
3212            .cold_segment(segment_id)
3213            .ok_or_else(|| {
3214                EngineError::Unsupported(alloc::format!(
3215                    "AS OF SEGMENT: cold segment {segment_id} not registered"
3216                ))
3217            })?;
3218        let mut out_rows: Vec<Row> = Vec::new();
3219        let mut limit_remaining: Option<usize> =
3220            stmt.limit.as_ref().and_then(|n| usize::try_from(*n).ok());
3221        for (_key, body) in seg.scan() {
3222            let (row, _consumed) = spg_storage::decode_row_body_dense(&body, &schema)
3223                .map_err(EngineError::Storage)?;
3224            if let Some(where_expr) = &stmt.where_ {
3225                let cond = self.eval_expr_simple(where_expr, &row, &ctx)?;
3226                if !matches!(cond, Value::Bool(true)) {
3227                    continue;
3228                }
3229            }
3230            // Projection.
3231            let projected = self.project_row_simple(&row, &stmt.items, schema_cols, alias)?;
3232            out_rows.push(projected);
3233            if let Some(rem) = limit_remaining.as_mut() {
3234                if *rem == 0 {
3235                    out_rows.pop();
3236                    break;
3237                }
3238                *rem -= 1;
3239            }
3240        }
3241        // Output column schema: derive from SELECT items.
3242        let columns = self.derive_output_columns(&stmt.items, schema_cols, alias);
3243        Ok(QueryResult::Rows {
3244            columns,
3245            rows: out_rows,
3246        })
3247    }
3248
3249    /// v6.10.2 — simple-path WHERE eval that doesn't go through
3250    /// the correlated-subquery / Memoize machinery. AS OF SEGMENT
3251    /// scan paths predicate against a snapshot frozen segment, no
3252    /// cross-row state.
3253    fn eval_expr_simple(
3254        &self,
3255        expr: &Expr,
3256        row: &Row,
3257        ctx: &EvalContext,
3258    ) -> Result<Value, EngineError> {
3259        let cancel = CancelToken::none();
3260        self.eval_expr_with_correlated(expr, row, ctx, cancel, None)
3261    }
3262
3263    /// v6.10.2 — projection for AS OF SEGMENT. Resolves
3264    /// `SelectItem::Wildcard` to all schema columns and
3265    /// `SelectItem::Expr` via the regular eval path.
3266    fn project_row_simple(
3267        &self,
3268        row: &Row,
3269        items: &[SelectItem],
3270        schema_cols: &[ColumnSchema],
3271        alias: &str,
3272    ) -> Result<Row, EngineError> {
3273        let ctx = EvalContext::new(schema_cols, Some(alias));
3274        let cancel = CancelToken::none();
3275        let mut out_vals = Vec::new();
3276        for item in items {
3277            match item {
3278                SelectItem::Wildcard => {
3279                    out_vals.extend(row.values.iter().cloned());
3280                }
3281                SelectItem::Expr { expr, .. } => {
3282                    let v = self.eval_expr_with_correlated(expr, row, &ctx, cancel, None)?;
3283                    out_vals.push(v);
3284                }
3285            }
3286        }
3287        Ok(Row::new(out_vals))
3288    }
3289
3290    /// v6.10.2 — derive the output `ColumnSchema` list for an
3291    /// AS OF SEGMENT projection. Wildcards take the full schema;
3292    /// expressions take the alias if present or a synthetic
3293    /// `?column?` (PG convention) otherwise.
3294    fn derive_output_columns(
3295        &self,
3296        items: &[SelectItem],
3297        schema_cols: &[ColumnSchema],
3298        _alias: &str,
3299    ) -> Vec<ColumnSchema> {
3300        let mut out = Vec::new();
3301        for item in items {
3302            match item {
3303                SelectItem::Wildcard => {
3304                    out.extend(schema_cols.iter().cloned());
3305                }
3306                SelectItem::Expr { alias, .. } => {
3307                    let name = alias
3308                        .clone()
3309                        .unwrap_or_else(|| "?column?".to_string());
3310                    // Default to Text; the caller's row values
3311                    // carry the actual type. v6.10.2 scope.
3312                    out.push(ColumnSchema::new(name, DataType::Text, true));
3313                }
3314            }
3315        }
3316        out
3317    }
3318
3319    fn exec_select_cancel(
3320        &self,
3321        stmt: &SelectStatement,
3322        cancel: CancelToken<'_>,
3323    ) -> Result<QueryResult, EngineError> {
3324        cancel.check()?;
3325        // v6.10.2 — cold-tier time-travel short-circuit. When the
3326        // primary TableRef carries `AS OF SEGMENT '<id>'`, run a
3327        // dedicated cold-segment scan instead of the regular
3328        // hot+index path. The scope is intentionally narrow for
3329        // v6.10.2 — bare `SELECT * FROM <t> AS OF SEGMENT 'id'`,
3330        // optionally with a single-column-equality WHERE. JOINs /
3331        // aggregates / ORDER BY / subqueries on top of a time-
3332        // travelled scan are STABILITY § "Out of v6.10".
3333        if let Some(from) = &stmt.from
3334            && let Some(seg_id) = from.primary.as_of_segment
3335        {
3336            return self.exec_select_as_of_segment(stmt, from, seg_id);
3337        }
3338        // v6.2.0 / v6.5.0 — virtual-table short-circuits. Detected
3339        // pre-CTE because they don't read from the catalog and
3340        // shouldn't participate in regular FROM resolution.
3341        if let Some(from) = &stmt.from
3342            && from.joins.is_empty()
3343            && stmt.where_.is_none()
3344            && stmt.group_by.is_none()
3345            && stmt.having.is_none()
3346            && stmt.unions.is_empty()
3347            && stmt.order_by.is_empty()
3348            && stmt.limit.is_none()
3349            && stmt.offset.is_none()
3350            && !stmt.distinct
3351            && stmt.items.iter().all(|i| matches!(i, SelectItem::Wildcard))
3352        {
3353            let lower = from.primary.name.to_ascii_lowercase();
3354            match lower.as_str() {
3355                "spg_statistic" => return Ok(self.exec_spg_statistic()),
3356                // v6.5.0 — observability v2 virtual tables.
3357                "spg_stat_replication" => return Ok(self.exec_spg_stat_replication()),
3358                "spg_stat_segment" => return Ok(self.exec_spg_stat_segment()),
3359                "spg_stat_query" => return Ok(self.exec_spg_stat_query()),
3360                "spg_stat_activity" => return Ok(self.exec_spg_stat_activity()),
3361                "spg_audit_chain" => return Ok(self.exec_spg_audit_chain()),
3362                "spg_audit_verify" => return Ok(self.exec_spg_audit_verify()),
3363                "spg_table_ddl" => return Ok(self.exec_spg_table_ddl()),
3364                "spg_role_ddl" => return Ok(self.exec_spg_role_ddl()),
3365                "spg_database_ddl" => return Ok(self.exec_spg_database_ddl()),
3366                _ => {}
3367            }
3368        }
3369        // v4.11: CTEs materialise into a temporary enriched catalog
3370        // *before* anything else — the body SELECT can then refer
3371        // to CTE names via the regular FROM-clause resolution.
3372        // Uncorrelated only: each CTE body runs once against the
3373        // current catalog, not against later CTEs' results (left-
3374        // to-right materialisation would relax this, but we keep
3375        // it simple for v4.11 MVP).
3376        if !stmt.ctes.is_empty() {
3377            return self.exec_with_ctes(stmt, cancel);
3378        }
3379        // v4.10: subqueries (uncorrelated) are resolved here, before
3380        // the executor sees the row loop. We clone the statement so
3381        // we can mutate without disturbing the caller's AST — most
3382        // queries pass through with no subquery nodes and the clone
3383        // is cheap; with subqueries the materialisation cost
3384        // dominates anyway.
3385        let mut stmt_owned;
3386        let stmt_ref: &SelectStatement = if expr_tree_has_subquery(stmt) {
3387            stmt_owned = stmt.clone();
3388            self.resolve_select_subqueries(&mut stmt_owned, cancel)?;
3389            &stmt_owned
3390        } else {
3391            stmt
3392        };
3393        if stmt_ref.unions.is_empty() {
3394            return self.exec_bare_select_cancel(stmt_ref, cancel);
3395        }
3396        // UNION path: clone-strip the head into a bare block (its own
3397        // DISTINCT and any inner ORDER BY are dropped by parser rule —
3398        // the wrapper SelectStatement carries them), execute, then chain
3399        // peers with left-associative dedup semantics.
3400        let mut head = stmt_ref.clone();
3401        head.unions = Vec::new();
3402        head.order_by = Vec::new();
3403        head.limit = None;
3404        let QueryResult::Rows { columns, mut rows } =
3405            self.exec_bare_select_cancel(&head, cancel)?
3406        else {
3407            unreachable!("bare SELECT cannot return CommandOk")
3408        };
3409        for (kind, peer) in &stmt_ref.unions {
3410            let QueryResult::Rows {
3411                columns: peer_cols,
3412                rows: peer_rows,
3413            } = self.exec_bare_select_cancel(peer, cancel)?
3414            else {
3415                unreachable!("bare SELECT cannot return CommandOk")
3416            };
3417            if peer_cols.len() != columns.len() {
3418                return Err(EngineError::Unsupported(alloc::format!(
3419                    "UNION arity mismatch: head has {} columns, peer has {}",
3420                    columns.len(),
3421                    peer_cols.len()
3422                )));
3423            }
3424            rows.extend(peer_rows);
3425            if matches!(kind, UnionKind::Distinct) {
3426                rows = dedup_rows(rows);
3427            }
3428        }
3429        // ORDER BY at the top of a UNION applies to the combined result.
3430        // Eval against the projected schema (NOT the source table).
3431        if !stmt.order_by.is_empty() {
3432            let synth_ctx = EvalContext::new(&columns, None);
3433            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3434            let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(rows.len());
3435            for r in rows {
3436                let keys = build_order_keys(&stmt.order_by, &r, &synth_ctx)?;
3437                tagged.push((keys, r));
3438            }
3439            sort_by_keys(&mut tagged, &descs);
3440            rows = tagged.into_iter().map(|(_, r)| r).collect();
3441        }
3442        apply_offset_and_limit(&mut rows, stmt.offset, stmt.limit);
3443        Ok(QueryResult::Rows { columns, rows })
3444    }
3445
3446    #[allow(clippy::too_many_lines)]
3447    #[allow(clippy::too_many_lines)] // huge match — splitting fragments the planner
3448    fn exec_bare_select_cancel(
3449        &self,
3450        stmt: &SelectStatement,
3451        cancel: CancelToken<'_>,
3452    ) -> Result<QueryResult, EngineError> {
3453        // v4.12: window-function path. When the projection contains
3454        // any `name(args) OVER (...)` we route to the dedicated
3455        // executor — partition + sort + per-row window value before
3456        // the regular projection.
3457        if select_has_window(stmt) {
3458            return self.exec_select_with_window(stmt, cancel);
3459        }
3460        // Constant SELECT (no FROM) — evaluate each item once against an
3461        // empty dummy row. Useful for `SELECT 1`, `SELECT coalesce(...)`,
3462        // `SELECT '7'::INT`. Column references will surface as
3463        // ColumnNotFound on eval since the schema is empty.
3464        let Some(from) = &stmt.from else {
3465            let empty_schema: Vec<ColumnSchema> = Vec::new();
3466            let ctx = EvalContext::new(&empty_schema, None);
3467            let projection = build_projection(&stmt.items, &empty_schema, "")?;
3468            let dummy_row = Row::new(Vec::new());
3469            let mut values = Vec::with_capacity(projection.len());
3470            for p in &projection {
3471                values.push(eval::eval_expr(&p.expr, &dummy_row, &ctx)?);
3472            }
3473            let columns: Vec<ColumnSchema> = projection
3474                .into_iter()
3475                .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3476                .collect();
3477            return Ok(QueryResult::Rows {
3478                columns,
3479                rows: alloc::vec![Row::new(values)],
3480            });
3481        };
3482        // Multi-table FROM (one or more joined peers) goes through the
3483        // nested-loop join executor. Single-table FROM stays on the
3484        // existing scan + index-seek path.
3485        if !from.joins.is_empty() {
3486            return self.exec_joined_select(stmt, from);
3487        }
3488        let primary = &from.primary;
3489        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
3490            StorageError::TableNotFound {
3491                name: primary.name.clone(),
3492            }
3493        })?;
3494        let schema_cols = &table.schema().columns;
3495        // The qualifier accepted on column refs is the alias (if any) else the
3496        // bare table name.
3497        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
3498        let ctx = EvalContext::new(schema_cols, Some(alias));
3499
3500        // NSW kNN planner: `ORDER BY col <-> literal LIMIT k` with no
3501        // WHERE and an NSW index on `col` skips the full scan. The
3502        // walk returns rows already in ascending-distance order, so
3503        // ORDER BY / LIMIT are honoured implicitly.
3504        if let Some(nsw_rows) = try_nsw_knn(stmt, table, schema_cols, alias) {
3505            return materialise_in_order(stmt, table, schema_cols, alias, &nsw_rows);
3506        }
3507
3508        // Index seek: if WHERE is `col = literal` (or commuted) and the
3509        // referenced column has an index, dispatch each locator through
3510        // the catalog (hot tier → borrow, cold tier → page-read +
3511        // decode) and iterate just those rows. Otherwise fall back to a
3512        // full scan over the hot tier (cold-tier rows are only reached
3513        // via index seek in v5.1 — full table scans against cold-tier
3514        // data ship in v5.2 with the freezer's per-segment scan API).
3515        let indexed_rows: Option<Vec<Cow<'_, Row>>> = stmt
3516            .where_
3517            .as_ref()
3518            .and_then(|w| try_index_seek(w, schema_cols, self.active_catalog(), table, alias));
3519
3520        // Aggregate path: filter rows first, then hand off to the
3521        // aggregate executor which does its own projection + ORDER BY.
3522        if aggregate::uses_aggregate(stmt) {
3523            let mut filtered: Vec<&Row> = Vec::new();
3524            // v6.2.6 — Memoize: per-query LRU cache for correlated
3525            // scalar subqueries. Fresh per row-loop entry so each
3526            // SELECT execution gets an isolated cache.
3527            let mut memo = memoize::MemoizeCache::new();
3528            if let Some(rows) = &indexed_rows {
3529                for cow in rows {
3530                    let row = cow.as_ref();
3531                    if let Some(where_expr) = &stmt.where_ {
3532                        let cond = self.eval_expr_with_correlated(
3533                            where_expr,
3534                            row,
3535                            &ctx,
3536                            cancel,
3537                            Some(&mut memo),
3538                        )?;
3539                        if !matches!(cond, Value::Bool(true)) {
3540                            continue;
3541                        }
3542                    }
3543                    filtered.push(row);
3544                }
3545            } else {
3546                for i in 0..table.row_count() {
3547                    let row = &table.rows()[i];
3548                    if let Some(where_expr) = &stmt.where_ {
3549                        let cond = self.eval_expr_with_correlated(
3550                            where_expr,
3551                            row,
3552                            &ctx,
3553                            cancel,
3554                            Some(&mut memo),
3555                        )?;
3556                        if !matches!(cond, Value::Bool(true)) {
3557                            continue;
3558                        }
3559                    }
3560                    filtered.push(row);
3561                }
3562            }
3563            let mut agg = aggregate::run(stmt, &filtered, schema_cols, Some(alias))?;
3564            apply_offset_and_limit(&mut agg.rows, stmt.offset, stmt.limit);
3565            return Ok(QueryResult::Rows {
3566                columns: agg.columns,
3567                rows: agg.rows,
3568            });
3569        }
3570
3571        let projection = build_projection(&stmt.items, schema_cols, alias)?;
3572
3573        // Materialise the filter pass into `(order_key, projected_row)`
3574        // tuples. The order key is `None` when there's no ORDER BY clause.
3575        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
3576        // v6.2.6 — Memoize per-row WHERE eval shares one cache.
3577        let mut memo = memoize::MemoizeCache::new();
3578        // Inline the per-row work in a closure so the indexed and full-
3579        // scan branches share the body.
3580        let mut process_row = |row: &Row, loop_idx: usize| -> Result<(), EngineError> {
3581            if loop_idx.is_multiple_of(256) {
3582                cancel.check()?;
3583            }
3584            if let Some(where_expr) = &stmt.where_ {
3585                let cond = self.eval_expr_with_correlated(
3586                    where_expr,
3587                    row,
3588                    &ctx,
3589                    cancel,
3590                    Some(&mut memo),
3591                )?;
3592                if !matches!(cond, Value::Bool(true)) {
3593                    return Ok(());
3594                }
3595            }
3596            let mut values = Vec::with_capacity(projection.len());
3597            for p in &projection {
3598                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
3599            }
3600            let order_keys = if stmt.order_by.is_empty() {
3601                Vec::new()
3602            } else {
3603                build_order_keys(&stmt.order_by, row, &ctx)?
3604            };
3605            tagged.push((order_keys, Row::new(values)));
3606            Ok(())
3607        };
3608        if let Some(rows) = &indexed_rows {
3609            for (loop_idx, cow) in rows.iter().enumerate() {
3610                process_row(cow.as_ref(), loop_idx)?;
3611            }
3612        } else {
3613            for i in 0..table.row_count() {
3614                process_row(&table.rows()[i], i)?;
3615            }
3616        }
3617
3618        if !stmt.order_by.is_empty() {
3619            // Partial-sort fast path: when LIMIT is small relative to
3620            // the row count, select_nth_unstable + sort just the
3621            // prefix is O(n + k log k) instead of O(n log n). DISTINCT
3622            // requires the full sort because de-dup happens after.
3623            let keep = if stmt.distinct {
3624                None
3625            } else {
3626                stmt.limit
3627                    .map(|l| l as usize + stmt.offset.map_or(0, |o| o as usize))
3628            };
3629            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3630            partial_sort_tagged(&mut tagged, keep, &descs);
3631        }
3632
3633        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
3634        if stmt.distinct {
3635            output_rows = dedup_rows(output_rows);
3636        }
3637        apply_offset_and_limit(&mut output_rows, stmt.offset, stmt.limit);
3638
3639        let columns: Vec<ColumnSchema> = projection
3640            .into_iter()
3641            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3642            .collect();
3643
3644        Ok(QueryResult::Rows {
3645            columns,
3646            rows: output_rows,
3647        })
3648    }
3649
3650    /// Multi-table SELECT executor (one or more JOIN peers).
3651    ///
3652    /// v1.10 builds the joined row set up-front via nested-loop joins,
3653    /// then runs WHERE + projection + ORDER BY against the combined
3654    /// rows. No index seek. Aggregates and DISTINCT still work because
3655    /// the executor delegates projection through the same shared paths.
3656    #[allow(clippy::too_many_lines)]
3657    fn exec_joined_select(
3658        &self,
3659        stmt: &SelectStatement,
3660        from: &FromClause,
3661    ) -> Result<QueryResult, EngineError> {
3662        // Resolve every table reference up front so we surface
3663        // TableNotFound before we start the cartesian work.
3664        let primary_table = self
3665            .active_catalog()
3666            .get(&from.primary.name)
3667            .ok_or_else(|| StorageError::TableNotFound {
3668                name: from.primary.name.clone(),
3669            })?;
3670        let primary_alias = from
3671            .primary
3672            .alias
3673            .as_deref()
3674            .unwrap_or(from.primary.name.as_str())
3675            .to_string();
3676        let mut joined_tables: Vec<(&Table, String, JoinKind, Option<&Expr>)> = Vec::new();
3677        for j in &from.joins {
3678            let t = self.active_catalog().get(&j.table.name).ok_or_else(|| {
3679                StorageError::TableNotFound {
3680                    name: j.table.name.clone(),
3681                }
3682            })?;
3683            let a = j
3684                .table
3685                .alias
3686                .as_deref()
3687                .unwrap_or(j.table.name.as_str())
3688                .to_string();
3689            joined_tables.push((t, a, j.kind, j.on.as_ref()));
3690        }
3691
3692        // Build the combined schema: composite "alias.col" names so the
3693        // qualified-column resolver can find anything by exact match.
3694        let mut combined_schema: Vec<ColumnSchema> = Vec::new();
3695        for col in &primary_table.schema().columns {
3696            combined_schema.push(ColumnSchema::new(
3697                alloc::format!("{primary_alias}.{}", col.name),
3698                col.ty,
3699                col.nullable,
3700            ));
3701        }
3702        for (t, a, _, _) in &joined_tables {
3703            for col in &t.schema().columns {
3704                combined_schema.push(ColumnSchema::new(
3705                    alloc::format!("{a}.{}", col.name),
3706                    col.ty,
3707                    col.nullable,
3708                ));
3709            }
3710        }
3711        let ctx = EvalContext::new(&combined_schema, None);
3712
3713        // Nested-loop join. Starting set: every primary row, padded with
3714        // (no joined columns yet).
3715        let mut working: Vec<Row> = primary_table.rows().iter().cloned().collect();
3716        let mut produced_len = primary_table.schema().columns.len();
3717        for (t, _, kind, on) in &joined_tables {
3718            let right_arity = t.schema().columns.len();
3719            let mut next: Vec<Row> = Vec::new();
3720            for left in &working {
3721                let mut left_matched = false;
3722                for right in t.rows() {
3723                    let mut combined_vals = left.values.clone();
3724                    combined_vals.extend(right.values.iter().cloned());
3725                    // Pad combined to the eventual full width so the
3726                    // partial schema still matches positions used by ON.
3727                    let combined = Row::new(combined_vals);
3728                    let keep = if let Some(on_expr) = on {
3729                        let cond = eval::eval_expr(on_expr, &combined, &ctx)?;
3730                        matches!(cond, Value::Bool(true))
3731                    } else {
3732                        // CROSS / comma-list: every pair survives.
3733                        true
3734                    };
3735                    if keep {
3736                        next.push(combined);
3737                        left_matched = true;
3738                    }
3739                }
3740                if !left_matched && matches!(kind, JoinKind::Left) {
3741                    // LEFT OUTER JOIN: emit the left row with NULLs on
3742                    // the right side when no peer matched.
3743                    let mut combined_vals = left.values.clone();
3744                    for _ in 0..right_arity {
3745                        combined_vals.push(Value::Null);
3746                    }
3747                    next.push(Row::new(combined_vals));
3748                }
3749            }
3750            working = next;
3751            produced_len += right_arity;
3752            debug_assert!(produced_len <= combined_schema.len());
3753        }
3754
3755        // WHERE filter against combined rows.
3756        let mut filtered: Vec<Row> = Vec::new();
3757        for row in working {
3758            if let Some(where_expr) = &stmt.where_ {
3759                let cond = eval::eval_expr(where_expr, &row, &ctx)?;
3760                if !matches!(cond, Value::Bool(true)) {
3761                    continue;
3762                }
3763            }
3764            filtered.push(row);
3765        }
3766
3767        // Aggregate path: handle GROUP BY / aggregate calls over the
3768        // joined+filtered rows.
3769        if aggregate::uses_aggregate(stmt) {
3770            let refs: Vec<&Row> = filtered.iter().collect();
3771            let mut agg = aggregate::run(stmt, &refs, &combined_schema, None)?;
3772            apply_offset_and_limit(&mut agg.rows, stmt.offset, stmt.limit);
3773            return Ok(QueryResult::Rows {
3774                columns: agg.columns,
3775                rows: agg.rows,
3776            });
3777        }
3778
3779        let projection = build_projection(&stmt.items, &combined_schema, "")?;
3780        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
3781        for row in &filtered {
3782            let mut values = Vec::with_capacity(projection.len());
3783            for p in &projection {
3784                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
3785            }
3786            let order_keys = if stmt.order_by.is_empty() {
3787                Vec::new()
3788            } else {
3789                build_order_keys(&stmt.order_by, row, &ctx)?
3790            };
3791            tagged.push((order_keys, Row::new(values)));
3792        }
3793        if !stmt.order_by.is_empty() {
3794            let keep = if stmt.distinct {
3795                None
3796            } else {
3797                stmt.limit
3798                    .map(|l| l as usize + stmt.offset.map_or(0, |o| o as usize))
3799            };
3800            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3801            partial_sort_tagged(&mut tagged, keep, &descs);
3802        }
3803        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
3804        if stmt.distinct {
3805            output_rows = dedup_rows(output_rows);
3806        }
3807        apply_offset_and_limit(&mut output_rows, stmt.offset, stmt.limit);
3808        let columns: Vec<ColumnSchema> = projection
3809            .into_iter()
3810            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3811            .collect();
3812        Ok(QueryResult::Rows {
3813            columns,
3814            rows: output_rows,
3815        })
3816    }
3817}
3818
3819/// One row-producing projection: an expression to evaluate, the resulting
3820/// column's user-visible name, its inferred type, and nullability.
3821#[derive(Debug, Clone)]
3822struct ProjectedItem {
3823    expr: Expr,
3824    output_name: String,
3825    ty: DataType,
3826    nullable: bool,
3827}
3828
3829/// Dedupe a row set, preserving first-seen order. `Row`'s `PartialEq` is
3830/// structural (`Vec<Value>` ⇒ pairwise `Value` equality), which gives SQL
3831/// `NULL = NULL → TRUE` and `NaN = NaN → FALSE`. The first agrees with
3832/// the spec's "two NULLs are not distinct"; the second is a tolerated
3833/// quirk for v1 (no NaN literals are reachable from the SQL surface).
3834fn dedup_rows(rows: Vec<Row>) -> Vec<Row> {
3835    let mut out: Vec<Row> = Vec::with_capacity(rows.len());
3836    for r in rows {
3837        if !out.iter().any(|seen| seen == &r) {
3838            out.push(r);
3839        }
3840    }
3841    out
3842}
3843
3844/// Coerce a `Value` to an `f64` sort key for ORDER BY. Numbers map directly;
3845/// NULL sorts last (treated as `+∞`); booleans are 0.0 / 1.0; text uses lex
3846/// order via the byte values; vectors are not sortable.
3847fn value_to_order_key(v: &Value) -> Result<f64, EngineError> {
3848    match v {
3849        Value::Null => Ok(f64::INFINITY),
3850        Value::SmallInt(n) => Ok(f64::from(*n)),
3851        Value::Int(n) => Ok(f64::from(*n)),
3852        Value::Date(d) => Ok(f64::from(*d)),
3853        #[allow(clippy::cast_precision_loss)]
3854        Value::Timestamp(t) => Ok(*t as f64),
3855        #[allow(clippy::cast_precision_loss)]
3856        Value::Numeric { scaled, scale } => {
3857            // Scaled integer / 10^scale, computed via f64 for sort
3858            // ordering only. Precision losses here only matter for
3859            // ORDER BY tie-breaks well past 15 significant digits.
3860            // `f64::powi` lives in std; we hand-roll the loop so the
3861            // no_std engine crate doesn't need it.
3862            let mut divisor = 1.0_f64;
3863            for _ in 0..*scale {
3864                divisor *= 10.0;
3865            }
3866            Ok((*scaled as f64) / divisor)
3867        }
3868        #[allow(clippy::cast_precision_loss)]
3869        Value::BigInt(n) => Ok(*n as f64),
3870        Value::Float(x) => Ok(*x),
3871        Value::Bool(b) => Ok(if *b { 1.0 } else { 0.0 }),
3872        Value::Text(s) => {
3873            // Lex order by codepoints — good enough for ORDER BY name.
3874            // Map first 8 bytes packed into u64 as a coarse key; ties fall to
3875            // partial_cmp Equal. v1.x can swap in a real string comparator.
3876            let mut key: u64 = 0;
3877            for &b in s.as_bytes().iter().take(8) {
3878                key = (key << 8) | u64::from(b);
3879            }
3880            #[allow(clippy::cast_precision_loss)]
3881            Ok(key as f64)
3882        }
3883        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
3884            Err(EngineError::Unsupported(
3885                "ORDER BY of a raw vector column is not meaningful — use `<->`".into(),
3886            ))
3887        }
3888        Value::Interval { .. } => Err(EngineError::Unsupported(
3889            "ORDER BY of an INTERVAL is not supported in v2.11 \
3890             (months vs micros has no single canonical ordering)"
3891                .into(),
3892        )),
3893        Value::Json(_) => Err(EngineError::Unsupported(
3894            "ORDER BY of a JSON value is not supported — cast the document to text first".into(),
3895        )),
3896        // v7.5.0 — Value is #[non_exhaustive]; future variants need
3897        // an explicit ORDER BY mapping. Surface as Unsupported until
3898        // engine support is added.
3899        _ => Err(EngineError::Unsupported(
3900            "ORDER BY of this value type is not supported".into(),
3901        )),
3902    }
3903}
3904
3905/// Try to plan a WHERE clause as an equality lookup against an existing
3906/// index. Returns the candidate row indices on success; `None` means the
3907/// caller should fall back to a full scan.
3908///
3909/// v0.8 recognises a single top-level `col = literal` (in either operand
3910/// order). AND chains and range scans land in later milestones.
3911/// Look for `ORDER BY col <dist-op> literal LIMIT k` against an
3912/// NSW-indexed vector column. Recognised distance ops: `<->` (L2),
3913/// `<#>` (inner product), `<=>` (cosine). When a WHERE clause is
3914/// present, the planner does an "over-fetch and filter" pass — it
3915/// asks the graph for `k * over_fetch` candidates, evaluates WHERE
3916/// against each, and trims back to `k`. Returns the row indices in
3917/// ascending-distance order when the plan applies.
3918fn try_nsw_knn(
3919    stmt: &SelectStatement,
3920    table: &Table,
3921    schema_cols: &[ColumnSchema],
3922    table_alias: &str,
3923) -> Option<Vec<usize>> {
3924    if stmt.distinct {
3925        return None;
3926    }
3927    let limit = usize::try_from(stmt.limit?).ok()?;
3928    if limit == 0 {
3929        return None;
3930    }
3931    // v6.4.0 — NSW kNN dispatch needs a single ORDER BY key on the
3932    // distance metric. Multi-key ORDER BY falls through to the
3933    // generic sort path.
3934    if stmt.order_by.len() != 1 {
3935        return None;
3936    }
3937    let order = &stmt.order_by[0];
3938    // NSW kNN returns rows ascending by distance — DESC inverts the
3939    // natural order, so the planner can't handle it without a sort
3940    // pass. Fall back to the generic ORDER BY path.
3941    if order.desc {
3942        return None;
3943    }
3944    let Expr::Binary { lhs, op, rhs } = &order.expr else {
3945        return None;
3946    };
3947    let metric = match op {
3948        BinOp::L2Distance => spg_storage::NswMetric::L2,
3949        BinOp::InnerProduct => spg_storage::NswMetric::InnerProduct,
3950        BinOp::CosineDistance => spg_storage::NswMetric::Cosine,
3951        _ => return None,
3952    };
3953    // Accept both `col <op> literal` and `literal <op> col`.
3954    let ((Expr::Column(col), literal) | (literal, Expr::Column(col))) =
3955        (lhs.as_ref(), rhs.as_ref())
3956    else {
3957        return None;
3958    };
3959    if let Some(q) = &col.qualifier
3960        && q != table_alias
3961    {
3962        return None;
3963    }
3964    let col_pos = schema_cols.iter().position(|s| s.name == col.name)?;
3965    let query = literal_to_vector(literal)?;
3966    let idx = spg_storage::nsw_index_on(table, col_pos)?;
3967    if let Some(where_expr) = &stmt.where_ {
3968        // Over-fetch and filter. The factor (10×) is a heuristic that
3969        // covers typical selectivity for the corpus tests; v2.x will
3970        // make it configurable.
3971        let over_fetch = limit.saturating_mul(10).max(NSW_OVER_FETCH_FLOOR);
3972        let candidates = spg_storage::nsw_query(table, &idx.name, &query, over_fetch, metric);
3973        let ctx = EvalContext::new(schema_cols, Some(table_alias));
3974        let mut kept: Vec<usize> = Vec::with_capacity(limit);
3975        for i in candidates {
3976            let row = &table.rows()[i];
3977            let cond = eval::eval_expr(where_expr, row, &ctx).ok()?;
3978            if matches!(cond, Value::Bool(true)) {
3979                kept.push(i);
3980                if kept.len() >= limit {
3981                    break;
3982                }
3983            }
3984        }
3985        Some(kept)
3986    } else {
3987        Some(spg_storage::nsw_query(
3988            table, &idx.name, &query, limit, metric,
3989        ))
3990    }
3991}
3992
3993/// Lower bound on the over-fetch pool when WHERE is present — even
3994/// for tiny `LIMIT 1` queries we keep enough candidates to absorb a
3995/// few WHERE rejections.
3996const NSW_OVER_FETCH_FLOOR: usize = 32;
3997
3998/// Pull a `Vec<f32>` out of a literal-or-cast expression. Returns
3999/// `None` for anything we can't fold at plan time.
4000fn literal_to_vector(e: &Expr) -> Option<Vec<f32>> {
4001    match e {
4002        Expr::Literal(Literal::Vector(v)) => Some(v.clone()),
4003        Expr::Cast { expr, .. } => literal_to_vector(expr),
4004        _ => None,
4005    }
4006}
4007
4008/// Materialise rows in a planner-supplied order (used by the NSW path)
4009/// without re-running ORDER BY. The projection + LIMIT slot mirror the
4010/// equivalent block in `exec_bare_select`.
4011fn materialise_in_order(
4012    stmt: &SelectStatement,
4013    table: &Table,
4014    schema_cols: &[ColumnSchema],
4015    table_alias: &str,
4016    ordered_rows: &[usize],
4017) -> Result<QueryResult, EngineError> {
4018    let ctx = EvalContext::new(schema_cols, Some(table_alias));
4019    let projection = build_projection(&stmt.items, schema_cols, table_alias)?;
4020    let mut output_rows: Vec<Row> = Vec::with_capacity(ordered_rows.len());
4021    for &i in ordered_rows {
4022        let row = &table.rows()[i];
4023        let mut values = Vec::with_capacity(projection.len());
4024        for p in &projection {
4025            values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4026        }
4027        output_rows.push(Row::new(values));
4028    }
4029    apply_offset_and_limit(&mut output_rows, stmt.offset, stmt.limit);
4030    let columns: Vec<ColumnSchema> = projection
4031        .into_iter()
4032        .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4033        .collect();
4034    Ok(QueryResult::Rows {
4035        columns,
4036        rows: output_rows,
4037    })
4038}
4039
4040fn try_index_seek<'a>(
4041    where_expr: &Expr,
4042    schema_cols: &[ColumnSchema],
4043    catalog: &'a Catalog,
4044    table: &'a Table,
4045    table_alias: &str,
4046) -> Option<Vec<Cow<'a, Row>>> {
4047    let Expr::Binary {
4048        lhs,
4049        op: BinOp::Eq,
4050        rhs,
4051    } = where_expr
4052    else {
4053        return None;
4054    };
4055    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4056        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4057    let idx = table.index_on(col_pos)?;
4058    let key = IndexKey::from_value(&value)?;
4059    let locators = idx.lookup_eq(&key);
4060    let table_name = table.schema().name.as_str();
4061    // v5.1: each locator dispatches to either the hot tier (zero-
4062    // copy borrow of `table.rows()[i]`) or a cold-tier segment
4063    // (one page read + dense row decode, ~µs scale). Cold rows are
4064    // returned as `Cow::Owned` so the caller's `&Row` iteration
4065    // doesn't see a tier distinction; pre-freezer (no cold
4066    // segments loaded) every locator is `Hot` and every entry is
4067    // `Cow::Borrowed` — identical cost to the pre-v5.1 path.
4068    let mut out: Vec<Cow<'a, Row>> = Vec::with_capacity(locators.len());
4069    for loc in locators {
4070        match *loc {
4071            spg_storage::RowLocator::Hot(i) => {
4072                if let Some(row) = table.rows().get(i) {
4073                    out.push(Cow::Borrowed(row));
4074                }
4075            }
4076            spg_storage::RowLocator::Cold { segment_id, .. } => {
4077                if let Some(row) = catalog.resolve_cold_locator(table_name, segment_id, &key) {
4078                    out.push(Cow::Owned(row));
4079                }
4080            }
4081        }
4082    }
4083    Some(out)
4084}
4085
4086/// v5.2.3: extract `(column_position, IndexKey)` when `where_expr`
4087/// is a simple `col = literal` predicate suitable for a `BTree` index
4088/// seek. Used by `exec_update_cancel` / `exec_delete_cancel` to
4089/// decide whether a write touches a cold-tier row (which requires
4090/// promote-on-write / shadow-on-delete) before falling through to
4091/// the hot-tier row walk.
4092///
4093/// Returns `None` for any predicate shape the planner can't push
4094/// down to an index seek — complex WHERE clauses always take the
4095/// hot-only path (cold rows are immutable to non-indexed writes
4096/// until a future scan-fanout sub-version).
4097fn try_pk_predicate(
4098    where_expr: &Expr,
4099    schema_cols: &[ColumnSchema],
4100    table_alias: &str,
4101) -> Option<(usize, IndexKey)> {
4102    let Expr::Binary {
4103        lhs,
4104        op: BinOp::Eq,
4105        rhs,
4106    } = where_expr
4107    else {
4108        return None;
4109    };
4110    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4111        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4112    let key = IndexKey::from_value(&value)?;
4113    Some((col_pos, key))
4114}
4115
4116fn resolve_col_literal_pair(
4117    col_side: &Expr,
4118    lit_side: &Expr,
4119    schema_cols: &[ColumnSchema],
4120    table_alias: &str,
4121) -> Option<(usize, Value)> {
4122    let Expr::Column(c) = col_side else {
4123        return None;
4124    };
4125    if let Some(q) = &c.qualifier
4126        && q != table_alias
4127    {
4128        return None;
4129    }
4130    let pos = schema_cols.iter().position(|s| s.name == c.name)?;
4131    let Expr::Literal(l) = lit_side else {
4132        return None;
4133    };
4134    let v = match l {
4135        Literal::Integer(n) => {
4136            if let Ok(small) = i32::try_from(*n) {
4137                Value::Int(small)
4138            } else {
4139                Value::BigInt(*n)
4140            }
4141        }
4142        Literal::Float(x) => Value::Float(*x),
4143        Literal::String(s) => Value::Text(s.clone()),
4144        Literal::Bool(b) => Value::Bool(*b),
4145        Literal::Null => Value::Null,
4146        // Vector and Interval literals can't be used as B-tree index keys.
4147        // Tell the planner to fall back to full-scan.
4148        Literal::Vector(_) | Literal::Interval { .. } => return None,
4149    };
4150    Some((pos, v))
4151}
4152
4153/// Find the schema entry that a SELECT-list `Expr::Column` refers to.
4154/// Mirrors `resolve_column` in `eval.rs`, but returns a proper
4155/// `EngineError` so the projection-build path keeps `UnknownQualifier`
4156/// vs `ColumnNotFound` distinct.
4157fn resolve_projection_column<'a>(
4158    c: &ColumnName,
4159    schema_cols: &'a [ColumnSchema],
4160    table_alias: &str,
4161) -> Result<&'a ColumnSchema, EngineError> {
4162    if let Some(q) = &c.qualifier {
4163        let composite = alloc::format!("{q}.{name}", name = c.name);
4164        if let Some(s) = schema_cols.iter().find(|s| s.name == composite) {
4165            return Ok(s);
4166        }
4167        // Single-table case: the qualifier may equal the active alias —
4168        // then look for the bare column name.
4169        if q == table_alias
4170            && let Some(s) = schema_cols.iter().find(|s| s.name == c.name)
4171        {
4172            return Ok(s);
4173        }
4174        // For multi-table schemas the qualifier is unknown only if no
4175        // column bears the "<q>." prefix. For single-table, the alias
4176        // mismatch alone is enough.
4177        let prefix = alloc::format!("{q}.");
4178        let qualifier_known =
4179            q == table_alias || schema_cols.iter().any(|s| s.name.starts_with(&prefix));
4180        if !qualifier_known {
4181            return Err(EngineError::Eval(EvalError::UnknownQualifier {
4182                qualifier: q.clone(),
4183            }));
4184        }
4185        return Err(EngineError::Eval(EvalError::ColumnNotFound {
4186            name: c.name.clone(),
4187        }));
4188    }
4189    if let Some(s) = schema_cols.iter().find(|s| s.name == c.name) {
4190        return Ok(s);
4191    }
4192    let suffix = alloc::format!(".{name}", name = c.name);
4193    let mut matches = schema_cols.iter().filter(|s| s.name.ends_with(&suffix));
4194    let first = matches.next();
4195    let extra = matches.next();
4196    match (first, extra) {
4197        (Some(s), None) => Ok(s),
4198        (Some(_), Some(_)) => Err(EngineError::Eval(EvalError::TypeMismatch {
4199            detail: alloc::format!("ambiguous column reference: {}", c.name),
4200        })),
4201        _ => Err(EngineError::Eval(EvalError::ColumnNotFound {
4202            name: c.name.clone(),
4203        })),
4204    }
4205}
4206
4207fn build_projection(
4208    items: &[SelectItem],
4209    schema_cols: &[ColumnSchema],
4210    table_alias: &str,
4211) -> Result<Vec<ProjectedItem>, EngineError> {
4212    let mut out = Vec::new();
4213    for item in items {
4214        match item {
4215            SelectItem::Wildcard => {
4216                for col in schema_cols {
4217                    out.push(ProjectedItem {
4218                        expr: Expr::Column(ColumnName {
4219                            qualifier: None,
4220                            name: col.name.clone(),
4221                        }),
4222                        output_name: col.name.clone(),
4223                        ty: col.ty,
4224                        nullable: col.nullable,
4225                    });
4226                }
4227            }
4228            SelectItem::Expr { expr, alias } => {
4229                // Plain column ref keeps full schema info (real type +
4230                // nullability). Compound expressions evaluate fine but have
4231                // no static type — surface them as nullable TEXT, which is
4232                // what most clients render anyway.
4233                if let Expr::Column(c) = expr {
4234                    let sch = resolve_projection_column(c, schema_cols, table_alias)?;
4235                    let output_name = alias.clone().unwrap_or_else(|| c.name.clone());
4236                    out.push(ProjectedItem {
4237                        expr: expr.clone(),
4238                        output_name,
4239                        ty: sch.ty,
4240                        nullable: sch.nullable,
4241                    });
4242                } else {
4243                    let output_name = alias.clone().unwrap_or_else(|| expr.to_string());
4244                    out.push(ProjectedItem {
4245                        expr: expr.clone(),
4246                        output_name,
4247                        ty: DataType::Text,
4248                        nullable: true,
4249                    });
4250                }
4251            }
4252        }
4253    }
4254    Ok(out)
4255}
4256
4257/// Promote an integer to a NUMERIC value at the requested scale.
4258/// Rejects values that, after scaling, would overflow the column's
4259/// precision budget.
4260fn numeric_from_integer(
4261    n: i128,
4262    precision: u8,
4263    scale: u8,
4264    col_name: &str,
4265) -> Result<Value, EngineError> {
4266    let factor = pow10_i128(scale);
4267    let scaled = n.checked_mul(factor).ok_or_else(|| {
4268        EngineError::Unsupported(alloc::format!(
4269            "integer overflow scaling value for column `{col_name}` to scale {scale}"
4270        ))
4271    })?;
4272    check_precision(scaled, precision, col_name)?;
4273    Ok(Value::Numeric { scaled, scale })
4274}
4275
4276/// Float → NUMERIC. Uses round-half-away-from-zero on `x * 10^scale`,
4277/// then verifies the result fits the column's precision.
4278#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
4279fn numeric_from_float(
4280    x: f64,
4281    precision: u8,
4282    scale: u8,
4283    col_name: &str,
4284) -> Result<Value, EngineError> {
4285    if !x.is_finite() {
4286        return Err(EngineError::Unsupported(alloc::format!(
4287            "cannot store non-finite float in NUMERIC column `{col_name}`"
4288        )));
4289    }
4290    let mut factor = 1.0_f64;
4291    for _ in 0..scale {
4292        factor *= 10.0;
4293    }
4294    // Round half-away-from-zero by biasing then casting (`as i128`
4295    // truncates toward zero, so the bias + truncation gives the
4296    // desired rounding). `f64::floor` / `ceil` live in std; we don't
4297    // need them — the cast handles the truncation step.
4298    let shifted = x * factor;
4299    let biased = if shifted >= 0.0 {
4300        shifted + 0.5
4301    } else {
4302        shifted - 0.5
4303    };
4304    // Range-check before casting back to i128 — the cast itself is
4305    // saturating in Rust, which would silently truncate huge inputs.
4306    if !(-1e38..=1e38).contains(&biased) {
4307        return Err(EngineError::Unsupported(alloc::format!(
4308            "value {x} overflows NUMERIC range for column `{col_name}`"
4309        )));
4310    }
4311    let scaled = biased as i128;
4312    check_precision(scaled, precision, col_name)?;
4313    Ok(Value::Numeric { scaled, scale })
4314}
4315
4316/// Move a Numeric value from `src_scale` to `dst_scale`. Going up
4317/// multiplies by 10; going down rounds half-away-from-zero.
4318fn numeric_rescale(
4319    scaled: i128,
4320    src_scale: u8,
4321    precision: u8,
4322    dst_scale: u8,
4323    col_name: &str,
4324) -> Result<Value, EngineError> {
4325    let new_scaled = if dst_scale >= src_scale {
4326        let bump = pow10_i128(dst_scale - src_scale);
4327        scaled.checked_mul(bump).ok_or_else(|| {
4328            EngineError::Unsupported(alloc::format!(
4329                "overflow rescaling NUMERIC for column `{col_name}`"
4330            ))
4331        })?
4332    } else {
4333        let drop = pow10_i128(src_scale - dst_scale);
4334        let half = drop / 2;
4335        if scaled >= 0 {
4336            (scaled + half) / drop
4337        } else {
4338            (scaled - half) / drop
4339        }
4340    };
4341    check_precision(new_scaled, precision, col_name)?;
4342    Ok(Value::Numeric {
4343        scaled: new_scaled,
4344        scale: dst_scale,
4345    })
4346}
4347
4348/// Drop the fractional part of a scaled integer, returning the integer
4349/// portion (toward zero). Used for NUMERIC → INT casts.
4350const fn numeric_truncate_to_integer(scaled: i128, scale: u8) -> i128 {
4351    if scale == 0 {
4352        return scaled;
4353    }
4354    let factor = pow10_i128_const(scale);
4355    scaled / factor
4356}
4357
4358/// Verify a scaled NUMERIC value fits the column's declared precision.
4359/// `precision == 0` is the "unconstrained" form (bare `NUMERIC`); we
4360/// skip the check there.
4361fn check_precision(scaled: i128, precision: u8, col_name: &str) -> Result<(), EngineError> {
4362    if precision == 0 {
4363        return Ok(());
4364    }
4365    let limit = pow10_i128(precision);
4366    if scaled.unsigned_abs() >= limit.unsigned_abs() {
4367        return Err(EngineError::Unsupported(alloc::format!(
4368            "NUMERIC value exceeds precision {precision} for column `{col_name}`"
4369        )));
4370    }
4371    Ok(())
4372}
4373
/// `10^p` as an `i128`, usable in const contexts.
///
/// Exact for `p <= 38`. `10^39` no longer fits in `i128`, so larger
/// exponents saturate at `i128::MAX` instead of overflowing (which would
/// panic in debug builds and wrap — eventually to zero — in release
/// builds, handing callers a nonsensical factor or divisor).
const fn pow10_i128_const(p: u8) -> i128 {
    // `p as u32` is a lossless widening; `u32::from` is not const-callable.
    match 10_i128.checked_pow(p as u32) {
        Some(power) => power,
        None => i128::MAX,
    }
}
4383
/// `10^p` as an `i128`. Non-`const` entry point that forwards straight
/// to [`pow10_i128_const`] and returns its result unchanged.
fn pow10_i128(p: u8) -> i128 {
    pow10_i128_const(p)
}
4387
/// NOTE(review): the two paragraphs below read like doc comments for
/// individual rewrite passes rather than for this `impl` block as a
/// whole — confirm where they belong.
///
/// Walk a parsed `Statement`, swapping any `NOW()` /
/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()` function calls for a
/// literal cast that wraps the engine's per-statement clock reading.
/// When `now_micros` is `None`, calls stay as-is and surface as
/// `unknown function` at eval time — keeps the error path explicit.
///
/// v4.10: pre-walk the WHERE / projection / etc. of a SELECT and
/// replace every subquery node with a materialised literal. SPG
/// only supports uncorrelated subqueries — the inner SELECT does
/// not see outer-row columns, so the result is the same for every
/// outer row and can be evaluated once.
///
/// Returns the rewritten statement; the caller passes this to the
/// regular row-loop executor which no longer sees Subquery nodes
/// in its tree.
4402impl Engine {
4403    /// v4.12 window executor. Implements `ROW_NUMBER` / `RANK` /
4404    /// `DENSE_RANK` and the partition-aware aggregates `SUM` /
4405    /// `AVG` / `COUNT` / `MIN` / `MAX`. The plan is:
4406    /// 1. Apply the WHERE filter.
4407    /// 2. For each unique `WindowFunction` node in the projection,
4408    ///    partition + sort, compute the per-row value.
4409    /// 3. Append the window values as synthetic columns (`__win_N`)
4410    ///    to the row schema.
4411    /// 4. Rewrite the projection to read those columns.
4412    /// 5. Hand off to the regular project / ORDER BY / LIMIT pipe.
4413    #[allow(
4414        clippy::too_many_lines,
4415        clippy::type_complexity,
4416        clippy::needless_range_loop
4417    )] // window-eval is one cohesive pipe; splitting fragments
4418    fn exec_select_with_window(
4419        &self,
4420        stmt: &SelectStatement,
4421        cancel: CancelToken<'_>,
4422    ) -> Result<QueryResult, EngineError> {
4423        let from = stmt.from.as_ref().ok_or_else(|| {
4424            EngineError::Unsupported("window functions require a FROM clause".into())
4425        })?;
4426        // For v4.12 we only support a single-table FROM. Joins +
4427        // windows is queued for v5.x.
4428        if !from.joins.is_empty() {
4429            return Err(EngineError::Unsupported(
4430                "JOIN with window functions not yet supported".into(),
4431            ));
4432        }
4433        let primary = &from.primary;
4434        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
4435            StorageError::TableNotFound {
4436                name: primary.name.clone(),
4437            }
4438        })?;
4439        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
4440        let schema_cols = &table.schema().columns;
4441        let ctx = EvalContext::new(schema_cols, Some(alias));
4442
4443        // 1) Filter pass.
4444        let mut filtered: Vec<&Row> = Vec::new();
4445        for (i, row) in table.rows().iter().enumerate() {
4446            if i.is_multiple_of(256) {
4447                cancel.check()?;
4448            }
4449            if let Some(w) = &stmt.where_ {
4450                let cond = eval::eval_expr(w, row, &ctx)?;
4451                if !matches!(cond, Value::Bool(true)) {
4452                    continue;
4453                }
4454            }
4455            filtered.push(row);
4456        }
4457        let n_rows = filtered.len();
4458
4459        // 2) Collect unique window function nodes from projection.
4460        let mut window_nodes: Vec<Expr> = Vec::new();
4461        for item in &stmt.items {
4462            if let SelectItem::Expr { expr, .. } = item {
4463                collect_window_nodes(expr, &mut window_nodes);
4464            }
4465        }
4466
4467        // 3) For each window, compute per-row value.
4468        // Index: same order as window_nodes; for row i, win_vals[w][i].
4469        let mut win_vals: Vec<Vec<Value>> = Vec::with_capacity(window_nodes.len());
4470        for wnode in &window_nodes {
4471            let Expr::WindowFunction {
4472                name,
4473                args,
4474                partition_by,
4475                order_by,
4476                frame,
4477                null_treatment,
4478            } = wnode
4479            else {
4480                unreachable!("collect_window_nodes pushes only WindowFunction");
4481            };
4482            // Compute (partition_key, order_key, original_index) for each row.
4483            let mut indexed: Vec<(Vec<Value>, Vec<(Value, bool)>, usize)> =
4484                Vec::with_capacity(n_rows);
4485            for (i, row) in filtered.iter().enumerate() {
4486                let pkey: Vec<Value> = partition_by
4487                    .iter()
4488                    .map(|p| eval::eval_expr(p, row, &ctx))
4489                    .collect::<Result<_, _>>()?;
4490                let okey: Vec<(Value, bool)> = order_by
4491                    .iter()
4492                    .map(|(e, desc)| eval::eval_expr(e, row, &ctx).map(|v| (v, *desc)))
4493                    .collect::<Result<_, _>>()?;
4494                indexed.push((pkey, okey, i));
4495            }
4496            // Sort by (partition_key, order_key). Partition key uses
4497            // a stable encoded form; order key respects ASC/DESC.
4498            indexed.sort_by(|a, b| {
4499                let p_cmp = partition_key_cmp(&a.0, &b.0);
4500                if p_cmp != core::cmp::Ordering::Equal {
4501                    return p_cmp;
4502                }
4503                order_key_cmp(&a.1, &b.1)
4504            });
4505            // Per-partition compute.
4506            let mut out_vals: Vec<Value> = alloc::vec![Value::Null; n_rows];
4507            let mut p_start = 0;
4508            while p_start < indexed.len() {
4509                let mut p_end = p_start + 1;
4510                while p_end < indexed.len()
4511                    && partition_key_cmp(&indexed[p_start].0, &indexed[p_end].0)
4512                        == core::cmp::Ordering::Equal
4513                {
4514                    p_end += 1;
4515                }
4516                // Compute the function within this partition slice.
4517                compute_window_partition(
4518                    name,
4519                    args,
4520                    !order_by.is_empty(),
4521                    frame.as_ref(),
4522                    *null_treatment,
4523                    &indexed[p_start..p_end],
4524                    &filtered,
4525                    &ctx,
4526                    &mut out_vals,
4527                )?;
4528                p_start = p_end;
4529            }
4530            win_vals.push(out_vals);
4531        }
4532
4533        // 4) Build extended schema: original columns + synthetic.
4534        let mut ext_cols = schema_cols.clone();
4535        for i in 0..window_nodes.len() {
4536            ext_cols.push(ColumnSchema::new(
4537                alloc::format!("__win_{i}"),
4538                DataType::Text, // type doesn't matter for projection eval
4539                true,
4540            ));
4541        }
4542        // 5) Build extended rows: each row gets its window values appended.
4543        let mut ext_rows: Vec<Row> = Vec::with_capacity(n_rows);
4544        for i in 0..n_rows {
4545            let mut values = filtered[i].values.clone();
4546            for w in 0..window_nodes.len() {
4547                values.push(win_vals[w][i].clone());
4548            }
4549            ext_rows.push(Row::new(values));
4550        }
4551        // 6) Rewrite the projection: WindowFunction nodes → Column(__win_N).
4552        let mut rewritten_items: Vec<SelectItem> = Vec::with_capacity(stmt.items.len());
4553        for item in &stmt.items {
4554            let new_item = match item {
4555                SelectItem::Wildcard => SelectItem::Wildcard,
4556                SelectItem::Expr { expr, alias } => {
4557                    let mut e = expr.clone();
4558                    rewrite_window_to_columns(&mut e, &window_nodes);
4559                    SelectItem::Expr {
4560                        expr: e,
4561                        alias: alias.clone(),
4562                    }
4563                }
4564            };
4565            rewritten_items.push(new_item);
4566        }
4567
4568        // 7) Project into final rows.
4569        let ext_ctx = EvalContext::new(&ext_cols, Some(alias));
4570        let projection = build_projection(&rewritten_items, &ext_cols, alias)?;
4571        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(n_rows);
4572        for (i, row) in ext_rows.iter().enumerate() {
4573            if i.is_multiple_of(256) {
4574                cancel.check()?;
4575            }
4576            let mut values = Vec::with_capacity(projection.len());
4577            for p in &projection {
4578                values.push(eval::eval_expr(&p.expr, row, &ext_ctx)?);
4579            }
4580            let order_keys = if stmt.order_by.is_empty() {
4581                Vec::new()
4582            } else {
4583                let mut keys = Vec::with_capacity(stmt.order_by.len());
4584                for o in &stmt.order_by {
4585                    let mut e = o.expr.clone();
4586                    rewrite_window_to_columns(&mut e, &window_nodes);
4587                    let key = eval::eval_expr(&e, row, &ext_ctx)?;
4588                    keys.push(value_to_order_key(&key)?);
4589                }
4590                keys
4591            };
4592            tagged.push((order_keys, Row::new(values)));
4593        }
4594        // ORDER BY + LIMIT/OFFSET on the projected rows.
4595        if !stmt.order_by.is_empty() {
4596            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4597            sort_by_keys(&mut tagged, &descs);
4598        }
4599        let mut out_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4600        apply_offset_and_limit(&mut out_rows, stmt.offset, stmt.limit);
4601        let final_cols: Vec<ColumnSchema> = projection
4602            .into_iter()
4603            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4604            .collect();
4605        Ok(QueryResult::Rows {
4606            columns: final_cols,
4607            rows: out_rows,
4608        })
4609    }
4610
4611    /// v4.11: materialise each CTE into a temp table inside a
4612    /// cloned catalog, then run the body SELECT against a fresh
4613    /// engine instance that owns the enriched catalog. The clone
4614    /// is moderately expensive — only paid by CTE-bearing queries.
4615    /// Subqueries inside CTE bodies / the main body resolve as
4616    /// usual; `clock_fn` is propagated so `NOW()` lines up.
4617    fn exec_with_ctes(
4618        &self,
4619        stmt: &SelectStatement,
4620        cancel: CancelToken<'_>,
4621    ) -> Result<QueryResult, EngineError> {
4622        cancel.check()?;
4623        let mut catalog = self.active_catalog().clone();
4624        for cte in &stmt.ctes {
4625            if catalog.get(&cte.name).is_some() {
4626                return Err(EngineError::Unsupported(alloc::format!(
4627                    "CTE name {:?} shadows an existing table; rename the CTE",
4628                    cte.name
4629                )));
4630            }
4631            let (columns, rows) = if cte.recursive {
4632                self.materialise_recursive_cte(cte, &catalog, cancel)?
4633            } else {
4634                let body_result = self.exec_select_cancel(&cte.body, cancel)?;
4635                let QueryResult::Rows { columns, rows } = body_result else {
4636                    return Err(EngineError::Unsupported(alloc::format!(
4637                        "CTE {:?} body did not return rows",
4638                        cte.name
4639                    )));
4640                };
4641                (columns, rows)
4642            };
4643            // v4.22: the projection builder labels any non-column
4644            // expression as Text — including literal SELECT 1.
4645            // Promote each column's type to whatever the rows
4646            // actually carry so the CTE storage table accepts them.
4647            let inferred = infer_column_types(&columns, &rows);
4648            let mut columns = inferred;
4649            // v4.22: apply optional `WITH name(a, b, c)` overrides.
4650            if !cte.column_overrides.is_empty() {
4651                if cte.column_overrides.len() != columns.len() {
4652                    return Err(EngineError::Unsupported(alloc::format!(
4653                        "CTE {:?} column list has {} names but body returns {} columns",
4654                        cte.name,
4655                        cte.column_overrides.len(),
4656                        columns.len()
4657                    )));
4658                }
4659                for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
4660                    col.name.clone_from(name);
4661                }
4662            }
4663            let schema = TableSchema::new(cte.name.clone(), columns);
4664            catalog.create_table(schema).map_err(EngineError::Storage)?;
4665            let table = catalog
4666                .get_mut(&cte.name)
4667                .expect("just-created CTE table must exist");
4668            for row in rows {
4669                table.insert(row).map_err(EngineError::Storage)?;
4670            }
4671        }
4672        // Strip CTEs from the body before running on the temp engine
4673        // so we don't recurse forever.
4674        let mut body = stmt.clone();
4675        body.ctes = Vec::new();
4676        let mut temp = Engine::restore(catalog);
4677        if let Some(c) = self.clock {
4678            temp = temp.with_clock(c);
4679        }
4680        if let Some(f) = self.salt_fn {
4681            temp = temp.with_salt_fn(f);
4682        }
4683        temp.exec_select_cancel(&body, cancel)
4684    }
4685
4686    /// v4.22: materialise a WITH RECURSIVE CTE. The body must be a
4687    /// UNION (or UNION ALL) of an anchor that does not reference
4688    /// the CTE name, and one or more recursive terms that do. The
4689    /// anchor runs first; each subsequent iteration runs the
4690    /// recursive term against a temp catalog where the CTE name is
4691    /// bound to the *previous* iteration's output. Iteration stops
4692    /// when the recursive term yields no rows; UNION (DISTINCT)
4693    /// deduplicates against the accumulated result, UNION ALL does
4694    /// not. A hard cap on total rows prevents runaway queries.
4695    #[allow(clippy::too_many_lines)]
4696    fn materialise_recursive_cte(
4697        &self,
4698        cte: &spg_sql::ast::Cte,
4699        base_catalog: &Catalog,
4700        cancel: CancelToken<'_>,
4701    ) -> Result<(Vec<ColumnSchema>, Vec<Row>), EngineError> {
4702        const MAX_TOTAL_ROWS: usize = 1_000_000;
4703        const MAX_ITERATIONS: usize = 100_000;
4704        cancel.check()?;
4705        if cte.body.unions.is_empty() {
4706            return Err(EngineError::Unsupported(alloc::format!(
4707                "WITH RECURSIVE {:?} body must be a UNION of an anchor and a recursive term",
4708                cte.name
4709            )));
4710        }
4711        // Anchor: the body's leading SELECT, with unions stripped.
4712        let mut anchor = cte.body.clone();
4713        let union_terms = core::mem::take(&mut anchor.unions);
4714        anchor.ctes = Vec::new();
4715        // Anchor must not reference the CTE name.
4716        if select_refers_to(&anchor, &cte.name) {
4717            return Err(EngineError::Unsupported(alloc::format!(
4718                "WITH RECURSIVE {:?}: the anchor must not reference the CTE itself",
4719                cte.name
4720            )));
4721        }
4722        let anchor_result = self.exec_select_cancel(&anchor, cancel)?;
4723        let QueryResult::Rows {
4724            columns: anchor_cols,
4725            rows: anchor_rows,
4726        } = anchor_result
4727        else {
4728            return Err(EngineError::Unsupported(alloc::format!(
4729                "WITH RECURSIVE {:?}: anchor did not return rows",
4730                cte.name
4731            )));
4732        };
4733        // The projection builder labels non-column expressions Text;
4734        // refine column types from the anchor's actual values so the
4735        // intermediate iter-catalog tables accept them.
4736        let mut columns = infer_column_types(&anchor_cols, &anchor_rows);
4737        if !cte.column_overrides.is_empty() {
4738            if cte.column_overrides.len() != columns.len() {
4739                return Err(EngineError::Unsupported(alloc::format!(
4740                    "CTE {:?} column list has {} names but anchor returns {} columns",
4741                    cte.name,
4742                    cte.column_overrides.len(),
4743                    columns.len()
4744                )));
4745            }
4746            for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
4747                col.name.clone_from(name);
4748            }
4749        }
4750        let mut all_rows: Vec<Row> = anchor_rows.clone();
4751        let mut working_set: Vec<Row> = anchor_rows;
4752        let mut seen: alloc::collections::BTreeSet<Vec<u8>> = alloc::collections::BTreeSet::new();
4753        // Track at least one "all UNION ALL" flag — if every union
4754        // kind is ALL we skip the dedup step (faster + matches PG).
4755        let all_union_all = union_terms.iter().all(|(k, _)| matches!(k, UnionKind::All));
4756        if !all_union_all {
4757            for r in &all_rows {
4758                seen.insert(encode_row_key(r));
4759            }
4760        }
4761        for iter in 0..MAX_ITERATIONS {
4762            cancel.check()?;
4763            if working_set.is_empty() {
4764                break;
4765            }
4766            // Build a fresh catalog: base + CTE bound to working_set.
4767            let mut iter_catalog = base_catalog.clone();
4768            let schema = TableSchema::new(cte.name.clone(), columns.clone());
4769            iter_catalog
4770                .create_table(schema)
4771                .map_err(EngineError::Storage)?;
4772            {
4773                let table = iter_catalog.get_mut(&cte.name).expect("just-created");
4774                for row in &working_set {
4775                    table.insert(row.clone()).map_err(EngineError::Storage)?;
4776                }
4777            }
4778            let mut iter_engine = Engine::restore(iter_catalog);
4779            if let Some(c) = self.clock {
4780                iter_engine = iter_engine.with_clock(c);
4781            }
4782            if let Some(f) = self.salt_fn {
4783                iter_engine = iter_engine.with_salt_fn(f);
4784            }
4785            // Run each recursive term in sequence and collect new rows.
4786            let mut next_set: Vec<Row> = Vec::new();
4787            for (_, term) in &union_terms {
4788                let mut term = term.clone();
4789                term.ctes = Vec::new();
4790                let r = iter_engine.exec_select_cancel(&term, cancel)?;
4791                let QueryResult::Rows {
4792                    columns: rc,
4793                    rows: rs,
4794                } = r
4795                else {
4796                    return Err(EngineError::Unsupported(alloc::format!(
4797                        "WITH RECURSIVE {:?}: recursive term did not return rows",
4798                        cte.name
4799                    )));
4800                };
4801                if rc.len() != columns.len() {
4802                    return Err(EngineError::Unsupported(alloc::format!(
4803                        "WITH RECURSIVE {:?}: column count of recursive term ({}) does not match anchor ({})",
4804                        cte.name,
4805                        rc.len(),
4806                        columns.len()
4807                    )));
4808                }
4809                for row in rs {
4810                    if !all_union_all {
4811                        let key = encode_row_key(&row);
4812                        if !seen.insert(key) {
4813                            continue;
4814                        }
4815                    }
4816                    next_set.push(row);
4817                }
4818            }
4819            if next_set.is_empty() {
4820                break;
4821            }
4822            all_rows.extend(next_set.iter().cloned());
4823            working_set = next_set;
4824            if all_rows.len() > MAX_TOTAL_ROWS {
4825                return Err(EngineError::Unsupported(alloc::format!(
4826                    "WITH RECURSIVE {:?}: produced more than {MAX_TOTAL_ROWS} rows — likely runaway recursion",
4827                    cte.name
4828                )));
4829            }
4830            if iter + 1 == MAX_ITERATIONS {
4831                return Err(EngineError::Unsupported(alloc::format!(
4832                    "WITH RECURSIVE {:?}: exceeded {MAX_ITERATIONS} iterations",
4833                    cte.name
4834                )));
4835            }
4836        }
4837        Ok((columns, all_rows))
4838    }
4839
4840    fn resolve_select_subqueries(
4841        &self,
4842        stmt: &mut SelectStatement,
4843        cancel: CancelToken<'_>,
4844    ) -> Result<(), EngineError> {
4845        for item in &mut stmt.items {
4846            if let SelectItem::Expr { expr, .. } = item {
4847                self.resolve_expr_subqueries(expr, cancel)?;
4848            }
4849        }
4850        if let Some(w) = &mut stmt.where_ {
4851            self.resolve_expr_subqueries(w, cancel)?;
4852        }
4853        if let Some(gs) = &mut stmt.group_by {
4854            for g in gs {
4855                self.resolve_expr_subqueries(g, cancel)?;
4856            }
4857        }
4858        if let Some(h) = &mut stmt.having {
4859            self.resolve_expr_subqueries(h, cancel)?;
4860        }
4861        for o in &mut stmt.order_by {
4862            self.resolve_expr_subqueries(&mut o.expr, cancel)?;
4863        }
4864        for (_, peer) in &mut stmt.unions {
4865            self.resolve_select_subqueries(peer, cancel)?;
4866        }
4867        Ok(())
4868    }
4869
4870    #[allow(clippy::only_used_in_recursion)] // engine handle reads aren't really pure
4871    fn resolve_expr_subqueries(
4872        &self,
4873        e: &mut Expr,
4874        cancel: CancelToken<'_>,
4875    ) -> Result<(), EngineError> {
4876        // Replace-on-this-node cases first.
4877        if let Some(replacement) = self.subquery_replacement(e, cancel)? {
4878            *e = replacement;
4879            return Ok(());
4880        }
4881        match e {
4882            Expr::Binary { lhs, rhs, .. } => {
4883                self.resolve_expr_subqueries(lhs, cancel)?;
4884                self.resolve_expr_subqueries(rhs, cancel)?;
4885            }
4886            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
4887                self.resolve_expr_subqueries(expr, cancel)?;
4888            }
4889            Expr::FunctionCall { args, .. } => {
4890                for a in args {
4891                    self.resolve_expr_subqueries(a, cancel)?;
4892                }
4893            }
4894            Expr::Like { expr, pattern, .. } => {
4895                self.resolve_expr_subqueries(expr, cancel)?;
4896                self.resolve_expr_subqueries(pattern, cancel)?;
4897            }
4898            Expr::Extract { source, .. } => self.resolve_expr_subqueries(source, cancel)?,
4899            // v4.12 window functions — recurse into args + ORDER BY
4900            // + PARTITION BY in case they carry inner subqueries.
4901            Expr::WindowFunction {
4902                args,
4903                partition_by,
4904                order_by,
4905                ..
4906            } => {
4907                for a in args {
4908                    self.resolve_expr_subqueries(a, cancel)?;
4909                }
4910                for p in partition_by {
4911                    self.resolve_expr_subqueries(p, cancel)?;
4912                }
4913                for (e, _) in order_by {
4914                    self.resolve_expr_subqueries(e, cancel)?;
4915                }
4916            }
4917            // Subquery nodes are handled in subquery_replacement
4918            // (which returned None — defensive no-op); Literal /
4919            // Column are leaves.
4920            Expr::ScalarSubquery(_)
4921            | Expr::Exists { .. }
4922            | Expr::InSubquery { .. }
4923            | Expr::Literal(_)
4924            | Expr::Placeholder(_)
4925            | Expr::Column(_) => {}
4926        }
4927        Ok(())
4928    }
4929
4930    /// v4.23: per-row eval that handles correlated subqueries.
4931    /// Equivalent to `eval::eval_expr` when the expression has no
4932    /// subqueries; otherwise clones the expression, substitutes
4933    /// outer-row columns into each surviving subquery node, runs
4934    /// the inner SELECT, and replaces the node with the literal
4935    /// result. Only the WHERE-filter call sites use this path so
4936    /// the uncorrelated fast path is preserved everywhere else.
4937    fn eval_expr_with_correlated(
4938        &self,
4939        expr: &Expr,
4940        row: &Row,
4941        ctx: &EvalContext<'_>,
4942        cancel: CancelToken<'_>,
4943        memo: Option<&mut memoize::MemoizeCache>,
4944    ) -> Result<Value, EngineError> {
4945        if !expr_has_subquery(expr) {
4946            return eval::eval_expr(expr, row, ctx).map_err(EngineError::Eval);
4947        }
4948        let mut e = expr.clone();
4949        self.resolve_correlated_in_expr(&mut e, row, ctx, cancel, memo)?;
4950        eval::eval_expr(&e, row, ctx).map_err(EngineError::Eval)
4951    }
4952
    /// Rewrites, in place, every subquery node reachable from `e` into a
    /// literal computed for the outer `row`.
    ///
    /// For each subquery node the inner SELECT is cloned, outer-column
    /// references are replaced via `substitute_outer_columns`, and the
    /// result of `exec_select_cancel` is folded into a literal:
    ///
    /// * `ScalarSubquery` — zero rows become NULL, one row contributes its
    ///   first value (NULL if the row is empty), more than one row is an
    ///   `EngineError::Unsupported`.
    /// * `Exists` — a boolean literal, flipped when `negated`.
    /// * `InSubquery` — the left-hand side is resolved and evaluated
    ///   first, then compared against the single projected column.
    ///
    /// Composite nodes (`Binary`, `Unary`, `Cast`, `IsNull`, `Like`,
    /// `FunctionCall`, `Extract`) simply recurse into their operands.
    ///
    /// `memo`, when present, is consulted and filled only by the
    /// `ScalarSubquery` arm; every other arm just threads it through.
    fn resolve_correlated_in_expr(
        &self,
        e: &mut Expr,
        row: &Row,
        ctx: &EvalContext<'_>,
        cancel: CancelToken<'_>,
        mut memo: Option<&mut memoize::MemoizeCache>,
    ) -> Result<(), EngineError> {
        match e {
            Expr::ScalarSubquery(inner) => {
                // v6.2.6 — Memoize: the cache key is the rendered text of
                // the subquery *before* substitution plus a clone of the
                // whole outer row's values. Because the key carries every
                // outer value (not only the ones the subquery refers to),
                // two outer rows share an entry only when all of their
                // values are equal. The key is built only when a cache
                // was supplied.
                let cache_key = memo.as_ref().map(|_| memoize::CacheKey {
                    subquery_repr: alloc::format!("{}", **inner),
                    outer_values: row.values.clone(),
                });
                // Cache hit: splice the remembered value in and stop.
                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key.as_ref())
                    && let Some(cached) = cache.get(k)
                {
                    *e = value_to_literal_expr(cached)?;
                    return Ok(());
                }
                // Cache miss (or no cache): run the inner SELECT against
                // a clone with the outer row's values substituted in.
                let mut s = (**inner).clone();
                substitute_outer_columns(&mut s, row, ctx);
                let r = self.exec_select_cancel(&s, cancel)?;
                let QueryResult::Rows { rows, .. } = r else {
                    return Err(EngineError::Unsupported(
                        "scalar subquery: inner did not return rows".into(),
                    ));
                };
                let value = match rows.as_slice() {
                    [] => Value::Null,
                    [r0] => r0.values.first().cloned().unwrap_or(Value::Null),
                    _ => {
                        return Err(EngineError::Unsupported(alloc::format!(
                            "scalar subquery returned {} rows; expected 0 or 1",
                            rows.len()
                        )));
                    }
                };
                // Remember the result for later rows with the same key.
                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key) {
                    cache.insert(k, value.clone());
                }
                *e = value_to_literal_expr(value)?;
            }
            Expr::Exists { subquery, negated } => {
                let mut s = (**subquery).clone();
                substitute_outer_columns(&mut s, row, ctx);
                let r = self.exec_select_cancel(&s, cancel)?;
                // Any result that is not a non-empty row set counts as
                // "does not exist".
                let exists = matches!(r, QueryResult::Rows { rows, .. } if !rows.is_empty());
                let bit = if *negated { !exists } else { exists };
                *e = Expr::Literal(Literal::Bool(bit));
            }
            Expr::InSubquery {
                expr: lhs,
                subquery,
                negated,
            } => {
                // The left-hand side may itself contain subqueries, so it
                // is resolved before being evaluated against the outer row.
                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
                let lhs_val = eval::eval_expr(lhs, row, ctx).map_err(EngineError::Eval)?;
                let mut s = (**subquery).clone();
                substitute_outer_columns(&mut s, row, ctx);
                let r = self.exec_select_cancel(&s, cancel)?;
                let QueryResult::Rows { columns, rows, .. } = r else {
                    return Err(EngineError::Unsupported(
                        "IN-subquery: inner did not return rows".into(),
                    ));
                };
                if columns.len() != 1 {
                    return Err(EngineError::Unsupported(alloc::format!(
                        "IN-subquery must project exactly one column; got {}",
                        columns.len()
                    )));
                }
                // Scan the projected column: NULLs are skipped but
                // remembered, and the scan stops at the first match.
                // NOTE(review): a NULL `lhs_val` is compared through
                // `value_cmp` like any other value; standard SQL would
                // make the predicate NULL in that case — confirm against
                // `value_cmp`'s NULL handling.
                let mut found = false;
                let mut any_null = false;
                for r0 in rows {
                    let v = r0.values.into_iter().next().unwrap_or(Value::Null);
                    if v.is_null() {
                        any_null = true;
                        continue;
                    }
                    if value_cmp(&v, &lhs_val) == core::cmp::Ordering::Equal {
                        found = true;
                        break;
                    }
                }
                // match            -> IN is true  (NOT IN false)
                // no match + NULL  -> rejected as unsupported
                // no match         -> IN is false (NOT IN true)
                let bit = if found {
                    !*negated
                } else if any_null {
                    return Err(EngineError::Unsupported(
                        "IN-subquery with NULL in result and no match: NULL semantics not yet implemented".into(),
                    ));
                } else {
                    *negated
                };
                *e = Expr::Literal(Literal::Bool(bit));
            }
            Expr::Binary { lhs, rhs, .. } => {
                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
                self.resolve_correlated_in_expr(rhs, row, ctx, cancel, memo.as_deref_mut())?;
            }
            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
            }
            Expr::Like { expr, pattern, .. } => {
                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
                self.resolve_correlated_in_expr(pattern, row, ctx, cancel, memo.as_deref_mut())?;
            }
            Expr::FunctionCall { args, .. } => {
                for a in args {
                    self.resolve_correlated_in_expr(a, row, ctx, cancel, memo.as_deref_mut())?;
                }
            }
            Expr::Extract { source, .. } => {
                self.resolve_correlated_in_expr(source, row, ctx, cancel, memo.as_deref_mut())?;
            }
            // Leaves. Window functions are not descended into here.
            // NOTE(review): a subquery nested inside a window function's
            // arguments would be left unresolved by this walk — confirm
            // that cannot reach this path.
            Expr::WindowFunction { .. } | Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
        }
        Ok(())
    }
5076
5077    fn subquery_replacement(
5078        &self,
5079        e: &Expr,
5080        cancel: CancelToken<'_>,
5081    ) -> Result<Option<Expr>, EngineError> {
5082        match e {
5083            Expr::ScalarSubquery(inner) => {
5084                let mut s = (**inner).clone();
5085                // Recurse into the inner SELECT first so nested
5086                // subqueries materialise bottom-up.
5087                self.resolve_select_subqueries(&mut s, cancel)?;
5088                let r = match self.exec_bare_select_cancel(&s, cancel) {
5089                    Ok(r) => r,
5090                    Err(e) if is_correlation_error(&e) => return Ok(None),
5091                    Err(e) => return Err(e),
5092                };
5093                let QueryResult::Rows { rows, .. } = r else {
5094                    return Err(EngineError::Unsupported(
5095                        "scalar subquery: inner statement did not return rows".into(),
5096                    ));
5097                };
5098                let value = match rows.as_slice() {
5099                    [] => Value::Null,
5100                    [row] => row.values.first().cloned().unwrap_or(Value::Null),
5101                    _ => {
5102                        return Err(EngineError::Unsupported(alloc::format!(
5103                            "scalar subquery returned {} rows; expected 0 or 1",
5104                            rows.len()
5105                        )));
5106                    }
5107                };
5108                Ok(Some(value_to_literal_expr(value)?))
5109            }
5110            Expr::Exists { subquery, negated } => {
5111                let mut s = (**subquery).clone();
5112                self.resolve_select_subqueries(&mut s, cancel)?;
5113                let r = match self.exec_bare_select_cancel(&s, cancel) {
5114                    Ok(r) => r,
5115                    Err(e) if is_correlation_error(&e) => return Ok(None),
5116                    Err(e) => return Err(e),
5117                };
5118                let exists = match r {
5119                    QueryResult::Rows { rows, .. } => !rows.is_empty(),
5120                    QueryResult::CommandOk { .. } => false,
5121                };
5122                let bit = if *negated { !exists } else { exists };
5123                Ok(Some(Expr::Literal(Literal::Bool(bit))))
5124            }
5125            Expr::InSubquery {
5126                expr,
5127                subquery,
5128                negated,
5129            } => {
5130                let mut s = (**subquery).clone();
5131                self.resolve_select_subqueries(&mut s, cancel)?;
5132                let r = match self.exec_bare_select_cancel(&s, cancel) {
5133                    Ok(r) => r,
5134                    Err(e) if is_correlation_error(&e) => return Ok(None),
5135                    Err(e) => return Err(e),
5136                };
5137                let QueryResult::Rows { columns, rows, .. } = r else {
5138                    return Err(EngineError::Unsupported(
5139                        "IN-subquery: inner statement did not return rows".into(),
5140                    ));
5141                };
5142                if columns.len() != 1 {
5143                    return Err(EngineError::Unsupported(alloc::format!(
5144                        "IN-subquery must project exactly one column; got {}",
5145                        columns.len()
5146                    )));
5147                }
5148                // Build the same OR-Eq chain the parse-time literal-list
5149                // path constructs, with each value lifted into a Literal.
5150                let mut acc: Option<Expr> = None;
5151                for row in rows {
5152                    let v = row.values.into_iter().next().unwrap_or(Value::Null);
5153                    let lit = value_to_literal_expr(v)?;
5154                    let cmp = Expr::Binary {
5155                        lhs: expr.clone(),
5156                        op: BinOp::Eq,
5157                        rhs: Box::new(lit),
5158                    };
5159                    acc = Some(match acc {
5160                        None => cmp,
5161                        Some(prev) => Expr::Binary {
5162                            lhs: Box::new(prev),
5163                            op: BinOp::Or,
5164                            rhs: Box::new(cmp),
5165                        },
5166                    });
5167                }
5168                let combined = acc.unwrap_or(Expr::Literal(Literal::Bool(false)));
5169                let final_expr = if *negated {
5170                    Expr::Unary {
5171                        op: UnOp::Not,
5172                        expr: Box::new(combined),
5173                    }
5174                } else {
5175                    combined
5176                };
5177                Ok(Some(final_expr))
5178            }
5179            _ => Ok(None),
5180        }
5181    }
5182}
5183
5184// ---- v4.12 window-function helpers ----
5185// The (partition-key, order-key, original-index) tuple shape used
5186// across these helpers is intrinsic to the planner. Factoring it
5187// into a typedef adds indirection without making the code clearer,
5188// so several lints are allowed inline on the affected functions
5189// rather than module-wide.
5190
5191/// v4.22: cheap structural scan for `FROM <name>` (qualified or
5192/// not) inside a SELECT — used to verify the anchor of a WITH
5193/// RECURSIVE CTE doesn't recurse into itself. Conservative: walks
5194/// FROM joins, subqueries, and unions.
5195fn select_refers_to(stmt: &SelectStatement, target: &str) -> bool {
5196    if let Some(from) = &stmt.from
5197        && from_refers_to(from, target)
5198    {
5199        return true;
5200    }
5201    for (_, peer) in &stmt.unions {
5202        if select_refers_to(peer, target) {
5203            return true;
5204        }
5205    }
5206    for item in &stmt.items {
5207        if let SelectItem::Expr { expr, .. } = item
5208            && expr_refers_to(expr, target)
5209        {
5210            return true;
5211        }
5212    }
5213    if let Some(w) = &stmt.where_
5214        && expr_refers_to(w, target)
5215    {
5216        return true;
5217    }
5218    false
5219}
5220
5221fn from_refers_to(from: &FromClause, target: &str) -> bool {
5222    if from.primary.name.eq_ignore_ascii_case(target) {
5223        return true;
5224    }
5225    from.joins
5226        .iter()
5227        .any(|j| j.table.name.eq_ignore_ascii_case(target))
5228}
5229
5230fn expr_refers_to(e: &Expr, target: &str) -> bool {
5231    match e {
5232        Expr::ScalarSubquery(s) => select_refers_to(s, target),
5233        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
5234            select_refers_to(subquery, target)
5235        }
5236        Expr::Binary { lhs, rhs, .. } => expr_refers_to(lhs, target) || expr_refers_to(rhs, target),
5237        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5238            expr_refers_to(expr, target)
5239        }
5240        Expr::Like { expr, pattern, .. } => {
5241            expr_refers_to(expr, target) || expr_refers_to(pattern, target)
5242        }
5243        Expr::FunctionCall { args, .. } => args.iter().any(|a| expr_refers_to(a, target)),
5244        Expr::Extract { source, .. } => expr_refers_to(source, target),
5245        Expr::WindowFunction {
5246            args,
5247            partition_by,
5248            order_by,
5249            ..
5250        } => {
5251            args.iter().any(|a| expr_refers_to(a, target))
5252                || partition_by.iter().any(|p| expr_refers_to(p, target))
5253                || order_by.iter().any(|(o, _)| expr_refers_to(o, target))
5254        }
5255        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
5256    }
5257}
5258
5259/// v4.22: pick more specific column types from observed rows when
5260/// the projection builder defaulted to Text (the v1.x behavior for
5261/// non-column expressions). Lets `WITH t(n) AS (SELECT 1 ...)`
5262/// land an Int column in the CTE storage table rather than failing
5263/// the insert with "expected TEXT, got INT".
5264fn infer_column_types(columns: &[ColumnSchema], rows: &[Row]) -> Vec<ColumnSchema> {
5265    let mut out = columns.to_vec();
5266    for (col_idx, col) in out.iter_mut().enumerate() {
5267        if col.ty != DataType::Text {
5268            continue;
5269        }
5270        let mut inferred: Option<DataType> = None;
5271        let mut all_null = true;
5272        for row in rows {
5273            let Some(v) = row.values.get(col_idx) else {
5274                continue;
5275            };
5276            let ty = match v {
5277                Value::Null => continue,
5278                Value::SmallInt(_) => DataType::SmallInt,
5279                Value::Int(_) => DataType::Int,
5280                Value::BigInt(_) => DataType::BigInt,
5281                Value::Float(_) => DataType::Float,
5282                Value::Bool(_) => DataType::Bool,
5283                Value::Vector(_) => DataType::Vector {
5284                    dim: 0,
5285                    encoding: VecEncoding::F32,
5286                },
5287                _ => DataType::Text,
5288            };
5289            all_null = false;
5290            inferred = Some(match inferred {
5291                None => ty,
5292                Some(prev) if prev == ty => prev,
5293                Some(_) => DataType::Text,
5294            });
5295        }
5296        if let Some(t) = inferred {
5297            col.ty = t;
5298            col.nullable = true;
5299        } else if all_null {
5300            col.nullable = true;
5301        }
5302    }
5303    out
5304}
5305
5306/// v4.26: render a human-readable plan tree for `EXPLAIN <select>`.
5307/// Lines are pushed into `out`; `depth` controls indentation. We
5308/// describe the rewritten SELECT — what the executor *would* do —
5309/// using the engine handle to spot indexed lookups and table shapes.
5310#[allow(clippy::too_many_lines, clippy::format_push_string)]
5311/// v6.2.4 — Walk every line of the rendered plan tree and append
5312/// per-operator stats. Lines that name a known operator get
5313/// `(rows=N)` (`actual_rows` of the top-level operator equals the
5314/// final result row count; scans report their catalog row count
5315/// as the rows-considered metric). Other lines — Filter / Join /
5316/// GroupBy / OrderBy etc. — are marked `(—)` so the surface is
5317/// complete-by-construction; v6.2.5 fills these in via inline
5318/// executor counters.
5319/// v6.8.3 — surface "CREATE INDEX …" suggestions for every
5320/// `(table, column)` pair the query touches via WHERE / JOIN
5321/// that doesn't already have an index on the owning table.
5322/// Walks the SELECT's FROM clauses + WHERE expression tree;
5323/// returns one line per missing index. Deterministic order:
5324/// FROM-clause iteration order, then column-reference walk
5325/// order inside each WHERE. Each suggestion is a copy-pastable
5326/// DDL string.
5327fn build_index_suggestions(stmt: &SelectStatement, engine: &Engine) -> Vec<String> {
5328    use alloc::collections::BTreeSet;
5329    let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
5330    let mut out: Vec<String> = Vec::new();
5331    let cat = engine.active_catalog();
5332    // Build a (table, qualifier-or-alias) list from the FROM clause
5333    // so unqualified column refs in WHERE resolve to the correct
5334    // table.
5335    let Some(from) = &stmt.from else {
5336        return out;
5337    };
5338    let mut tables: Vec<String> = Vec::new();
5339    tables.push(from.primary.name.clone());
5340    for j in &from.joins {
5341        tables.push(j.table.name.clone());
5342    }
5343    // Collect column refs from the WHERE expression. JOIN ON
5344    // predicates also feed in.
5345    let mut col_refs: Vec<spg_sql::ast::ColumnName> = Vec::new();
5346    if let Some(w) = &stmt.where_ {
5347        collect_column_refs(w, &mut col_refs);
5348    }
5349    for j in &from.joins {
5350        if let Some(on) = &j.on {
5351            collect_column_refs(on, &mut col_refs);
5352        }
5353    }
5354    for cn in &col_refs {
5355        // Resolve owner table: explicit qualifier first, else
5356        // first table in FROM that has a column of this name.
5357        let owner: Option<String> = if let Some(q) = &cn.qualifier {
5358            tables.iter().find(|t| t == &q).cloned()
5359        } else {
5360            tables.iter().find_map(|t| {
5361                cat.get(t).and_then(|tbl| {
5362                    if tbl.schema().column_position(&cn.name).is_some() {
5363                        Some(t.clone())
5364                    } else {
5365                        None
5366                    }
5367                })
5368            })
5369        };
5370        let Some(owner) = owner else {
5371            continue;
5372        };
5373        let Some(tbl) = cat.get(&owner) else {
5374            continue;
5375        };
5376        let Some(col_pos) = tbl.schema().column_position(&cn.name) else {
5377            continue;
5378        };
5379        // Skip if any BTree index already covers this column as
5380        // its key.
5381        let already_indexed = tbl.indices().iter().any(|i| {
5382            matches!(i.kind, spg_storage::IndexKind::BTree(_))
5383                && i.column_position == col_pos
5384                && i.expression.is_none()
5385                && i.partial_predicate.is_none()
5386        });
5387        if already_indexed {
5388            continue;
5389        }
5390        if seen.insert((owner.clone(), cn.name.clone())) {
5391            out.push(alloc::format!(
5392                "SUGGEST: CREATE INDEX ix_{}_{} ON {} ({})",
5393                owner,
5394                cn.name,
5395                owner,
5396                cn.name
5397            ));
5398        }
5399    }
5400    out
5401}
5402
5403/// Walks an `Expr` and pushes every `ColumnName` it references.
5404/// Order is depth-first, left-to-right.
5405fn collect_column_refs(expr: &Expr, out: &mut Vec<spg_sql::ast::ColumnName>) {
5406    match expr {
5407        Expr::Column(cn) => out.push(cn.clone()),
5408        Expr::FunctionCall { args, .. } => {
5409            for a in args {
5410                collect_column_refs(a, out);
5411            }
5412        }
5413        Expr::Binary { lhs, rhs, .. } => {
5414            collect_column_refs(lhs, out);
5415            collect_column_refs(rhs, out);
5416        }
5417        Expr::Unary { expr: e, .. } => collect_column_refs(e, out),
5418        _ => {}
5419    }
5420}
5421
5422fn annotate_explain_lines(lines: &mut [String], total_rows: usize, engine: &Engine) {
5423    let catalog = engine.active_catalog();
5424    let cold_ids = catalog.cold_segment_ids_global();
5425    let any_cold = !cold_ids.is_empty();
5426    let cold_ids_repr = if any_cold {
5427        let mut s = alloc::string::String::from("[");
5428        for (i, id) in cold_ids.iter().enumerate() {
5429            if i > 0 {
5430                s.push(',');
5431            }
5432            s.push_str(&alloc::format!("{id}"));
5433        }
5434        s.push(']');
5435        s
5436    } else {
5437        alloc::string::String::new()
5438    };
5439    for (idx, line) in lines.iter_mut().enumerate() {
5440        let trimmed = line.trim_start();
5441        let is_top_level = idx == 0;
5442        if is_top_level {
5443            line.push_str(&alloc::format!(" (rows={total_rows})"));
5444            continue;
5445        }
5446        if let Some(rest) = trimmed.strip_prefix("From: ") {
5447            let (name, scan_kind) = match rest.split_once(" [") {
5448                Some((n, k)) => (n.trim(), k.trim_end_matches(']')),
5449                None => (rest.trim(), ""),
5450            };
5451            let bare = name.split_whitespace().next().unwrap_or(name);
5452            let hot = catalog.get(bare).map(|t| t.rows().len());
5453            // v6.2.7 — `cold_segments=[id0,id1,…]` enumerates every
5454            // cold-tier segment the scan COULD have walked. v6.2.x
5455            // can tighten to per-table by walking the table's
5456            // BTree-index cold locators.
5457            let annot = match (hot, scan_kind) {
5458                (Some(h), "full scan") => {
5459                    let mut s = alloc::format!(" (hot_rows={h}");
5460                    if any_cold {
5461                        s.push_str(&alloc::format!(
5462                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5463                        ));
5464                    }
5465                    s.push(')');
5466                    s
5467                }
5468                (Some(h), "index seek") => {
5469                    let mut s = alloc::format!(" (hot_rows≤{h}");
5470                    if any_cold {
5471                        s.push_str(&alloc::format!(
5472                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5473                        ));
5474                    }
5475                    s.push(')');
5476                    s
5477                }
5478                _ => " (rows=—)".to_string(),
5479            };
5480            line.push_str(&annot);
5481            continue;
5482        }
5483        // Filter / GroupBy / Having / OrderBy / Limit / Join etc.
5484        line.push_str(" (rows=—)");
5485    }
5486}
5487
5488fn explain_select(stmt: &SelectStatement, engine: &Engine, depth: usize, out: &mut Vec<String>) {
5489    let pad = "  ".repeat(depth);
5490    // 1) Top-level operator label.
5491    let top = if !stmt.ctes.is_empty() {
5492        if stmt.ctes.iter().any(|c| c.recursive) {
5493            "CTEScan (WITH RECURSIVE)"
5494        } else {
5495            "CTEScan (WITH)"
5496        }
5497    } else if !stmt.unions.is_empty() {
5498        "UnionScan"
5499    } else if select_has_window(stmt) {
5500        "WindowAgg"
5501    } else if aggregate::uses_aggregate(stmt) {
5502        "Aggregate"
5503    } else if stmt.distinct {
5504        "Distinct"
5505    } else if stmt.from.is_some() {
5506        "TableScan"
5507    } else {
5508        "Result"
5509    };
5510    out.push(alloc::format!("{pad}{top}"));
5511    let child = "  ".repeat(depth + 1);
5512    // 2) CTE bodies.
5513    for cte in &stmt.ctes {
5514        let head = if cte.recursive {
5515            alloc::format!("{child}CTE (recursive): {}", cte.name)
5516        } else {
5517            alloc::format!("{child}CTE: {}", cte.name)
5518        };
5519        out.push(head);
5520        explain_select(&cte.body, engine, depth + 2, out);
5521    }
5522    // 3) FROM details — primary table + joins, index hits.
5523    if let Some(from) = &stmt.from {
5524        let mut tag = alloc::format!("{child}From: {}", from.primary.name);
5525        if let Some(alias) = &from.primary.alias {
5526            tag.push_str(&alloc::format!(" AS {alias}"));
5527        }
5528        // Try to detect an index-seek opportunity on WHERE against
5529        // the primary table — same heuristic the executor uses.
5530        if let Some(w) = &stmt.where_
5531            && let Some(table) = engine.active_catalog().get(&from.primary.name)
5532        {
5533            let alias = from.primary.alias.as_deref().unwrap_or(&from.primary.name);
5534            let cols = &table.schema().columns;
5535            if try_index_seek(w, cols, engine.active_catalog(), table, alias).is_some() {
5536                tag.push_str(" [index seek]");
5537            } else {
5538                tag.push_str(" [full scan]");
5539            }
5540        } else {
5541            tag.push_str(" [full scan]");
5542        }
5543        out.push(tag);
5544        for j in &from.joins {
5545            let kind = match j.kind {
5546                spg_sql::ast::JoinKind::Inner => "INNER JOIN",
5547                spg_sql::ast::JoinKind::Left => "LEFT JOIN",
5548                spg_sql::ast::JoinKind::Cross => "CROSS JOIN",
5549            };
5550            let mut s = alloc::format!("{child}{kind}: {}", j.table.name);
5551            if let Some(alias) = &j.table.alias {
5552                s.push_str(&alloc::format!(" AS {alias}"));
5553            }
5554            if j.on.is_some() {
5555                s.push_str(" (ON …)");
5556            }
5557            out.push(s);
5558        }
5559    }
5560    // 4) WHERE / GROUP BY / HAVING / ORDER BY / LIMIT / OFFSET.
5561    if let Some(w) = &stmt.where_ {
5562        let mut s = alloc::format!("{child}Filter: {w}");
5563        if expr_has_subquery(w) {
5564            s.push_str(" [subquery]");
5565        }
5566        out.push(s);
5567    }
5568    if let Some(gs) = &stmt.group_by {
5569        let mut parts = Vec::new();
5570        for g in gs {
5571            parts.push(alloc::format!("{g}"));
5572        }
5573        out.push(alloc::format!("{child}GroupBy: {}", parts.join(", ")));
5574    }
5575    if let Some(h) = &stmt.having {
5576        out.push(alloc::format!("{child}Having: {h}"));
5577    }
5578    for o in &stmt.order_by {
5579        let dir = if o.desc { "DESC" } else { "ASC" };
5580        out.push(alloc::format!("{child}OrderBy: {} {dir}", o.expr));
5581    }
5582    if let Some(lim) = stmt.limit {
5583        out.push(alloc::format!("{child}Limit: {lim}"));
5584    }
5585    if let Some(off) = stmt.offset {
5586        out.push(alloc::format!("{child}Offset: {off}"));
5587    }
5588    // 5) Projection — collapse Wildcard or render N items.
5589    if stmt
5590        .items
5591        .iter()
5592        .any(|it| matches!(it, SelectItem::Wildcard))
5593    {
5594        out.push(alloc::format!("{child}Project: *"));
5595    } else {
5596        out.push(alloc::format!(
5597            "{child}Project: {} item(s)",
5598            stmt.items.len()
5599        ));
5600    }
5601    // 6) Recurse into UNION peers.
5602    for (kind, peer) in &stmt.unions {
5603        let label = match kind {
5604            UnionKind::All => "UNION ALL",
5605            UnionKind::Distinct => "UNION",
5606        };
5607        out.push(alloc::format!("{child}{label}"));
5608        explain_select(peer, engine, depth + 2, out);
5609    }
5610}
5611
5612/// v4.23: recognise the engine errors that indicate the inner
5613/// SELECT couldn't be evaluated in isolation because it references
5614/// an outer column — used by `subquery_replacement` to skip
5615/// materialisation and let row-eval handle it instead.
5616fn is_correlation_error(e: &EngineError) -> bool {
5617    matches!(
5618        e,
5619        EngineError::Eval(
5620            eval::EvalError::ColumnNotFound { .. } | eval::EvalError::UnknownQualifier { .. }
5621        )
5622    )
5623}
5624
5625/// v4.23: walk every Expr in `stmt` and replace each Column ref
5626/// that targets the outer scope (qualifier matches the outer
5627/// table alias) with a Literal carrying the outer row's value.
5628/// Conservative: only qualified refs are substituted, so the user
5629/// must write `outer_alias.col` to reference an outer column. This
5630/// matches PG's lexical scoping for correlated subqueries and
5631/// avoids accidentally rebinding inner columns of the same name.
5632fn substitute_outer_columns(stmt: &mut SelectStatement, row: &Row, ctx: &EvalContext<'_>) {
5633    let Some(outer_alias) = ctx.table_alias else {
5634        return;
5635    };
5636    substitute_in_select(stmt, row, ctx, outer_alias);
5637}
5638
5639fn substitute_in_select(
5640    stmt: &mut SelectStatement,
5641    row: &Row,
5642    ctx: &EvalContext<'_>,
5643    outer_alias: &str,
5644) {
5645    for item in &mut stmt.items {
5646        if let SelectItem::Expr { expr, .. } = item {
5647            substitute_in_expr(expr, row, ctx, outer_alias);
5648        }
5649    }
5650    if let Some(w) = &mut stmt.where_ {
5651        substitute_in_expr(w, row, ctx, outer_alias);
5652    }
5653    if let Some(gs) = &mut stmt.group_by {
5654        for g in gs {
5655            substitute_in_expr(g, row, ctx, outer_alias);
5656        }
5657    }
5658    if let Some(h) = &mut stmt.having {
5659        substitute_in_expr(h, row, ctx, outer_alias);
5660    }
5661    for o in &mut stmt.order_by {
5662        substitute_in_expr(&mut o.expr, row, ctx, outer_alias);
5663    }
5664    for (_, peer) in &mut stmt.unions {
5665        substitute_in_select(peer, row, ctx, outer_alias);
5666    }
5667}
5668
5669fn substitute_in_expr(e: &mut Expr, row: &Row, ctx: &EvalContext<'_>, outer_alias: &str) {
5670    if let Expr::Column(c) = e
5671        && let Some(qual) = &c.qualifier
5672        && qual.eq_ignore_ascii_case(outer_alias)
5673    {
5674        // Look up the column's index in the outer schema.
5675        if let Some(idx) = ctx
5676            .columns
5677            .iter()
5678            .position(|sc| sc.name.eq_ignore_ascii_case(&c.name))
5679        {
5680            let v = row.values.get(idx).cloned().unwrap_or(Value::Null);
5681            if let Ok(lit) = value_to_literal_expr(v) {
5682                *e = lit;
5683                return;
5684            }
5685        }
5686    }
5687    match e {
5688        Expr::Binary { lhs, rhs, .. } => {
5689            substitute_in_expr(lhs, row, ctx, outer_alias);
5690            substitute_in_expr(rhs, row, ctx, outer_alias);
5691        }
5692        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5693            substitute_in_expr(expr, row, ctx, outer_alias);
5694        }
5695        Expr::Like { expr, pattern, .. } => {
5696            substitute_in_expr(expr, row, ctx, outer_alias);
5697            substitute_in_expr(pattern, row, ctx, outer_alias);
5698        }
5699        Expr::FunctionCall { args, .. } => {
5700            for a in args {
5701                substitute_in_expr(a, row, ctx, outer_alias);
5702            }
5703        }
5704        Expr::Extract { source, .. } => substitute_in_expr(source, row, ctx, outer_alias),
5705        Expr::WindowFunction {
5706            args,
5707            partition_by,
5708            order_by,
5709            ..
5710        } => {
5711            for a in args {
5712                substitute_in_expr(a, row, ctx, outer_alias);
5713            }
5714            for p in partition_by {
5715                substitute_in_expr(p, row, ctx, outer_alias);
5716            }
5717            for (o, _) in order_by {
5718                substitute_in_expr(o, row, ctx, outer_alias);
5719            }
5720        }
5721        Expr::ScalarSubquery(s) => substitute_in_select(s, row, ctx, outer_alias),
5722        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
5723            substitute_in_select(subquery, row, ctx, outer_alias);
5724        }
5725        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
5726    }
5727}
5728
5729/// v4.22: encode a Row to a comparable byte key for UNION-DISTINCT
5730/// dedup inside the recursive iteration. Crude but deterministic
5731/// — Debug prints embed type discriminants so NULL ≠ "" ≠ 0.
5732fn encode_row_key(row: &Row) -> Vec<u8> {
5733    let mut out = Vec::new();
5734    for v in &row.values {
5735        let s = alloc::format!("{v:?}|");
5736        out.extend_from_slice(s.as_bytes());
5737    }
5738    out
5739}
5740
5741fn select_has_window(stmt: &SelectStatement) -> bool {
5742    for item in &stmt.items {
5743        if let SelectItem::Expr { expr, .. } = item
5744            && expr_has_window(expr)
5745        {
5746            return true;
5747        }
5748    }
5749    false
5750}
5751
5752fn expr_has_window(e: &Expr) -> bool {
5753    match e {
5754        Expr::WindowFunction { .. } => true,
5755        Expr::Binary { lhs, rhs, .. } => expr_has_window(lhs) || expr_has_window(rhs),
5756        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5757            expr_has_window(expr)
5758        }
5759        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_window),
5760        Expr::Like { expr, pattern, .. } => expr_has_window(expr) || expr_has_window(pattern),
5761        Expr::Extract { source, .. } => expr_has_window(source),
5762        Expr::ScalarSubquery(_)
5763        | Expr::Exists { .. }
5764        | Expr::InSubquery { .. }
5765        | Expr::Literal(_)
5766        | Expr::Placeholder(_)
5767        | Expr::Column(_) => false,
5768    }
5769}
5770
5771fn collect_window_nodes(e: &Expr, out: &mut Vec<Expr>) {
5772    if let Expr::WindowFunction { .. } = e {
5773        // Deduplicate by structural equality on the expression
5774        // (cheap because window args + partition + order are
5775        // small). Without dedup we'd recompute identical windows
5776        // once per occurrence in the projection.
5777        if !out.iter().any(|x| x == e) {
5778            out.push(e.clone());
5779        }
5780        return;
5781    }
5782    match e {
5783        // Already handled by the early-return at the top.
5784        Expr::WindowFunction { .. } => unreachable!(),
5785        Expr::Binary { lhs, rhs, .. } => {
5786            collect_window_nodes(lhs, out);
5787            collect_window_nodes(rhs, out);
5788        }
5789        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5790            collect_window_nodes(expr, out);
5791        }
5792        Expr::FunctionCall { args, .. } => {
5793            for a in args {
5794                collect_window_nodes(a, out);
5795            }
5796        }
5797        Expr::Like { expr, pattern, .. } => {
5798            collect_window_nodes(expr, out);
5799            collect_window_nodes(pattern, out);
5800        }
5801        Expr::Extract { source, .. } => collect_window_nodes(source, out),
5802        _ => {}
5803    }
5804}
5805
5806fn rewrite_window_to_columns(e: &mut Expr, window_nodes: &[Expr]) {
5807    if let Expr::WindowFunction { .. } = e
5808        && let Some(idx) = window_nodes.iter().position(|w| w == e)
5809    {
5810        *e = Expr::Column(spg_sql::ast::ColumnName {
5811            qualifier: None,
5812            name: alloc::format!("__win_{idx}"),
5813        });
5814        return;
5815    }
5816    match e {
5817        Expr::Binary { lhs, rhs, .. } => {
5818            rewrite_window_to_columns(lhs, window_nodes);
5819            rewrite_window_to_columns(rhs, window_nodes);
5820        }
5821        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5822            rewrite_window_to_columns(expr, window_nodes);
5823        }
5824        Expr::FunctionCall { args, .. } => {
5825            for a in args {
5826                rewrite_window_to_columns(a, window_nodes);
5827            }
5828        }
5829        Expr::Like { expr, pattern, .. } => {
5830            rewrite_window_to_columns(expr, window_nodes);
5831            rewrite_window_to_columns(pattern, window_nodes);
5832        }
5833        Expr::Extract { source, .. } => rewrite_window_to_columns(source, window_nodes),
5834        _ => {}
5835    }
5836}
5837
5838/// Total order over partition-key tuples. NULL sorts as the
5839/// lowest value (matches the `<` partial order's NULL-last
5840/// behaviour with `INFINITY` flipped).
5841fn partition_key_cmp(a: &[Value], b: &[Value]) -> core::cmp::Ordering {
5842    for (x, y) in a.iter().zip(b.iter()) {
5843        let c = value_cmp(x, y);
5844        if c != core::cmp::Ordering::Equal {
5845            return c;
5846        }
5847    }
5848    a.len().cmp(&b.len())
5849}
5850
5851fn order_key_cmp(a: &[(Value, bool)], b: &[(Value, bool)]) -> core::cmp::Ordering {
5852    for ((va, desc), (vb, _)) in a.iter().zip(b.iter()) {
5853        let c = value_cmp(va, vb);
5854        let c = if *desc { c.reverse() } else { c };
5855        if c != core::cmp::Ordering::Equal {
5856            return c;
5857        }
5858    }
5859    a.len().cmp(&b.len())
5860}
5861
5862#[allow(clippy::match_same_arms)] // explicit arms per type document the supported pairs
5863fn value_cmp(a: &Value, b: &Value) -> core::cmp::Ordering {
5864    use core::cmp::Ordering;
5865    match (a, b) {
5866        (Value::Null, Value::Null) => Ordering::Equal,
5867        (Value::Null, _) => Ordering::Less,
5868        (_, Value::Null) => Ordering::Greater,
5869        (Value::Int(x), Value::Int(y)) => x.cmp(y),
5870        (Value::BigInt(x), Value::BigInt(y)) => x.cmp(y),
5871        (Value::SmallInt(x), Value::SmallInt(y)) => x.cmp(y),
5872        (Value::Text(x), Value::Text(y)) => x.cmp(y),
5873        (Value::Bool(x), Value::Bool(y)) => x.cmp(y),
5874        (Value::Float(x), Value::Float(y)) => x.partial_cmp(y).unwrap_or(Ordering::Equal),
5875        (Value::Date(x), Value::Date(y)) => x.cmp(y),
5876        (Value::Timestamp(x), Value::Timestamp(y)) => x.cmp(y),
5877        // Cross-type compare: fall back to the debug rendering —
5878        // same-partition is the goal, exact order is irrelevant.
5879        _ => alloc::format!("{a:?}").cmp(&alloc::format!("{b:?}")),
5880    }
5881}
5882
5883/// Compute the window function's per-row output for one partition.
5884/// `slice` has (partition key, order key, original-row-index)
5885/// tuples already sorted by order key. `filtered_rows` is the
5886/// full row list indexed by original-row-index. `out_vals` is
5887/// the destination, also indexed by original-row-index.
5888#[allow(
5889    clippy::too_many_arguments,
5890    clippy::cast_possible_truncation,
5891    clippy::cast_possible_wrap,
5892    clippy::cast_precision_loss,
5893    clippy::cast_sign_loss,
5894    clippy::doc_markdown,
5895    clippy::too_many_lines,
5896    clippy::type_complexity,
5897    clippy::match_same_arms
5898)]
5899fn compute_window_partition(
5900    name: &str,
5901    args: &[Expr],
5902    ordered: bool,
5903    frame: Option<&WindowFrame>,
5904    null_treatment: spg_sql::ast::NullTreatment,
5905    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
5906    filtered_rows: &[&Row],
5907    ctx: &EvalContext<'_>,
5908    out_vals: &mut [Value],
5909) -> Result<(), EngineError> {
5910    let ignore_nulls = matches!(null_treatment, spg_sql::ast::NullTreatment::Ignore);
5911    let lower = name.to_ascii_lowercase();
5912    match lower.as_str() {
5913        "row_number" => {
5914            for (rank, (_, _, idx)) in slice.iter().enumerate() {
5915                out_vals[*idx] = Value::BigInt((rank + 1) as i64);
5916            }
5917            Ok(())
5918        }
5919        "rank" => {
5920            let mut prev_key: Option<&[(Value, bool)]> = None;
5921            let mut current_rank: i64 = 1;
5922            for (i, (_, okey, idx)) in slice.iter().enumerate() {
5923                if let Some(p) = prev_key
5924                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
5925                {
5926                    current_rank = (i + 1) as i64;
5927                }
5928                if prev_key.is_none() {
5929                    current_rank = 1;
5930                }
5931                out_vals[*idx] = Value::BigInt(current_rank);
5932                prev_key = Some(okey.as_slice());
5933            }
5934            Ok(())
5935        }
5936        "dense_rank" => {
5937            let mut prev_key: Option<&[(Value, bool)]> = None;
5938            let mut current_rank: i64 = 0;
5939            for (_, okey, idx) in slice {
5940                if prev_key.is_none_or(|p| order_key_cmp(p, okey) != core::cmp::Ordering::Equal) {
5941                    current_rank += 1;
5942                }
5943                out_vals[*idx] = Value::BigInt(current_rank);
5944                prev_key = Some(okey.as_slice());
5945            }
5946            Ok(())
5947        }
5948        "sum" | "avg" | "min" | "max" | "count" | "count_star" => {
5949            // Pre-evaluate the function arg per row in the slice
5950            // (count_star has no arg).
5951            let arg_values: Vec<Value> = if lower == "count_star" || args.is_empty() {
5952                slice.iter().map(|_| Value::Null).collect()
5953            } else {
5954                slice
5955                    .iter()
5956                    .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
5957                    .collect::<Result<_, _>>()
5958                    .map_err(EngineError::Eval)?
5959            };
5960            // v4.20: pick the effective frame. Explicit frame
5961            // overrides the implicit default (running for ordered,
5962            // whole-partition for unordered).
5963            let eff = effective_frame(frame, ordered)?;
5964            #[allow(clippy::needless_range_loop)]
5965            for i in 0..slice.len() {
5966                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
5967                let mut sum: f64 = 0.0;
5968                let mut count: i64 = 0;
5969                let mut min_v: Option<f64> = None;
5970                let mut max_v: Option<f64> = None;
5971                let mut row_count: i64 = 0;
5972                if lo <= hi {
5973                    for j in lo..=hi {
5974                        let v = &arg_values[j];
5975                        match lower.as_str() {
5976                            "count_star" => row_count += 1,
5977                            "count" => {
5978                                if !v.is_null() {
5979                                    count += 1;
5980                                }
5981                            }
5982                            _ => {
5983                                if let Some(x) = value_to_f64(v) {
5984                                    sum += x;
5985                                    count += 1;
5986                                    min_v = Some(min_v.map_or(x, |m| m.min(x)));
5987                                    max_v = Some(max_v.map_or(x, |m| m.max(x)));
5988                                }
5989                            }
5990                        }
5991                    }
5992                }
5993                let value = match lower.as_str() {
5994                    "count_star" => Value::BigInt(row_count),
5995                    "count" => Value::BigInt(count),
5996                    "sum" => Value::Float(sum),
5997                    "avg" => {
5998                        if count == 0 {
5999                            Value::Null
6000                        } else {
6001                            Value::Float(sum / count as f64)
6002                        }
6003                    }
6004                    "min" => min_v.map_or(Value::Null, Value::Float),
6005                    "max" => max_v.map_or(Value::Null, Value::Float),
6006                    _ => unreachable!(),
6007                };
6008                let (_, _, idx) = &slice[i];
6009                out_vals[*idx] = value;
6010            }
6011            Ok(())
6012        }
6013        "lag" | "lead" => {
6014            // lag(expr [, offset [, default]])
6015            // lead(expr [, offset [, default]])
6016            if args.is_empty() {
6017                return Err(EngineError::Unsupported(alloc::format!(
6018                    "{lower}() requires at least one argument"
6019                )));
6020            }
6021            let offset: i64 = if args.len() >= 2 {
6022                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6023                    .map_err(EngineError::Eval)?;
6024                match v {
6025                    Value::SmallInt(n) => i64::from(n),
6026                    Value::Int(n) => i64::from(n),
6027                    Value::BigInt(n) => n,
6028                    _ => {
6029                        return Err(EngineError::Unsupported(alloc::format!(
6030                            "{lower}() offset must be integer"
6031                        )));
6032                    }
6033                }
6034            } else {
6035                1
6036            };
6037            let default: Value = if args.len() >= 3 {
6038                eval::eval_expr(&args[2], filtered_rows[slice[0].2], ctx)
6039                    .map_err(EngineError::Eval)?
6040            } else {
6041                Value::Null
6042            };
6043            let values: Vec<Value> = slice
6044                .iter()
6045                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6046                .collect::<Result<_, _>>()
6047                .map_err(EngineError::Eval)?;
6048            let n = slice.len();
6049            for (i, (_, _, idx)) in slice.iter().enumerate() {
6050                let signed_offset = if lower == "lag" { -offset } else { offset };
6051                let v = if ignore_nulls {
6052                    // v6.4.2 — IGNORE NULLS: walk in the offset direction
6053                    // skipping NULL values; the `offset`-th non-NULL
6054                    // encountered is the result.
6055                    let step: i64 = if signed_offset >= 0 { 1 } else { -1 };
6056                    let needed: i64 = signed_offset.abs();
6057                    if needed == 0 {
6058                        values[i].clone()
6059                    } else {
6060                        let mut j: i64 = i as i64;
6061                        let mut hits: i64 = 0;
6062                        let mut found: Option<Value> = None;
6063                        loop {
6064                            j += step;
6065                            if j < 0 || j >= n as i64 {
6066                                break;
6067                            }
6068                            #[allow(clippy::cast_sign_loss)]
6069                            let v = &values[j as usize];
6070                            if !v.is_null() {
6071                                hits += 1;
6072                                if hits == needed {
6073                                    found = Some(v.clone());
6074                                    break;
6075                                }
6076                            }
6077                        }
6078                        found.unwrap_or_else(|| default.clone())
6079                    }
6080                } else {
6081                    let target_signed = i64::try_from(i).unwrap_or(i64::MAX) + signed_offset;
6082                    if target_signed < 0
6083                        || target_signed >= i64::try_from(n).unwrap_or(i64::MAX)
6084                    {
6085                        default.clone()
6086                    } else {
6087                        #[allow(clippy::cast_sign_loss)]
6088                        {
6089                            values[target_signed as usize].clone()
6090                        }
6091                    }
6092                };
6093                out_vals[*idx] = v;
6094            }
6095            Ok(())
6096        }
6097        "first_value" | "last_value" | "nth_value" => {
6098            if args.is_empty() {
6099                return Err(EngineError::Unsupported(alloc::format!(
6100                    "{lower}() requires at least one argument"
6101                )));
6102            }
6103            let values: Vec<Value> = slice
6104                .iter()
6105                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6106                .collect::<Result<_, _>>()
6107                .map_err(EngineError::Eval)?;
6108            let nth: usize = if lower == "nth_value" {
6109                if args.len() < 2 {
6110                    return Err(EngineError::Unsupported(
6111                        "nth_value() requires (expr, n)".into(),
6112                    ));
6113                }
6114                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6115                    .map_err(EngineError::Eval)?;
6116                let raw = match v {
6117                    Value::SmallInt(n) => i64::from(n),
6118                    Value::Int(n) => i64::from(n),
6119                    Value::BigInt(n) => n,
6120                    _ => {
6121                        return Err(EngineError::Unsupported(
6122                            "nth_value() n must be integer".into(),
6123                        ));
6124                    }
6125                };
6126                if raw < 1 {
6127                    return Err(EngineError::Unsupported(
6128                        "nth_value() n must be >= 1".into(),
6129                    ));
6130                }
6131                #[allow(clippy::cast_sign_loss)]
6132                {
6133                    raw as usize
6134                }
6135            } else {
6136                0
6137            };
6138            let eff = effective_frame(frame, ordered)?;
6139            for i in 0..slice.len() {
6140                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6141                let (_, _, idx) = &slice[i];
6142                let v = if lo > hi {
6143                    Value::Null
6144                } else if ignore_nulls && matches!(lower.as_str(), "first_value" | "last_value") {
6145                    // v6.4.2 — IGNORE NULLS: skip NULL cells when
6146                    // selecting the boundary value within the frame.
6147                    if lower == "first_value" {
6148                        (lo..=hi)
6149                            .find_map(|j| {
6150                                let v = &values[j];
6151                                (!v.is_null()).then(|| v.clone())
6152                            })
6153                            .unwrap_or(Value::Null)
6154                    } else {
6155                        (lo..=hi)
6156                            .rev()
6157                            .find_map(|j| {
6158                                let v = &values[j];
6159                                (!v.is_null()).then(|| v.clone())
6160                            })
6161                            .unwrap_or(Value::Null)
6162                    }
6163                } else {
6164                    match lower.as_str() {
6165                        "first_value" => values[lo].clone(),
6166                        "last_value" => values[hi].clone(),
6167                        "nth_value" => {
6168                            let pos = lo + nth - 1;
6169                            if pos > hi {
6170                                Value::Null
6171                            } else {
6172                                values[pos].clone()
6173                            }
6174                        }
6175                        _ => unreachable!(),
6176                    }
6177                };
6178                out_vals[*idx] = v;
6179            }
6180            Ok(())
6181        }
6182        "ntile" => {
6183            if args.is_empty() {
6184                return Err(EngineError::Unsupported(
6185                    "ntile(n) requires an integer argument".into(),
6186                ));
6187            }
6188            let v = eval::eval_expr(&args[0], filtered_rows[slice[0].2], ctx)
6189                .map_err(EngineError::Eval)?;
6190            let bucket_count: i64 = match v {
6191                Value::SmallInt(n) => i64::from(n),
6192                Value::Int(n) => i64::from(n),
6193                Value::BigInt(n) => n,
6194                _ => {
6195                    return Err(EngineError::Unsupported(
6196                        "ntile() argument must be integer".into(),
6197                    ));
6198                }
6199            };
6200            if bucket_count < 1 {
6201                return Err(EngineError::Unsupported(
6202                    "ntile() argument must be >= 1".into(),
6203                ));
6204            }
6205            #[allow(clippy::cast_sign_loss)]
6206            let buckets = bucket_count as usize;
6207            let n = slice.len();
6208            // Each bucket gets `base` rows; the first `extras` buckets
6209            // get one extra. PG semantics.
6210            let base = n / buckets;
6211            let extras = n % buckets;
6212            let mut bucket: usize = 1;
6213            let mut remaining_in_bucket = if extras > 0 { base + 1 } else { base };
6214            let mut buckets_with_extra_remaining = extras;
6215            for (_, _, idx) in slice {
6216                if remaining_in_bucket == 0 {
6217                    bucket += 1;
6218                    buckets_with_extra_remaining = buckets_with_extra_remaining.saturating_sub(1);
6219                    remaining_in_bucket = if buckets_with_extra_remaining > 0 {
6220                        base + 1
6221                    } else {
6222                        base
6223                    };
6224                    // Edge: if base==0 and extras==0, all rows fit;
6225                    // shouldn't reach here, but guard anyway.
6226                    if remaining_in_bucket == 0 {
6227                        remaining_in_bucket = 1;
6228                    }
6229                }
6230                out_vals[*idx] = Value::BigInt(i64::try_from(bucket).unwrap_or(i64::MAX));
6231                remaining_in_bucket -= 1;
6232            }
6233            Ok(())
6234        }
6235        "percent_rank" => {
6236            // (rank - 1) / (n - 1) where rank is the standard RANK().
6237            // Single-row partitions get 0.
6238            let n = slice.len();
6239            let mut prev_key: Option<&[(Value, bool)]> = None;
6240            let mut current_rank: i64 = 1;
6241            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6242                if let Some(p) = prev_key
6243                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6244                {
6245                    current_rank = i64::try_from(i + 1).unwrap_or(i64::MAX);
6246                }
6247                if prev_key.is_none() {
6248                    current_rank = 1;
6249                }
6250                #[allow(clippy::cast_precision_loss)]
6251                let pr = if n <= 1 {
6252                    0.0
6253                } else {
6254                    (current_rank - 1) as f64 / (n - 1) as f64
6255                };
6256                out_vals[*idx] = Value::Float(pr);
6257                prev_key = Some(okey.as_slice());
6258            }
6259            Ok(())
6260        }
6261        "cume_dist" => {
6262            // # rows up to and including this row's peer group / n.
6263            let n = slice.len();
6264            // First pass: find peer-group-end rank for each row.
6265            for i in 0..slice.len() {
6266                let peer_end = peer_group_end(slice, i);
6267                #[allow(clippy::cast_precision_loss)]
6268                let cd = (peer_end + 1) as f64 / n as f64;
6269                let (_, _, idx) = &slice[i];
6270                out_vals[*idx] = Value::Float(cd);
6271            }
6272            Ok(())
6273        }
6274        other => Err(EngineError::Unsupported(alloc::format!(
6275            "window function {other:?} not supported (v4.21: row_number/rank/dense_rank/sum/avg/count/min/max/lag/lead/first_value/last_value/nth_value/ntile/percent_rank/cume_dist)"
6276        ))),
6277    }
6278}
6279
6280/// v4.20: resolve the user-provided frame down to a normalised
6281/// `(kind, start, end)`. `None` means default — derive from
6282/// `ordered`: ordered ⇒ RANGE UNBOUNDED PRECEDING AND CURRENT ROW,
6283/// unordered ⇒ ROWS UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
6284/// Single-bound shorthand (e.g. `ROWS 5 PRECEDING`) normalises
6285/// end → CURRENT ROW per the PG spec.
6286fn effective_frame(
6287    frame: Option<&WindowFrame>,
6288    ordered: bool,
6289) -> Result<(FrameKind, FrameBound, FrameBound), EngineError> {
6290    match frame {
6291        None => {
6292            if ordered {
6293                Ok((
6294                    FrameKind::Range,
6295                    FrameBound::UnboundedPreceding,
6296                    FrameBound::CurrentRow,
6297                ))
6298            } else {
6299                Ok((
6300                    FrameKind::Rows,
6301                    FrameBound::UnboundedPreceding,
6302                    FrameBound::UnboundedFollowing,
6303                ))
6304            }
6305        }
6306        Some(fr) => {
6307            let end = fr.end.clone().unwrap_or(FrameBound::CurrentRow);
6308            // Reject start > end (a few impossible combinations).
6309            if matches!(fr.start, FrameBound::UnboundedFollowing)
6310                || matches!(end, FrameBound::UnboundedPreceding)
6311            {
6312                return Err(EngineError::Unsupported(alloc::format!(
6313                    "invalid frame: start={:?} end={:?}",
6314                    fr.start,
6315                    end
6316                )));
6317            }
6318            // RANGE OFFSET PRECEDING / FOLLOWING needs value-typed
6319            // arithmetic on the ORDER BY key (e.g. `RANGE BETWEEN
6320            // INTERVAL '1 day' PRECEDING AND CURRENT ROW`). Not
6321            // implemented in v4.20.
6322            if fr.kind == FrameKind::Range
6323                && (matches!(
6324                    fr.start,
6325                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6326                ) || matches!(
6327                    end,
6328                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6329                ))
6330            {
6331                return Err(EngineError::Unsupported(
6332                    "RANGE with explicit offset bounds is not supported (v4.20: only UNBOUNDED / CURRENT ROW for RANGE)".into(),
6333                ));
6334            }
6335            Ok((fr.kind, fr.start.clone(), end))
6336        }
6337    }
6338}
6339
6340/// Compute `(lo, hi)` row-index bounds inside the partition slice
6341/// for the row at position `i`. Inclusive, clamped to
6342/// `[0, slice.len()-1]`. Empty result if `lo > hi`.
6343#[allow(clippy::type_complexity)]
6344fn frame_bounds_for_row(
6345    eff: &(FrameKind, FrameBound, FrameBound),
6346    i: usize,
6347    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6348) -> (usize, usize) {
6349    let (kind, start, end) = eff;
6350    let n = slice.len();
6351    let last = n.saturating_sub(1);
6352    let (mut lo, mut hi) = match kind {
6353        FrameKind::Rows => {
6354            let lo = match start {
6355                FrameBound::UnboundedPreceding => 0,
6356                FrameBound::OffsetPreceding(k) => {
6357                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6358                    i.saturating_sub(k)
6359                }
6360                FrameBound::CurrentRow => i,
6361                FrameBound::OffsetFollowing(k) => {
6362                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6363                    i.saturating_add(k).min(last)
6364                }
6365                FrameBound::UnboundedFollowing => last,
6366            };
6367            let hi = match end {
6368                FrameBound::UnboundedPreceding => 0,
6369                FrameBound::OffsetPreceding(k) => {
6370                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6371                    i.saturating_sub(k)
6372                }
6373                FrameBound::CurrentRow => i,
6374                FrameBound::OffsetFollowing(k) => {
6375                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6376                    i.saturating_add(k).min(last)
6377                }
6378                FrameBound::UnboundedFollowing => last,
6379            };
6380            (lo, hi)
6381        }
6382        FrameKind::Range => {
6383            // RANGE bounds are peer-aware. With only UNBOUNDED and
6384            // CURRENT ROW supported (rejected at effective_frame for
6385            // explicit offsets), the start/end map to the
6386            // partition's full extent at the same-order-key peer
6387            // group boundary.
6388            let lo = match start {
6389                FrameBound::UnboundedPreceding => 0,
6390                FrameBound::CurrentRow => peer_group_start(slice, i),
6391                FrameBound::UnboundedFollowing => last,
6392                _ => unreachable!("offset bounds rejected for RANGE"),
6393            };
6394            let hi = match end {
6395                FrameBound::UnboundedPreceding => 0,
6396                FrameBound::CurrentRow => peer_group_end(slice, i),
6397                FrameBound::UnboundedFollowing => last,
6398                _ => unreachable!("offset bounds rejected for RANGE"),
6399            };
6400            (lo, hi)
6401        }
6402    };
6403    if hi >= n {
6404        hi = last;
6405    }
6406    if lo >= n {
6407        lo = last;
6408    }
6409    (lo, hi)
6410}
6411
6412/// Find the inclusive index of the first row with the same ORDER
6413/// BY key as `slice[i]`. Slice is already sorted by partition then
6414/// order, so peers are contiguous.
6415#[allow(clippy::type_complexity)]
6416fn peer_group_start(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6417    let key = &slice[i].1;
6418    let mut j = i;
6419    while j > 0 && order_key_cmp(&slice[j - 1].1, key) == core::cmp::Ordering::Equal {
6420        j -= 1;
6421    }
6422    j
6423}
6424
6425/// Find the inclusive index of the last row with the same ORDER
6426/// BY key as `slice[i]`.
6427#[allow(clippy::type_complexity)]
6428fn peer_group_end(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6429    let key = &slice[i].1;
6430    let mut j = i;
6431    while j + 1 < slice.len() && order_key_cmp(&slice[j + 1].1, key) == core::cmp::Ordering::Equal {
6432        j += 1;
6433    }
6434    j
6435}
6436
6437fn value_to_f64(v: &Value) -> Option<f64> {
6438    match v {
6439        Value::SmallInt(n) => Some(f64::from(*n)),
6440        Value::Int(n) => Some(f64::from(*n)),
6441        #[allow(clippy::cast_precision_loss)]
6442        Value::BigInt(n) => Some(*n as f64),
6443        Value::Float(x) => Some(*x),
6444        _ => None,
6445    }
6446}
6447
6448/// Quick scan for any subquery-bearing node in a SELECT's WHERE /
6449/// projection / `order_by` — saves cloning the AST when there are
6450/// none (the common case).
6451fn expr_tree_has_subquery(stmt: &SelectStatement) -> bool {
6452    let mut any = false;
6453    for item in &stmt.items {
6454        if let SelectItem::Expr { expr, .. } = item {
6455            any = any || expr_has_subquery(expr);
6456        }
6457    }
6458    if let Some(w) = &stmt.where_ {
6459        any = any || expr_has_subquery(w);
6460    }
6461    if let Some(h) = &stmt.having {
6462        any = any || expr_has_subquery(h);
6463    }
6464    for o in &stmt.order_by {
6465        any = any || expr_has_subquery(&o.expr);
6466    }
6467    for (_, peer) in &stmt.unions {
6468        any = any || expr_tree_has_subquery(peer);
6469    }
6470    any
6471}
6472
6473fn expr_has_subquery(e: &Expr) -> bool {
6474    match e {
6475        Expr::ScalarSubquery(_) | Expr::Exists { .. } | Expr::InSubquery { .. } => true,
6476        Expr::Binary { lhs, rhs, .. } => expr_has_subquery(lhs) || expr_has_subquery(rhs),
6477        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6478            expr_has_subquery(expr)
6479        }
6480        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_subquery),
6481        Expr::Like { expr, pattern, .. } => expr_has_subquery(expr) || expr_has_subquery(pattern),
6482        Expr::Extract { source, .. } => expr_has_subquery(source),
6483        Expr::WindowFunction {
6484            args,
6485            partition_by,
6486            order_by,
6487            ..
6488        } => {
6489            args.iter().any(expr_has_subquery)
6490                || partition_by.iter().any(expr_has_subquery)
6491                || order_by.iter().any(|(e, _)| expr_has_subquery(e))
6492        }
6493        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
6494    }
6495}
6496
6497/// v4.10 helper: materialise a runtime `Value` back into an AST
6498/// `Expr::Literal` for the subquery-rewrite path. Supports the
6499/// types `Literal` can represent (Integer / Float / Text / Bool /
6500/// Null). Date / Timestamp / Numeric / Vector / Interval / JSON
6501/// would lose precision through Literal and aren't supported in
6502/// uncorrelated-subquery results; they error with a clear hint.
6503fn value_to_literal_expr(v: Value) -> Result<Expr, EngineError> {
6504    let lit = match v {
6505        Value::Null => Literal::Null,
6506        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
6507        Value::Int(n) => Literal::Integer(i64::from(n)),
6508        Value::BigInt(n) => Literal::Integer(n),
6509        Value::Float(x) => Literal::Float(x),
6510        Value::Text(s) | Value::Json(s) => Literal::String(s),
6511        Value::Bool(b) => Literal::Bool(b),
6512        other => {
6513            return Err(EngineError::Unsupported(alloc::format!(
6514                "subquery result type {:?} not yet materialisable; cast to text or integer in the inner SELECT",
6515                other.data_type()
6516            )));
6517        }
6518    };
6519    Ok(Expr::Literal(lit))
6520}
6521
6522/// v6.1.1 — walk the prepared `Statement` AST and replace every
6523/// `Expr::Placeholder(n)` with `Expr::Literal(value_to_literal(
6524/// params[n-1]))`. The dispatch downstream sees a `Statement`
6525/// indistinguishable from a simple-query parse, so the exec path
6526/// stays unchanged.
6527///
6528/// Errors fall into one shape: a `$N` references past the bound
6529/// `params.len()`. Out-of-range happens when the Bind didn't
6530/// supply enough values; pgwire surfaces this as a protocol error
6531/// to the client.
6532fn substitute_placeholders(stmt: &mut Statement, params: &[Value]) -> Result<(), EngineError> {
6533    match stmt {
6534        Statement::Select(s) => substitute_select(s, params)?,
6535        Statement::Insert(ins) => {
6536            for row in &mut ins.rows {
6537                for e in row {
6538                    substitute_expr(e, params)?;
6539                }
6540            }
6541        }
6542        Statement::Update(u) => {
6543            for (_, e) in &mut u.assignments {
6544                substitute_expr(e, params)?;
6545            }
6546            if let Some(w) = &mut u.where_ {
6547                substitute_expr(w, params)?;
6548            }
6549        }
6550        Statement::Delete(d) => {
6551            if let Some(w) = &mut d.where_ {
6552                substitute_expr(w, params)?;
6553            }
6554        }
6555        Statement::Explain(e) => substitute_select(&mut e.inner, params)?,
6556        // Other statements (CREATE / BEGIN / SHOW / …) have no
6557        // expression slots; no walk needed.
6558        _ => {}
6559    }
6560    Ok(())
6561}
6562
6563fn substitute_select(
6564    s: &mut SelectStatement,
6565    params: &[Value],
6566) -> Result<(), EngineError> {
6567    for item in &mut s.items {
6568        if let SelectItem::Expr { expr, .. } = item {
6569            substitute_expr(expr, params)?;
6570        }
6571    }
6572    if let Some(w) = &mut s.where_ {
6573        substitute_expr(w, params)?;
6574    }
6575    if let Some(gs) = &mut s.group_by {
6576        for g in gs {
6577            substitute_expr(g, params)?;
6578        }
6579    }
6580    if let Some(h) = &mut s.having {
6581        substitute_expr(h, params)?;
6582    }
6583    for o in &mut s.order_by {
6584        substitute_expr(&mut o.expr, params)?;
6585    }
6586    for (_, peer) in &mut s.unions {
6587        substitute_select(peer, params)?;
6588    }
6589    Ok(())
6590}
6591
6592fn substitute_expr(e: &mut Expr, params: &[Value]) -> Result<(), EngineError> {
6593    if let Expr::Placeholder(n) = e {
6594        let idx = usize::from(*n).saturating_sub(1);
6595        let v = params.get(idx).ok_or_else(|| {
6596            EngineError::Eval(EvalError::PlaceholderOutOfRange {
6597                n: *n,
6598                bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
6599            })
6600        })?;
6601        *e = Expr::Literal(value_to_literal(v.clone()));
6602        return Ok(());
6603    }
6604    match e {
6605        Expr::Binary { lhs, rhs, .. } => {
6606            substitute_expr(lhs, params)?;
6607            substitute_expr(rhs, params)?;
6608        }
6609        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6610            substitute_expr(expr, params)?;
6611        }
6612        Expr::FunctionCall { args, .. } => {
6613            for a in args {
6614                substitute_expr(a, params)?;
6615            }
6616        }
6617        Expr::Like { expr, pattern, .. } => {
6618            substitute_expr(expr, params)?;
6619            substitute_expr(pattern, params)?;
6620        }
6621        Expr::Extract { source, .. } => substitute_expr(source, params)?,
6622        Expr::ScalarSubquery(s) => substitute_select(s, params)?,
6623        Expr::Exists { subquery, .. } => substitute_select(subquery, params)?,
6624        Expr::InSubquery { expr, subquery, .. } => {
6625            substitute_expr(expr, params)?;
6626            substitute_select(subquery, params)?;
6627        }
6628        Expr::WindowFunction {
6629            args,
6630            partition_by,
6631            order_by,
6632            ..
6633        } => {
6634            for a in args {
6635                substitute_expr(a, params)?;
6636            }
6637            for p in partition_by {
6638                substitute_expr(p, params)?;
6639            }
6640            for (e, _) in order_by {
6641                substitute_expr(e, params)?;
6642            }
6643        }
6644        Expr::Literal(_) | Expr::Column(_) => {}
6645        // Already handled above.
6646        Expr::Placeholder(_) => unreachable!("Placeholder handled at top of fn"),
6647    }
6648    Ok(())
6649}
6650
6651/// v6.1.1 — convert a runtime `Value` into the closest matching
6652/// `Literal` for the substitute walker. Lossless for the simple
6653/// scalars (Int / Float / Text / Bool); Numeric / Date / Timestamp
6654/// / Json / Interval render as their canonical text form so the
6655/// downstream coerce_value can re-parse against the target column
6656/// type. SQ8 / HalfVector cells are NOT expected as bind params;
6657/// pgwire's Bind decodes vector params to the f32 representation
6658/// before they reach this helper.
6659/// v6.2.0 — total ordering on `Value`s used by ANALYZE to sort a
6660/// column's non-NULL sample before histogram building. Cross-type
6661/// pairs (Int vs Float, Date vs Timestamp, …) compare via the
6662/// same widening the eval-side `compare` operator uses; everything
6663/// else (the genuinely-incompatible pairs) falls back to ordering
6664/// by canonical string form so the sort is still total + stable.
6665/// Vector / SQ8 / Half / Json / Numeric / Interval values reach
6666/// here only via the string-fallback path because vector columns
6667/// are filtered out upstream.
6668fn sort_values_for_histogram(a: &Value, b: &Value) -> core::cmp::Ordering {
6669    use core::cmp::Ordering;
6670    match (a, b) {
6671        (Value::SmallInt(a), Value::SmallInt(b)) => a.cmp(b),
6672        (Value::Int(a), Value::Int(b)) => a.cmp(b),
6673        (Value::BigInt(a), Value::BigInt(b)) => a.cmp(b),
6674        (Value::SmallInt(a), Value::Int(b)) => i32::from(*a).cmp(b),
6675        (Value::Int(a), Value::SmallInt(b)) => a.cmp(&i32::from(*b)),
6676        (Value::Int(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
6677        (Value::BigInt(a), Value::Int(b)) => a.cmp(&i64::from(*b)),
6678        (Value::SmallInt(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
6679        (Value::BigInt(a), Value::SmallInt(b)) => a.cmp(&i64::from(*b)),
6680        (Value::Float(a), Value::Float(b)) => a.partial_cmp(b).unwrap_or(Ordering::Equal),
6681        (Value::Text(a), Value::Text(b)) | (Value::Json(a), Value::Json(b)) => a.cmp(b),
6682        (Value::Bool(a), Value::Bool(b)) => a.cmp(b),
6683        (Value::Date(a), Value::Date(b)) => a.cmp(b),
6684        (Value::Timestamp(a), Value::Timestamp(b)) => a.cmp(b),
6685        // Mixed numeric/float — widen to f64 and compare.
6686        (Value::SmallInt(n), Value::Float(x)) => {
6687            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
6688        }
6689        (Value::Float(x), Value::SmallInt(n)) => {
6690            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
6691        }
6692        (Value::Int(n), Value::Float(x)) => {
6693            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
6694        }
6695        (Value::Float(x), Value::Int(n)) => {
6696            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
6697        }
6698        (Value::BigInt(n), Value::Float(x)) => {
6699            #[allow(clippy::cast_precision_loss)]
6700            let nf = *n as f64;
6701            nf.partial_cmp(x).unwrap_or(Ordering::Equal)
6702        }
6703        (Value::Float(x), Value::BigInt(n)) => {
6704            #[allow(clippy::cast_precision_loss)]
6705            let nf = *n as f64;
6706            x.partial_cmp(&nf).unwrap_or(Ordering::Equal)
6707        }
6708        // Cross-type fallback: lexicographic on canonical form.
6709        // Total + stable so the sort is well-defined.
6710        _ => canonical_value_repr(a).cmp(&canonical_value_repr(b)),
6711    }
6712}
6713
/// v6.2.0 — renders histogram bounds as a single `[v0, v1, ...]`
/// string (for the `spg_statistic.histogram_bounds` column).
///
/// Elements are separated by `", "`. An element is wrapped in double
/// quotes when it is empty or contains `,`, `[`, `]` or `"`; inside the
/// quotes, `"` and `\` are prefixed with a backslash. All other
/// elements are emitted verbatim.
fn render_histogram_bounds(bounds: &[alloc::string::String]) -> alloc::string::String {
    let mut rendered = alloc::string::String::with_capacity(bounds.len() * 8 + 2);
    rendered.push('[');
    let mut first = true;
    for bound in bounds {
        if !first {
            rendered.push_str(", ");
        }
        first = false;

        let is_plain = !bound.is_empty()
            && !bound.chars().any(|c| matches!(c, ',' | '[' | ']' | '"'));
        if is_plain {
            rendered.push_str(bound);
            continue;
        }

        rendered.push('"');
        for ch in bound.chars() {
            if matches!(ch, '"' | '\\') {
                rendered.push('\\');
            }
            rendered.push(ch);
        }
        rendered.push('"');
    }
    rendered.push(']');
    rendered
}
6744
6745/// v6.2.0 — canonical textual form of a `Value` for histogram
6746/// bound storage. Strings used by ANALYZE for sort + bound output.
6747/// INT / BIGINT → decimal; FLOAT → shortest-round-trip via
6748/// `{:?}`; TEXT pass-through; BOOL → `t` / `f`; DATE / TIMESTAMP →
6749/// the same form `format_date` / `format_timestamp` produce for
6750/// SQL Display. Vector / SQ8 / Half / Json / Numeric / Interval
6751/// reach this only via a non-Vector column (vector columns are
6752/// skipped upstream); they fall back to a Debug-derived form so
6753/// stats still serialise without crashing.
6754pub(crate) fn canonical_value_repr(v: &Value) -> alloc::string::String {
6755    match v {
6756        Value::Null => "NULL".to_string(),
6757        Value::SmallInt(n) => alloc::format!("{n}"),
6758        Value::Int(n) => alloc::format!("{n}"),
6759        Value::BigInt(n) => alloc::format!("{n}"),
6760        Value::Float(x) => alloc::format!("{x:?}"),
6761        Value::Text(s) | Value::Json(s) => s.clone(),
6762        Value::Bool(b) => if *b { "t" } else { "f" }.to_string(),
6763        Value::Date(d) => eval::format_date(*d),
6764        Value::Timestamp(t) => eval::format_timestamp(*t),
6765        Value::Interval { months, micros } => eval::format_interval(*months, *micros),
6766        Value::Numeric { scaled, scale } => eval::format_numeric(*scaled, *scale),
6767        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
6768            // Unreachable in practice (vector columns are filtered
6769            // out before this). Defensive fallback so a future
6770            // vector-stats path doesn't crash.
6771            alloc::format!("{v:?}")
6772        }
6773        // v7.5.0 — Value is #[non_exhaustive] for downstream
6774        // forward-compat. Future variants fall through to Debug
6775        // form here (same shape as the vector fallback above).
6776        _ => alloc::format!("{v:?}"),
6777    }
6778}
6779
/// v6.2.0 — hook for excluding engine-managed catalog tables from a
/// bare `ANALYZE` (one with no target table).
///
/// Currently a placeholder: it returns `false` for every name, so no
/// table is skipped. Per the original note, publications,
/// subscriptions, users and statistics are held as engine fields rather
/// than catalog tables, so there is nothing to exclude yet.
const fn is_internal_table_name(_name: &str) -> bool {
    false
}
6789
6790fn value_to_literal(v: Value) -> Literal {
6791    match v {
6792        Value::Null => Literal::Null,
6793        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
6794        Value::Int(n) => Literal::Integer(i64::from(n)),
6795        Value::BigInt(n) => Literal::Integer(n),
6796        Value::Float(x) => Literal::Float(x),
6797        Value::Text(s) | Value::Json(s) => Literal::String(s),
6798        Value::Bool(b) => Literal::Bool(b),
6799        Value::Vector(v) => Literal::Vector(v),
6800        Value::Numeric { scaled, scale } => {
6801            Literal::String(eval::format_numeric(scaled, scale))
6802        }
6803        Value::Date(d) => Literal::String(eval::format_date(d)),
6804        Value::Timestamp(t) => Literal::String(eval::format_timestamp(t)),
6805        Value::Interval { months, micros } => Literal::Interval {
6806            months,
6807            micros,
6808            text: eval::format_interval(months, micros),
6809        },
6810        // SQ8 / halfvec cells dequantise to f32 before reaching the
6811        // substitute walker; pgwire's Bind path handles that.
6812        Value::Sq8Vector(q) => Literal::Vector(spg_storage::quantize::dequantize(&q)),
6813        Value::HalfVector(h) => Literal::Vector(h.to_f32_vec()),
6814        // v7.5.0 — Value is #[non_exhaustive]; future variants
6815        // render as Debug-form String literal until explicit
6816        // mapping is added.
6817        v => Literal::String(alloc::format!("{v:?}")),
6818    }
6819}
6820
6821fn rewrite_clock_calls(stmt: &mut Statement, now_micros: Option<i64>) {
6822    let Some(now) = now_micros else {
6823        return;
6824    };
6825    match stmt {
6826        Statement::Select(s) => rewrite_select_clock(s, now),
6827        Statement::Insert(ins) => {
6828            for row in &mut ins.rows {
6829                for e in row {
6830                    rewrite_expr_clock(e, now);
6831                }
6832            }
6833        }
6834        _ => {}
6835    }
6836}
6837
6838fn rewrite_select_clock(s: &mut SelectStatement, now: i64) {
6839    for item in &mut s.items {
6840        if let SelectItem::Expr { expr, .. } = item {
6841            rewrite_expr_clock(expr, now);
6842        }
6843    }
6844    if let Some(w) = &mut s.where_ {
6845        rewrite_expr_clock(w, now);
6846    }
6847    if let Some(gs) = &mut s.group_by {
6848        for g in gs {
6849            rewrite_expr_clock(g, now);
6850        }
6851    }
6852    if let Some(h) = &mut s.having {
6853        rewrite_expr_clock(h, now);
6854    }
6855    for o in &mut s.order_by {
6856        rewrite_expr_clock(&mut o.expr, now);
6857    }
6858    for (_, peer) in &mut s.unions {
6859        rewrite_select_clock(peer, now);
6860    }
6861}
6862
6863/// v3.0.3 hot path: every recursion lands in exactly one `match` arm.
6864/// Literal / Column-with-qualifier (the dominant cases on a typical
6865/// AST) take a single pattern dispatch and exit. The clock-rewrite
6866/// targets (zero-arg `NOW` / `CURRENT_TIMESTAMP` / `CURRENT_DATE`
6867/// functions, and bare `CURRENT_TIMESTAMP` / `CURRENT_DATE` column
6868/// refs) sit on their own arms with match guards so the fall-through
6869/// to the recursive arms is unambiguous.
6870fn rewrite_expr_clock(e: &mut Expr, now: i64) {
6871    // Fast-path test on the no-recursion shapes first. We can't fold
6872    // them into the big match below because they need to *replace* `e`
6873    // outright; the recursive arms below match on its sub-fields.
6874    if let Some(replacement) = clock_replacement_for(e, now) {
6875        *e = replacement;
6876        return;
6877    }
6878    match e {
6879        Expr::Binary { lhs, rhs, .. } => {
6880            rewrite_expr_clock(lhs, now);
6881            rewrite_expr_clock(rhs, now);
6882        }
6883        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6884            rewrite_expr_clock(expr, now);
6885        }
6886        Expr::FunctionCall { args, .. } => {
6887            for a in args {
6888                rewrite_expr_clock(a, now);
6889            }
6890        }
6891        Expr::Like { expr, pattern, .. } => {
6892            rewrite_expr_clock(expr, now);
6893            rewrite_expr_clock(pattern, now);
6894        }
6895        Expr::Extract { source, .. } => rewrite_expr_clock(source, now),
6896        // v4.10 subquery nodes — recurse into the inner SELECT's
6897        // expression slots so e.g. SELECT NOW() in a scalar
6898        // subquery picks up the same instant as the outer query.
6899        Expr::ScalarSubquery(s) => rewrite_select_clock(s, now),
6900        Expr::Exists { subquery, .. } => rewrite_select_clock(subquery, now),
6901        Expr::InSubquery { expr, subquery, .. } => {
6902            rewrite_expr_clock(expr, now);
6903            rewrite_select_clock(subquery, now);
6904        }
6905        // v4.12 window functions — args + PARTITION BY + ORDER BY
6906        // may all reference clock literals.
6907        Expr::WindowFunction {
6908            args,
6909            partition_by,
6910            order_by,
6911            ..
6912        } => {
6913            for a in args {
6914                rewrite_expr_clock(a, now);
6915            }
6916            for p in partition_by {
6917                rewrite_expr_clock(p, now);
6918            }
6919            for (e, _) in order_by {
6920                rewrite_expr_clock(e, now);
6921            }
6922        }
6923        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
6924    }
6925}
6926
6927/// Returns `Some(Expr)` when `e` is one of the clock-call shapes that
6928/// must be rewritten; otherwise `None` so the caller falls through to
6929/// the recursive walk. Identifies both function-call forms (`NOW()` /
6930/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()`) and bare-identifier forms
6931/// (`CURRENT_TIMESTAMP` / `CURRENT_DATE` as unqualified column refs,
6932/// which is how PG accepts them without parens).
6933fn clock_replacement_for(e: &Expr, now: i64) -> Option<Expr> {
6934    let (kind, name) = match e {
6935        Expr::FunctionCall { name, args } if args.is_empty() => (ClockSite::Fn, name.as_str()),
6936        Expr::Column(c) if c.qualifier.is_none() => (ClockSite::BareIdent, c.name.as_str()),
6937        _ => return None,
6938    };
6939    // ASCII case-insensitive name match. Limited to the three keywords
6940    // that actually need rewriting.
6941    let matched = match name.len() {
6942        3 if kind == ClockSite::Fn && name.eq_ignore_ascii_case("now") => Some(true),
6943        12 if name.eq_ignore_ascii_case("current_date") => Some(false),
6944        17 if name.eq_ignore_ascii_case("current_timestamp") => Some(true),
6945        _ => None,
6946    };
6947    let is_timestamp = matched?;
6948    let payload = if is_timestamp {
6949        now
6950    } else {
6951        now.div_euclid(86_400_000_000)
6952    };
6953    let target = if is_timestamp {
6954        spg_sql::ast::CastTarget::Timestamp
6955    } else {
6956        spg_sql::ast::CastTarget::Date
6957    };
6958    Some(Expr::Cast {
6959        expr: alloc::boxed::Box::new(Expr::Literal(spg_sql::ast::Literal::Integer(payload))),
6960        target,
6961    })
6962}
6963
/// Syntactic position in which a candidate clock keyword was found.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ClockSite {
    /// A function call with an empty argument list, e.g. `NOW()`.
    Fn,
    /// An unqualified column reference, e.g. a bare `CURRENT_DATE`.
    BareIdent,
}
6969
6970/// `ORDER BY <integer>` references the N-th SELECT item (1-based).
6971/// Swap the integer literal for the matching item's expression so the
6972/// executor doesn't need a special-case branch. Recurses into UNION
6973/// peers because each peer keeps its own SELECT list.
6974/// v6.4.1 — expand `GROUP BY ALL` to every non-aggregate SELECT-list
6975/// item. Mirrors DuckDB / PG 19 semantics. Wildcards (`SELECT * …`)
6976/// are NOT expanded by GROUP BY ALL (PG 19 leaves the wildcard intact
6977/// and groups by whatever explicit non-aggregates remain — none in
6978/// the wildcard-only case, which still works for non-aggregate
6979/// queries).
6980fn expand_group_by_all(s: &mut SelectStatement) {
6981    if !s.group_by_all {
6982        for (_, peer) in &mut s.unions {
6983            expand_group_by_all(peer);
6984        }
6985        return;
6986    }
6987    let mut groups: Vec<Expr> = Vec::new();
6988    for item in &s.items {
6989        if let SelectItem::Expr { expr, .. } = item
6990            && !aggregate::contains_aggregate(expr)
6991        {
6992            groups.push(expr.clone());
6993        }
6994    }
6995    s.group_by = Some(groups);
6996    s.group_by_all = false;
6997    for (_, peer) in &mut s.unions {
6998        expand_group_by_all(peer);
6999    }
7000}
7001
7002fn resolve_order_by_position(s: &mut SelectStatement) {
7003    // v6.4.0 — iterate every ORDER BY key. Position references
7004    // (`ORDER BY 2`) bind to the 1-based projection index;
7005    // identifier references that match a SELECT-list alias bind to
7006    // the projected expression (Step 4 of L3a).
7007    for order in &mut s.order_by {
7008        match &order.expr {
7009            Expr::Literal(Literal::Integer(n)) if *n >= 1 => {
7010                if let Ok(idx_one_based) = usize::try_from(*n) {
7011                    let idx = idx_one_based - 1;
7012                    if idx < s.items.len()
7013                        && let SelectItem::Expr { expr, .. } = &s.items[idx]
7014                    {
7015                        order.expr = expr.clone();
7016                    }
7017                }
7018            }
7019            Expr::Column(c) if c.qualifier.is_none() => {
7020                // Alias-in-ORDER-BY lookup.
7021                for item in &s.items {
7022                    if let SelectItem::Expr {
7023                        expr,
7024                        alias: Some(a),
7025                    } = item
7026                        && a == &c.name
7027                    {
7028                        order.expr = expr.clone();
7029                        break;
7030                    }
7031                }
7032            }
7033            _ => {}
7034        }
7035    }
7036    for (_, peer) in &mut s.unions {
7037        resolve_order_by_position(peer);
7038    }
7039}
7040
7041/// Sort `tagged` by `f64` key, reversing the comparator under DESC.
7042/// Used by the UNION ORDER BY path; per-block paths inline the same
7043/// comparator because they already hold `&OrderBy` directly.
7044/// v3.1.1: partial-sort helper. When `keep` (= offset + limit) is
7045/// strictly less than `tagged.len()`, run `select_nth_unstable_by` to
7046/// partition the prefix in O(n), then sort just that prefix in O(k
7047/// log k). Total O(n + k log k), vs O(n log n) for a full sort. The
7048/// caller decides what `keep` is; passing `None` (no LIMIT) keeps the
7049/// full-sort behaviour.
7050///
7051/// `tagged` holds `(Option<f64>, Row)` (the SELECT path) — `None` keys
7052/// sort last in ascending order, mirroring NULL-sorts-last in SQL.
7053fn partial_sort_tagged(
7054    tagged: &mut Vec<(Vec<f64>, Row)>,
7055    keep: Option<usize>,
7056    descs: &[bool],
7057) {
7058    let cmp = |a: &(Vec<f64>, Row), b: &(Vec<f64>, Row)| cmp_multi_key(&a.0, &b.0, descs);
7059    match keep {
7060        Some(k) if k < tagged.len() && k > 0 => {
7061            let pivot = k - 1;
7062            tagged.select_nth_unstable_by(pivot, cmp);
7063            tagged[..k].sort_by(cmp);
7064            tagged.truncate(k);
7065        }
7066        _ => {
7067            tagged.sort_by(cmp);
7068        }
7069    }
7070}
7071
7072fn sort_by_keys(tagged: &mut [(Vec<f64>, Row)], descs: &[bool]) {
7073    tagged.sort_by(|a, b| cmp_multi_key(&a.0, &b.0, descs));
7074}
7075
/// v6.4.0 — multi-key ORDER BY comparator.
///
/// Compares `a` and `b` key by key (up to the shorter length) and
/// returns the first non-equal result. Key `i` is compared in reverse
/// when `descs[i]` is `true`; a missing flag means ascending.
///
/// NULL is encoded by the caller as `f64::INFINITY`, so it sorts last
/// in ASC and first in DESC (matches PG default).
///
/// NaN keys are ordered after every non-NaN key (including the
/// `INFINITY` NULL marker) and compare equal to each other. Treating
/// NaN as equal to everything, as `partial_cmp(..).unwrap_or(Equal)`
/// did, is not a total order, and the sort routines that use this
/// comparator may panic or return an unspecified order in that case.
/// Results for non-NaN keys are unchanged.
fn cmp_multi_key(a: &[f64], b: &[f64], descs: &[bool]) -> core::cmp::Ordering {
    use core::cmp::Ordering;
    for (i, (ka, kb)) in a.iter().zip(b.iter()).enumerate() {
        // `partial_cmp` is `None` only when at least one side is NaN;
        // ordering by "is NaN" then places NaN last and NaN == NaN.
        let ord = ka
            .partial_cmp(kb)
            .unwrap_or_else(|| ka.is_nan().cmp(&kb.is_nan()));
        let ord = if descs.get(i).copied().unwrap_or(false) {
            ord.reverse()
        } else {
            ord
        };
        if ord != Ordering::Equal {
            return ord;
        }
    }
    Ordering::Equal
}
7094
7095/// v6.4.0 — eval every ORDER BY expression for a row and pack the
7096/// resulting keys into a `Vec<f64>`. NULL → `f64::INFINITY`.
7097fn build_order_keys(
7098    order_by: &[OrderBy],
7099    row: &Row,
7100    ctx: &EvalContext,
7101) -> Result<Vec<f64>, EngineError> {
7102    let mut keys = Vec::with_capacity(order_by.len());
7103    for o in order_by {
7104        let v = eval::eval_expr(&o.expr, row, ctx)?;
7105        keys.push(value_to_order_key(&v)?);
7106    }
7107    Ok(keys)
7108}
7109
7110/// Drop the first `offset` rows then truncate to `limit`. PG / `MySQL`
7111/// agree: OFFSET applies *after* ORDER BY but *before* LIMIT (so
7112/// `LIMIT 10 OFFSET 5` keeps rows 6..=15).
7113fn apply_offset_and_limit(rows: &mut Vec<Row>, offset: Option<u32>, limit: Option<u32>) {
7114    if let Some(off) = offset {
7115        let off = off as usize;
7116        if off >= rows.len() {
7117            rows.clear();
7118        } else {
7119            rows.drain(..off);
7120        }
7121    }
7122    if let Some(n) = limit {
7123        rows.truncate(n as usize);
7124    }
7125}
7126
7127/// v7.6.1 — resolve a parser-level `ForeignKeyConstraint` (column
7128/// names + parent table name) into the storage-layer shape (column
7129/// indices + same parent table). Validates everything the engine
7130/// needs to know about the FK at CREATE TABLE time:
7131///
7132///   - parent table exists (catalog lookup, unless self-referencing)
7133///   - parent columns exist on the parent table
7134///   - parent column list matches the local arity (defaults to the
7135///     parent's primary index column when omitted)
7136///   - parent columns are covered by a `BTree` UNIQUE-class index
7137///     (SPG's stand-in for `PRIMARY KEY`/`UNIQUE`) — required so
7138///     the v7.6.2 INSERT path can do an O(log n) parent lookup
7139///   - local columns exist on the table being created
7140fn resolve_foreign_key(
7141    local_table_name: &str,
7142    local_cols: &[ColumnSchema],
7143    fk: spg_sql::ast::ForeignKeyConstraint,
7144    catalog: &Catalog,
7145) -> Result<spg_storage::ForeignKeyConstraint, EngineError> {
7146    // Resolve local columns.
7147    let mut local_columns = Vec::with_capacity(fk.columns.len());
7148    for name in &fk.columns {
7149        let pos = local_cols
7150            .iter()
7151            .position(|c| c.name == *name)
7152            .ok_or_else(|| {
7153                EngineError::Unsupported(alloc::format!(
7154                    "FOREIGN KEY references unknown local column {name:?}"
7155                ))
7156            })?;
7157        local_columns.push(pos);
7158    }
7159    // Self-referencing FK: parent table is the one we're creating.
7160    // The parent column resolution uses the local column list since
7161    // the catalog doesn't have this table yet.
7162    let is_self_ref = fk.parent_table == local_table_name;
7163    let (parent_cols_for_lookup, parent_table_str): (&[ColumnSchema], &str) = if is_self_ref {
7164        (local_cols, local_table_name)
7165    } else {
7166        let parent_table = catalog.get(&fk.parent_table).ok_or_else(|| {
7167            EngineError::Storage(StorageError::TableNotFound {
7168                name: fk.parent_table.clone(),
7169            })
7170        })?;
7171        (parent_table.schema().columns.as_slice(), fk.parent_table.as_str())
7172    };
7173    // Resolve parent column names → positions. If the FK omitted the
7174    // parent column list, fall back to the parent's primary index
7175    // column (single-column only — composite default is rejected
7176    // because there's no unambiguous "PK" in SPG's index list).
7177    let parent_columns: Vec<usize> = if fk.parent_columns.is_empty() {
7178        if fk.columns.len() != 1 {
7179            return Err(EngineError::Unsupported(
7180                "composite FOREIGN KEY without explicit parent column list is not supported \
7181                 — list the parent columns explicitly"
7182                    .into(),
7183            ));
7184        }
7185        // Find a single BTree index on the parent and use its column.
7186        let pos = pick_pk_index_column(catalog, parent_table_str, is_self_ref, local_cols)
7187            .ok_or_else(|| {
7188                EngineError::Unsupported(alloc::format!(
7189                    "parent table {parent_table_str:?} has no PRIMARY-key / UNIQUE BTree index \
7190                     to default the FOREIGN KEY against"
7191                ))
7192            })?;
7193        alloc::vec![pos]
7194    } else {
7195        let mut out = Vec::with_capacity(fk.parent_columns.len());
7196        for name in &fk.parent_columns {
7197            let pos = parent_cols_for_lookup
7198                .iter()
7199                .position(|c| c.name == *name)
7200                .ok_or_else(|| {
7201                    EngineError::Unsupported(alloc::format!(
7202                        "FOREIGN KEY references unknown parent column \
7203                         {name:?} on table {parent_table_str:?}"
7204                    ))
7205                })?;
7206            out.push(pos);
7207        }
7208        out
7209    };
7210    if parent_columns.len() != local_columns.len() {
7211        return Err(EngineError::Unsupported(alloc::format!(
7212            "FOREIGN KEY arity mismatch: {} local columns vs {} parent columns",
7213            local_columns.len(),
7214            parent_columns.len()
7215        )));
7216    }
7217    // For non-self-referencing FKs, verify the parent column set is
7218    // covered by a BTree index. SPG doesn't have a `PRIMARY KEY`
7219    // declaration; the convention is "the parent column for FK
7220    // purposes must have a BTree index" — which the user creates via
7221    // `CREATE INDEX ... USING btree (col)` (the default). We accept
7222    // any single-column BTree index that covers a parent column;
7223    // composite parent column lists require an index whose `column_position`
7224    // matches the first parent column (multi-column BTree indices
7225    // are not in the v7.x roadmap).
7226    if !is_self_ref {
7227        let parent_table = catalog
7228            .get(&fk.parent_table)
7229            .expect("checked above");
7230        let primary_parent_col = parent_columns[0];
7231        let has_btree = parent_table.schema().columns.get(primary_parent_col).is_some()
7232            && parent_table
7233                .indices()
7234                .iter()
7235                .any(|idx| {
7236                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7237                        && idx.column_position == primary_parent_col
7238                        && idx.partial_predicate.is_none()
7239                });
7240        if !has_btree {
7241            return Err(EngineError::Unsupported(alloc::format!(
7242                "FOREIGN KEY parent column on {:?} is not covered by an unconditional BTree \
7243                 index — create one with `CREATE INDEX ... ON {} ({})` first",
7244                parent_table_str,
7245                parent_table_str,
7246                parent_table.schema().columns[primary_parent_col].name,
7247            )));
7248        }
7249    }
7250    let on_delete = fk_action_sql_to_storage(fk.on_delete);
7251    let on_update = fk_action_sql_to_storage(fk.on_update);
7252    Ok(spg_storage::ForeignKeyConstraint {
7253        name: fk.name,
7254        local_columns,
7255        parent_table: fk.parent_table,
7256        parent_columns,
7257        on_delete,
7258        on_update,
7259    })
7260}
7261
7262/// v7.6.1 — pick a sentinel "primary key" column from the parent
7263/// table when the FK didn't name parent columns. Picks the first
7264/// single-column unconditional BTree index — that's the closest
7265/// thing SPG has to a PRIMARY KEY today. Self-referencing FKs use
7266/// `local_cols` as the column source.
7267fn pick_pk_index_column(
7268    catalog: &Catalog,
7269    parent_name: &str,
7270    is_self_ref: bool,
7271    local_cols: &[ColumnSchema],
7272) -> Option<usize> {
7273    if is_self_ref {
7274        // Self-ref FK omitted parent columns: pick column 0 by
7275        // convention (no catalog entry yet). Engine will widen this
7276        // when v7.6.7 lands; v7.6.1 only handles the explicit form.
7277        let _ = local_cols;
7278        return Some(0);
7279    }
7280    let parent = catalog.get(parent_name)?;
7281    parent.indices().iter().find_map(|idx| {
7282        if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7283            && idx.partial_predicate.is_none()
7284            && idx.included_columns.is_empty()
7285            && idx.expression.is_none()
7286        {
7287            Some(idx.column_position)
7288        } else {
7289            None
7290        }
7291    })
7292}
7293
7294/// v7.6.2 / v7.6.7 — INSERT-side FK enforcement. For every row
7295/// about to be inserted into `child_table`, every FK declared on
7296/// that table is checked: the row's FK columns must either be
7297/// NULL (SQL spec skip) or match an existing parent row via the
7298/// parent's BTree PK / UNIQUE index.
7299///
7300/// Returns `EngineError::Unsupported` with a `FOREIGN KEY violation`
7301/// payload on first failure.
7302///
7303/// **Self-referencing FKs (v7.6.7 widening):** when `fk.parent_table
7304/// == child_table`, the parent rows visible to this check are
7305///  (a) rows already committed to the table, plus
7306///  (b) earlier rows from the *same* `rows` batch.
7307/// This makes `INSERT INTO tree VALUES (1, NULL), (2, 1), (3, 2)`
7308/// work in a single statement — common pattern for bulk-loading
7309/// hierarchies.
7310fn enforce_fk_inserts(
7311    catalog: &Catalog,
7312    child_table: &str,
7313    fks: &[spg_storage::ForeignKeyConstraint],
7314    rows: &[Vec<Value>],
7315) -> Result<(), EngineError> {
7316    for fk in fks {
7317        let parent_is_self = fk.parent_table == child_table;
7318        let parent = if parent_is_self {
7319            // Self-ref: read the current state of the same table.
7320            // The mut borrow on child has been dropped by the caller.
7321            catalog.get(child_table).ok_or_else(|| {
7322                EngineError::Storage(StorageError::TableNotFound {
7323                    name: child_table.into(),
7324                })
7325            })?
7326        } else {
7327            catalog.get(&fk.parent_table).ok_or_else(|| {
7328                EngineError::Storage(StorageError::TableNotFound {
7329                    name: fk.parent_table.clone(),
7330                })
7331            })?
7332        };
7333        for (batch_idx, row_values) in rows.iter().enumerate() {
7334            // Single-column FK fast path: try the parent's BTree
7335            // index for an O(log n) lookup. Composite FKs fall back
7336            // to a parent-row scan.
7337            if fk.local_columns.len() == 1 {
7338                let v = &row_values[fk.local_columns[0]];
7339                if matches!(v, Value::Null) {
7340                    continue;
7341                }
7342                let parent_col = fk.parent_columns[0];
7343                let key = spg_storage::IndexKey::from_value(v).ok_or_else(|| {
7344                    EngineError::Unsupported(alloc::format!(
7345                        "FOREIGN KEY column value of type {:?} is not index-eligible",
7346                        v.data_type()
7347                    ))
7348                })?;
7349                let present_committed = parent.indices().iter().any(|idx| {
7350                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7351                        && idx.column_position == parent_col
7352                        && idx.partial_predicate.is_none()
7353                        && !idx.lookup_eq(&key).is_empty()
7354                });
7355                // v7.6.7 self-ref widening: also accept a match
7356                // against earlier rows in this same batch when the
7357                // FK points at the table being inserted into.
7358                let present_in_batch = parent_is_self
7359                    && rows[..batch_idx].iter().any(|earlier| {
7360                        earlier.get(parent_col) == Some(v)
7361                    });
7362                if !(present_committed || present_in_batch) {
7363                    return Err(EngineError::Unsupported(alloc::format!(
7364                        "FOREIGN KEY violation: no parent row in {:?} where {} = {:?}",
7365                        fk.parent_table,
7366                        parent
7367                            .schema()
7368                            .columns
7369                            .get(parent_col)
7370                            .map_or("?", |c| c.name.as_str()),
7371                        v,
7372                    )));
7373                }
7374            } else {
7375                // Composite FK: scan parent rows. v7.6.7 also
7376                // accepts a match against earlier rows in the same
7377                // batch (self-ref bulk-loading of hierarchies).
7378                if fk.local_columns
7379                    .iter()
7380                    .all(|&i| matches!(row_values.get(i), Some(Value::Null)))
7381                {
7382                    continue;
7383                }
7384                let local: Vec<&Value> = fk.local_columns.iter().map(|&i| &row_values[i]).collect();
7385                let parent_match_committed = parent.rows().iter().any(|prow| {
7386                    fk.parent_columns
7387                        .iter()
7388                        .enumerate()
7389                        .all(|(i, &pi)| prow.values.get(pi) == Some(local[i]))
7390                });
7391                let parent_match_in_batch = parent_is_self
7392                    && rows[..batch_idx].iter().any(|earlier| {
7393                        fk.parent_columns
7394                            .iter()
7395                            .enumerate()
7396                            .all(|(i, &pi)| earlier.get(pi) == Some(local[i]))
7397                    });
7398                if !(parent_match_committed || parent_match_in_batch) {
7399                    return Err(EngineError::Unsupported(alloc::format!(
7400                        "FOREIGN KEY violation: no parent row in {:?} matching composite key",
7401                        fk.parent_table,
7402                    )));
7403                }
7404            }
7405        }
7406    }
7407    Ok(())
7408}
7409
7410/// v7.6.4 / v7.6.5 — one step of the FK action plan computed for a
7411/// DELETE on a parent. The plan is a list of these steps, stacked
7412/// across the FK graph by `plan_fk_parent_deletions`.
7413#[derive(Debug, Clone)]
7414struct FkChildStep {
7415    child_table: String,
7416    action: FkChildAction,
7417}
7418
7419#[derive(Debug, Clone)]
7420enum FkChildAction {
7421    /// CASCADE — remove these rows. Sorted, deduplicated positions.
7422    Delete { positions: Vec<usize> },
7423    /// SET NULL — for each (row, column) in the flat list, write
7424    /// NULL into that child cell. Multiple FKs on the same row may
7425    /// produce overlapping entries (deduped at plan time).
7426    SetNull {
7427        positions: Vec<usize>,
7428        columns: Vec<usize>,
7429    },
7430    /// SET DEFAULT — same shape as SetNull but writes the column's
7431    /// declared DEFAULT value (resolved at plan time). Columns
7432    /// without a DEFAULT raise an error during planning.
7433    SetDefault {
7434        positions: Vec<usize>,
7435        columns: Vec<usize>,
7436        defaults: Vec<Value>,
7437    },
7438}
7439
7440/// v7.6.3 → v7.6.5 — plan FK fallout for a DELETE on a parent table.
7441///
7442/// Walks every table in the catalog looking for FKs whose
7443/// `parent_table` is `parent_table_name`. For each such FK + each
7444/// to-be-deleted parent row:
7445///
7446///   - RESTRICT / NoAction → error, no plan returned
7447///   - CASCADE → child rows get scheduled for deletion; recursive
7448///   - SetNull → child FK column(s) scheduled to be NULL-ed.
7449///     Verified NULL-able at plan time.
7450///   - SetDefault → child FK column(s) scheduled to be reset to
7451///     their declared DEFAULT. Columns without a DEFAULT raise.
7452///
7453/// SET NULL / SET DEFAULT do NOT cascade further — the child row
7454/// stays; only one of its columns mutates.
7455fn plan_fk_parent_deletions(
7456    catalog: &Catalog,
7457    parent_table_name: &str,
7458    to_delete_positions: &[usize],
7459    to_delete_rows: &[Vec<Value>],
7460) -> Result<Vec<FkChildStep>, EngineError> {
7461    use alloc::collections::{BTreeMap, BTreeSet};
7462    if to_delete_rows.is_empty() {
7463        return Ok(Vec::new());
7464    }
7465    let mut delete_plan: BTreeMap<String, BTreeSet<usize>> = BTreeMap::new();
7466    // setnull / setdefault keyed by child_table → (row_idx, col_idx) → optional default
7467    let mut setnull_plan: BTreeMap<String, BTreeSet<(usize, usize)>> = BTreeMap::new();
7468    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
7469        BTreeMap::new();
7470    let mut visited: BTreeSet<(String, usize)> = BTreeSet::new();
7471    for &p in to_delete_positions {
7472        visited.insert((parent_table_name.to_string(), p));
7473    }
7474    let mut work: Vec<(String, Vec<Value>)> = to_delete_rows
7475        .iter()
7476        .map(|r| (parent_table_name.to_string(), r.clone()))
7477        .collect();
7478    while let Some((cur_parent, parent_row)) = work.pop() {
7479        for child_name in catalog.table_names() {
7480            let child = catalog
7481                .get(&child_name)
7482                .expect("table_names → catalog.get round-trip is total");
7483            for fk in &child.schema().foreign_keys {
7484                if fk.parent_table != cur_parent {
7485                    continue;
7486                }
7487                let parent_key: Vec<&Value> = fk
7488                    .parent_columns
7489                    .iter()
7490                    .map(|&pi| &parent_row[pi])
7491                    .collect();
7492                if parent_key.iter().any(|v| matches!(v, Value::Null)) {
7493                    continue;
7494                }
7495                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
7496                    if child_name == cur_parent
7497                        && visited.contains(&(child_name.clone(), child_row_idx))
7498                    {
7499                        continue;
7500                    }
7501                    let matches_key = fk
7502                        .local_columns
7503                        .iter()
7504                        .enumerate()
7505                        .all(|(i, &li)| child_row.values.get(li) == Some(parent_key[i]));
7506                    if !matches_key {
7507                        continue;
7508                    }
7509                    match fk.on_delete {
7510                        spg_storage::FkAction::Restrict
7511                        | spg_storage::FkAction::NoAction => {
7512                            return Err(EngineError::Unsupported(alloc::format!(
7513                                "FOREIGN KEY violation: DELETE on {cur_parent:?} is \
7514                                 restricted by FK from {child_name:?}.{:?}",
7515                                fk.local_columns,
7516                            )));
7517                        }
7518                        spg_storage::FkAction::Cascade => {
7519                            if visited.insert((child_name.clone(), child_row_idx)) {
7520                                delete_plan
7521                                    .entry(child_name.clone())
7522                                    .or_default()
7523                                    .insert(child_row_idx);
7524                                work.push((child_name.clone(), child_row.values.clone()));
7525                            }
7526                        }
7527                        spg_storage::FkAction::SetNull => {
7528                            // Verify every local FK column is NULL-able.
7529                            for &li in &fk.local_columns {
7530                                let col = child.schema().columns.get(li).ok_or_else(|| {
7531                                    EngineError::Unsupported(alloc::format!(
7532                                        "FK local column {li} missing in {child_name:?}"
7533                                    ))
7534                                })?;
7535                                if !col.nullable {
7536                                    return Err(EngineError::Unsupported(alloc::format!(
7537                                        "FOREIGN KEY ON DELETE SET NULL: column \
7538                                         {child_name:?}.{:?} is NOT NULL — cannot SET NULL",
7539                                        col.name,
7540                                    )));
7541                                }
7542                            }
7543                            let entry = setnull_plan.entry(child_name.clone()).or_default();
7544                            for &li in &fk.local_columns {
7545                                entry.insert((child_row_idx, li));
7546                            }
7547                        }
7548                        spg_storage::FkAction::SetDefault => {
7549                            // Resolve the DEFAULT for every local FK col.
7550                            let entry =
7551                                setdefault_plan.entry(child_name.clone()).or_default();
7552                            for &li in &fk.local_columns {
7553                                let col = child.schema().columns.get(li).ok_or_else(|| {
7554                                    EngineError::Unsupported(alloc::format!(
7555                                        "FK local column {li} missing in {child_name:?}"
7556                                    ))
7557                                })?;
7558                                let default = col.default.clone().ok_or_else(|| {
7559                                    EngineError::Unsupported(alloc::format!(
7560                                        "FOREIGN KEY ON DELETE SET DEFAULT: column \
7561                                         {child_name:?}.{:?} has no DEFAULT declared",
7562                                        col.name,
7563                                    ))
7564                                })?;
7565                                entry.insert((child_row_idx, li), default);
7566                            }
7567                        }
7568                    }
7569                }
7570            }
7571        }
7572    }
7573    // Flatten the three plans into the ordered `FkChildStep` list.
7574    // Deletes are applied last per child (after any null/default
7575    // re-writes on the same child) so a child row that's both
7576    // re-written and then cascade-deleted only ends up deleted —
7577    // but in v7.6.5 SetNull/Cascade never overlap on the same row
7578    // (a single FK chooses exactly one action), so the order is
7579    // mostly a precaution.
7580    let mut steps: Vec<FkChildStep> = Vec::new();
7581    for (child_table, entries) in setnull_plan {
7582        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
7583        steps.push(FkChildStep {
7584            child_table,
7585            action: FkChildAction::SetNull { positions, columns },
7586        });
7587    }
7588    for (child_table, entries) in setdefault_plan {
7589        let mut positions = Vec::with_capacity(entries.len());
7590        let mut columns = Vec::with_capacity(entries.len());
7591        let mut defaults = Vec::with_capacity(entries.len());
7592        for ((p, c), v) in entries {
7593            positions.push(p);
7594            columns.push(c);
7595            defaults.push(v);
7596        }
7597        steps.push(FkChildStep {
7598            child_table,
7599            action: FkChildAction::SetDefault {
7600                positions,
7601                columns,
7602                defaults,
7603            },
7604        });
7605    }
7606    for (child_table, positions) in delete_plan {
7607        steps.push(FkChildStep {
7608            child_table,
7609            action: FkChildAction::Delete {
7610                positions: positions.into_iter().collect(),
7611            },
7612        });
7613    }
7614    Ok(steps)
7615}
7616
7617/// v7.6.6 — plan FK fallout for an UPDATE that mutates parent-side
7618/// PK/UNIQUE columns. Walks every other table whose FK references
7619/// `parent_table_name`; for each FK whose parent_columns overlap a
7620/// mutated column, decides the action by `fk.on_update`.
7621///
7622///   - RESTRICT / NoAction → error if any child references the OLD
7623///     value
7624///   - CASCADE → child FK columns get rewritten to the NEW parent
7625///     value (a SetNull-style update step with the new value)
7626///   - SetNull → child FK columns set to NULL
7627///   - SetDefault → child FK columns set to declared default
7628///
7629/// `plan_with_old` is `(row_position, old_values, new_values)` so
7630/// the planner can detect "did this row's parent key actually
7631/// change?" — only rows where at least one referenced parent
7632/// column moved trigger inbound work.
7633fn plan_fk_parent_updates(
7634    catalog: &Catalog,
7635    parent_table_name: &str,
7636    plan_with_old: &[(usize, Vec<Value>, Vec<Value>)],
7637) -> Result<Vec<FkChildStep>, EngineError> {
7638    use alloc::collections::BTreeMap;
7639    if plan_with_old.is_empty() {
7640        return Ok(Vec::new());
7641    }
7642    // For each child table we may touch, build per-child step
7643    // lists. UPDATE never deletes children — `delete_plan` stays
7644    // empty here but is kept structurally aligned with
7645    // `plan_fk_parent_deletions` for future use.
7646    let delete_plan: BTreeMap<String, alloc::collections::BTreeSet<usize>> = BTreeMap::new();
7647    let mut setnull_plan: BTreeMap<
7648        String,
7649        alloc::collections::BTreeSet<(usize, usize)>,
7650    > = BTreeMap::new();
7651    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
7652        BTreeMap::new();
7653    // Cascade-update plan: child_table → row_idx → col_idx → new_value
7654    let mut cascade_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> = BTreeMap::new();
7655
7656    for child_name in catalog.table_names() {
7657        let child = catalog
7658            .get(&child_name)
7659            .expect("table_names → catalog.get total");
7660        for fk in &child.schema().foreign_keys {
7661            if fk.parent_table != parent_table_name {
7662                continue;
7663            }
7664            for (_pos, old_row, new_row) in plan_with_old {
7665                // Did any parent FK column change?
7666                let key_changed = fk
7667                    .parent_columns
7668                    .iter()
7669                    .any(|&pi| old_row.get(pi) != new_row.get(pi));
7670                if !key_changed {
7671                    continue;
7672                }
7673                // The OLD parent key — used to find referring children.
7674                let old_key: Vec<&Value> = fk
7675                    .parent_columns
7676                    .iter()
7677                    .map(|&pi| &old_row[pi])
7678                    .collect();
7679                if old_key.iter().any(|v| matches!(v, Value::Null)) {
7680                    // NULL parent has no children — skip.
7681                    continue;
7682                }
7683                let new_key: Vec<&Value> = fk
7684                    .parent_columns
7685                    .iter()
7686                    .map(|&pi| &new_row[pi])
7687                    .collect();
7688                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
7689                    // Self-ref same-row updates: a row updating its
7690                    // own PK doesn't restrict itself.
7691                    if child_name == parent_table_name
7692                        && plan_with_old
7693                            .iter()
7694                            .any(|(p, _, _)| *p == child_row_idx)
7695                    {
7696                        continue;
7697                    }
7698                    let matches_key = fk
7699                        .local_columns
7700                        .iter()
7701                        .enumerate()
7702                        .all(|(i, &li)| child_row.values.get(li) == Some(old_key[i]));
7703                    if !matches_key {
7704                        continue;
7705                    }
7706                    match fk.on_update {
7707                        spg_storage::FkAction::Restrict
7708                        | spg_storage::FkAction::NoAction => {
7709                            return Err(EngineError::Unsupported(alloc::format!(
7710                                "FOREIGN KEY violation: UPDATE on {parent_table_name:?} PK is \
7711                                 restricted by FK from {child_name:?}.{:?}",
7712                                fk.local_columns,
7713                            )));
7714                        }
7715                        spg_storage::FkAction::Cascade => {
7716                            // Rewrite child FK columns to new key.
7717                            let entry = cascade_plan.entry(child_name.clone()).or_default();
7718                            for (i, &li) in fk.local_columns.iter().enumerate() {
7719                                entry.insert((child_row_idx, li), new_key[i].clone());
7720                            }
7721                        }
7722                        spg_storage::FkAction::SetNull => {
7723                            for &li in &fk.local_columns {
7724                                let col = child.schema().columns.get(li).ok_or_else(|| {
7725                                    EngineError::Unsupported(alloc::format!(
7726                                        "FK local column {li} missing in {child_name:?}"
7727                                    ))
7728                                })?;
7729                                if !col.nullable {
7730                                    return Err(EngineError::Unsupported(alloc::format!(
7731                                        "FOREIGN KEY ON UPDATE SET NULL: column \
7732                                         {child_name:?}.{:?} is NOT NULL",
7733                                        col.name,
7734                                    )));
7735                                }
7736                            }
7737                            let entry = setnull_plan.entry(child_name.clone()).or_default();
7738                            for &li in &fk.local_columns {
7739                                entry.insert((child_row_idx, li));
7740                            }
7741                        }
7742                        spg_storage::FkAction::SetDefault => {
7743                            let entry =
7744                                setdefault_plan.entry(child_name.clone()).or_default();
7745                            for &li in &fk.local_columns {
7746                                let col = child.schema().columns.get(li).ok_or_else(|| {
7747                                    EngineError::Unsupported(alloc::format!(
7748                                        "FK local column {li} missing in {child_name:?}"
7749                                    ))
7750                                })?;
7751                                let default = col.default.clone().ok_or_else(|| {
7752                                    EngineError::Unsupported(alloc::format!(
7753                                        "FOREIGN KEY ON UPDATE SET DEFAULT: column \
7754                                         {child_name:?}.{:?} has no DEFAULT",
7755                                        col.name,
7756                                    ))
7757                                })?;
7758                                entry.insert((child_row_idx, li), default);
7759                            }
7760                        }
7761                    }
7762                }
7763            }
7764        }
7765    }
7766    // Flatten into FkChildStep list. UPDATE doesn't produce
7767    // DeleteSteps (CASCADE on UPDATE just rewrites FK values).
7768    let mut steps: Vec<FkChildStep> = Vec::new();
7769    for (child_table, entries) in cascade_plan {
7770        let mut positions = Vec::with_capacity(entries.len());
7771        let mut columns = Vec::with_capacity(entries.len());
7772        let mut defaults = Vec::with_capacity(entries.len());
7773        for ((p, c), v) in entries {
7774            positions.push(p);
7775            columns.push(c);
7776            defaults.push(v);
7777        }
7778        // We reuse `FkChildAction::SetDefault` for cascade-update:
7779        // both shapes are "write a known value into specific cells"
7780        // — `apply_per_cell_writes` doesn't care whether the value
7781        // came from a DEFAULT declaration or a new parent key.
7782        steps.push(FkChildStep {
7783            child_table,
7784            action: FkChildAction::SetDefault {
7785                positions,
7786                columns,
7787                defaults,
7788            },
7789        });
7790    }
7791    for (child_table, entries) in setnull_plan {
7792        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
7793        steps.push(FkChildStep {
7794            child_table,
7795            action: FkChildAction::SetNull { positions, columns },
7796        });
7797    }
7798    for (child_table, entries) in setdefault_plan {
7799        let mut positions = Vec::with_capacity(entries.len());
7800        let mut columns = Vec::with_capacity(entries.len());
7801        let mut defaults = Vec::with_capacity(entries.len());
7802        for ((p, c), v) in entries {
7803            positions.push(p);
7804            columns.push(c);
7805            defaults.push(v);
7806        }
7807        steps.push(FkChildStep {
7808            child_table,
7809            action: FkChildAction::SetDefault {
7810                positions,
7811                columns,
7812                defaults,
7813            },
7814        });
7815    }
7816    let _ = delete_plan; // UPDATE never deletes children.
7817    Ok(steps)
7818}
7819
7820/// v7.6.5 — apply one FK child step to the catalog. Encapsulates
7821/// the three action variants so the DELETE executor stays a
7822/// simple loop over the planned steps.
7823fn apply_fk_child_step(
7824    catalog: &mut Catalog,
7825    step: &FkChildStep,
7826) -> Result<(), EngineError> {
7827    let child = catalog.get_mut(&step.child_table).ok_or_else(|| {
7828        EngineError::Storage(StorageError::TableNotFound {
7829            name: step.child_table.clone(),
7830        })
7831    })?;
7832    match &step.action {
7833        FkChildAction::Delete { positions } => {
7834            let _ = child.delete_rows(positions);
7835        }
7836        FkChildAction::SetNull { positions, columns } => {
7837            apply_per_cell_writes(child, positions, columns, |_| Value::Null)?;
7838        }
7839        FkChildAction::SetDefault {
7840            positions,
7841            columns,
7842            defaults,
7843        } => {
7844            apply_per_cell_writes(child, positions, columns, |i| defaults[i].clone())?;
7845        }
7846    }
7847    Ok(())
7848}
7849
7850/// v7.6.5 — write new values into selected child cells via
7851/// `Table::update_row` (the catalog's existing UPDATE entry).
7852/// Groups writes by row position so multi-column updates on the
7853/// same row only call `update_row` once. `value_for(i)` produces
7854/// the new value for the i-th (position, column) entry.
7855fn apply_per_cell_writes(
7856    child: &mut spg_storage::Table,
7857    positions: &[usize],
7858    columns: &[usize],
7859    mut value_for: impl FnMut(usize) -> Value,
7860) -> Result<(), EngineError> {
7861    use alloc::collections::BTreeMap;
7862    let mut by_row: BTreeMap<usize, Vec<(usize, Value)>> = BTreeMap::new();
7863    for i in 0..positions.len() {
7864        by_row
7865            .entry(positions[i])
7866            .or_default()
7867            .push((columns[i], value_for(i)));
7868    }
7869    for (pos, mutations) in by_row {
7870        let mut new_values = child.rows()[pos].values.clone();
7871        for (col, v) in mutations {
7872            if let Some(slot) = new_values.get_mut(col) {
7873                *slot = v;
7874            }
7875        }
7876        child
7877            .update_row(pos, new_values)
7878            .map_err(EngineError::Storage)?;
7879    }
7880    Ok(())
7881}
7882
7883fn fk_action_sql_to_storage(a: spg_sql::ast::FkAction) -> spg_storage::FkAction {
7884    match a {
7885        spg_sql::ast::FkAction::Restrict => spg_storage::FkAction::Restrict,
7886        spg_sql::ast::FkAction::Cascade => spg_storage::FkAction::Cascade,
7887        spg_sql::ast::FkAction::SetNull => spg_storage::FkAction::SetNull,
7888        spg_sql::ast::FkAction::SetDefault => spg_storage::FkAction::SetDefault,
7889        spg_sql::ast::FkAction::NoAction => spg_storage::FkAction::NoAction,
7890    }
7891}
7892
7893fn column_def_to_schema(c: ColumnDef) -> Result<ColumnSchema, EngineError> {
7894    let ty = column_type_to_data_type(c.ty);
7895    let mut schema = ColumnSchema::new(c.name.clone(), ty, c.nullable);
7896    if let Some(default_expr) = c.default {
7897        // DEFAULT must be a literal expression — evaluated at CREATE TABLE
7898        // time against an empty row context. Any column ref / aggregate
7899        // surfaces as the corresponding eval error.
7900        let raw = literal_expr_to_value(default_expr)?;
7901        let coerced = coerce_value(raw, ty, &c.name, 0)?;
7902        schema = schema.with_default(coerced);
7903    }
7904    if c.auto_increment {
7905        // AUTO_INCREMENT only makes sense on integer-shaped columns.
7906        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
7907            return Err(EngineError::Unsupported(alloc::format!(
7908                "AUTO_INCREMENT requires an integer column type, got {ty:?}"
7909            )));
7910        }
7911        schema = schema.with_auto_increment();
7912    }
7913    Ok(schema)
7914}
7915
7916const fn column_type_to_data_type(t: ColumnTypeName) -> DataType {
7917    match t {
7918        ColumnTypeName::SmallInt => DataType::SmallInt,
7919        ColumnTypeName::Int => DataType::Int,
7920        ColumnTypeName::BigInt => DataType::BigInt,
7921        ColumnTypeName::Float => DataType::Float,
7922        ColumnTypeName::Text => DataType::Text,
7923        ColumnTypeName::Varchar(n) => DataType::Varchar(n),
7924        ColumnTypeName::Char(n) => DataType::Char(n),
7925        ColumnTypeName::Bool => DataType::Bool,
7926        ColumnTypeName::Vector { dim, encoding } => DataType::Vector {
7927            dim,
7928            encoding: match encoding {
7929                SqlVecEncoding::F32 => VecEncoding::F32,
7930                SqlVecEncoding::Sq8 => VecEncoding::Sq8,
7931                SqlVecEncoding::F16 => VecEncoding::F16,
7932            },
7933        },
7934        ColumnTypeName::Numeric(precision, scale) => DataType::Numeric { precision, scale },
7935        ColumnTypeName::Date => DataType::Date,
7936        ColumnTypeName::Timestamp => DataType::Timestamp,
7937        ColumnTypeName::Json => DataType::Json,
7938    }
7939}
7940
7941/// Convert an INSERT VALUES expression to a storage Value. Supports literal
7942/// expressions, unary-minus over numeric literals, and pgvector-style
7943/// `'[..]'::vector` cast (v1.2). Anything more complex returns `Unsupported`.
7944fn literal_expr_to_value(expr: Expr) -> Result<Value, EngineError> {
7945    match expr {
7946        Expr::Literal(l) => Ok(literal_to_value(l)),
7947        Expr::Cast { expr, target } => {
7948            let inner_value = literal_expr_to_value(*expr)?;
7949            crate::eval::cast_value(inner_value, target).map_err(EngineError::Eval)
7950        }
7951        Expr::Unary {
7952            op: UnOp::Neg,
7953            expr,
7954        } => match *expr {
7955            Expr::Literal(Literal::Integer(n)) => {
7956                // Fold to i32 if it fits, else BigInt. Parser emits Integer(i64)
7957                // — overflow on negate of i64::MIN is the one edge case.
7958                let neg = n.checked_neg().ok_or_else(|| {
7959                    EngineError::Unsupported("integer literal overflow on negation".into())
7960                })?;
7961                Ok(int_value_for(neg))
7962            }
7963            Expr::Literal(Literal::Float(x)) => Ok(Value::Float(-x)),
7964            other => Err(EngineError::Unsupported(alloc::format!(
7965                "unary minus over non-literal expression: {other:?}"
7966            ))),
7967        },
7968        other => Err(EngineError::Unsupported(alloc::format!(
7969            "non-literal INSERT value expression: {other:?}"
7970        ))),
7971    }
7972}
7973
7974fn literal_to_value(l: Literal) -> Value {
7975    match l {
7976        Literal::Integer(n) => int_value_for(n),
7977        Literal::Float(x) => Value::Float(x),
7978        Literal::String(s) => Value::Text(s),
7979        Literal::Bool(b) => Value::Bool(b),
7980        Literal::Null => Value::Null,
7981        Literal::Vector(v) => Value::Vector(v),
7982        Literal::Interval { months, micros, .. } => Value::Interval { months, micros },
7983    }
7984}
7985
7986/// Pick `Int` (`i32`) when the literal fits, else `BigInt`. `INT` vs `BIGINT`
7987/// columns will still enforce the right tag downstream — this is just the
7988/// default we synthesise from an unannotated integer literal.
7989fn int_value_for(n: i64) -> Value {
7990    if let Ok(small) = i32::try_from(n) {
7991        Value::Int(small)
7992    } else {
7993        Value::BigInt(n)
7994    }
7995}
7996
7997/// Widen / narrow `v` to fit `expected`. Numerics permit safe widening
7998/// (`Int → BigInt`, `Int/BigInt → Float`) and best-effort narrowing
7999/// (`BigInt → Int` succeeds only when the value fits in `i32`). Everything
8000/// else returns `TypeMismatch` carrying the column name for caller diagnostics.
8001/// `NULL` is always permitted; the nullability check happens later in storage.
8002#[allow(clippy::too_many_lines)]
8003fn coerce_value(
8004    v: Value,
8005    expected: DataType,
8006    col_name: &str,
8007    position: usize,
8008) -> Result<Value, EngineError> {
8009    if v.is_null() {
8010        return Ok(Value::Null);
8011    }
8012    let actual = v.data_type().expect("non-null");
8013    if actual == expected {
8014        return Ok(v);
8015    }
8016    let coerced =
8017        match (v, expected) {
8018            (Value::Int(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
8019            (Value::Int(n), DataType::Float) => Some(Value::Float(f64::from(n))),
8020            (Value::Int(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
8021            (Value::Int(n), DataType::Numeric { precision, scale }) => Some(numeric_from_integer(
8022                i128::from(n),
8023                precision,
8024                scale,
8025                col_name,
8026            )?),
8027            (Value::SmallInt(n), DataType::Int) => Some(Value::Int(i32::from(n))),
8028            (Value::SmallInt(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
8029            (Value::SmallInt(n), DataType::Float) => Some(Value::Float(f64::from(n))),
8030            (Value::SmallInt(n), DataType::Numeric { precision, scale }) => Some(
8031                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
8032            ),
8033            (Value::BigInt(n), DataType::Int) => i32::try_from(n).ok().map(Value::Int),
8034            (Value::BigInt(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
8035            #[allow(clippy::cast_precision_loss)]
8036            (Value::BigInt(n), DataType::Float) => Some(Value::Float(n as f64)),
8037            (Value::BigInt(n), DataType::Numeric { precision, scale }) => Some(
8038                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
8039            ),
8040            (Value::Float(x), DataType::Numeric { precision, scale }) => {
8041                Some(numeric_from_float(x, precision, scale, col_name)?)
8042            }
8043            // Text → DATE / TIMESTAMP: parse canonical text forms.
8044            (Value::Text(s), DataType::Date) => {
8045                let d = eval::parse_date_literal(&s).ok_or_else(|| {
8046                    EngineError::Eval(EvalError::TypeMismatch {
8047                        detail: alloc::format!(
8048                            "cannot parse {s:?} as DATE for column `{col_name}`"
8049                        ),
8050                    })
8051                })?;
8052                Some(Value::Date(d))
8053            }
8054            // v4.9: Text ↔ JSON coercion. No structural validation —
8055            // any text literal is accepted; the responsibility for
8056            // valid JSON lies with the producer.
8057            (Value::Text(s), DataType::Json) => Some(Value::Json(s)),
8058            (Value::Json(s), DataType::Text) => Some(Value::Text(s)),
8059            (Value::Text(s), DataType::Timestamp) => {
8060                let t = eval::parse_timestamp_literal(&s).ok_or_else(|| {
8061                    EngineError::Eval(EvalError::TypeMismatch {
8062                        detail: alloc::format!(
8063                            "cannot parse {s:?} as TIMESTAMP for column `{col_name}`"
8064                        ),
8065                    })
8066                })?;
8067                Some(Value::Timestamp(t))
8068            }
8069            // DATE ↔ TIMESTAMP convertibility (DATE → midnight,
8070            // TIMESTAMP → day truncation).
8071            (Value::Date(d), DataType::Timestamp) => {
8072                Some(Value::Timestamp(i64::from(d) * 86_400_000_000))
8073            }
8074            (Value::Timestamp(t), DataType::Date) => {
8075                let days = t.div_euclid(86_400_000_000);
8076                i32::try_from(days).ok().map(Value::Date)
8077            }
8078            (
8079                Value::Numeric {
8080                    scaled,
8081                    scale: src_scale,
8082                },
8083                DataType::Numeric { precision, scale },
8084            ) => Some(numeric_rescale(
8085                scaled, src_scale, precision, scale, col_name,
8086            )?),
8087            #[allow(clippy::cast_precision_loss)]
8088            (Value::Numeric { scaled, scale }, DataType::Float) => {
8089                let mut div = 1.0_f64;
8090                for _ in 0..scale {
8091                    div *= 10.0;
8092                }
8093                Some(Value::Float((scaled as f64) / div))
8094            }
8095            (Value::Numeric { scaled, scale }, DataType::Int) => {
8096                let truncated = numeric_truncate_to_integer(scaled, scale);
8097                i32::try_from(truncated).ok().map(Value::Int)
8098            }
8099            (Value::Numeric { scaled, scale }, DataType::BigInt) => {
8100                let truncated = numeric_truncate_to_integer(scaled, scale);
8101                i64::try_from(truncated).ok().map(Value::BigInt)
8102            }
8103            (Value::Numeric { scaled, scale }, DataType::SmallInt) => {
8104                let truncated = numeric_truncate_to_integer(scaled, scale);
8105                i16::try_from(truncated).ok().map(Value::SmallInt)
8106            }
8107            // VARCHAR(n) enforces an upper bound on character count.
8108            (Value::Text(s), DataType::Varchar(max)) => {
8109                if u32::try_from(s.chars().count()).unwrap_or(u32::MAX) <= max {
8110                    Some(Value::Text(s))
8111                } else {
8112                    return Err(EngineError::Unsupported(alloc::format!(
8113                        "value for VARCHAR({max}) column `{col_name}` exceeds length: \
8114                     {} chars",
8115                        s.chars().count()
8116                    )));
8117                }
8118            }
8119            // v6.0.1: f32 → SQ8 INSERT-time quantisation. Triggered
8120            // when the column declares `VECTOR(N) USING SQ8` and
8121            // the INSERT VALUES expression yields a raw f32 vector
8122            // (the normal pgvector-shape literal). Dim mismatch
8123            // falls through the `_ => None` arm and surfaces as
8124            // `TypeMismatch` with the expected SQ8 column type —
8125            // matching the F32 path's existing error.
8126            (
8127                Value::Vector(v),
8128                DataType::Vector {
8129                    dim,
8130                    encoding: VecEncoding::Sq8,
8131                },
8132            ) if v.len() == dim as usize => {
8133                Some(Value::Sq8Vector(spg_storage::quantize::quantize(&v)))
8134            }
8135            // v6.0.3: f32 → f16 INSERT-time conversion for HALF
8136            // columns. Bit-exact at the storage layer (modulo
8137            // half-precision rounding); no rerank pass needed at
8138            // search time.
8139            (
8140                Value::Vector(v),
8141                DataType::Vector {
8142                    dim,
8143                    encoding: VecEncoding::F16,
8144                },
8145            ) if v.len() == dim as usize => Some(Value::HalfVector(
8146                spg_storage::halfvec::HalfVector::from_f32_slice(&v),
8147            )),
8148            // CHAR(n) right-pads with U+0020 to exactly n chars; if the input
8149            // is already longer we reject (PG truncates trailing-space-only;
8150            // staying strict for v1).
8151            (Value::Text(s), DataType::Char(size)) => {
8152                let len = u32::try_from(s.chars().count()).unwrap_or(u32::MAX);
8153                if len > size {
8154                    return Err(EngineError::Unsupported(alloc::format!(
8155                        "value for CHAR({size}) column `{col_name}` exceeds length: \
8156                     {len} chars"
8157                    )));
8158                }
8159                let need = (size - len) as usize;
8160                let mut padded = s;
8161                padded.reserve(need);
8162                for _ in 0..need {
8163                    padded.push(' ');
8164                }
8165                Some(Value::Text(padded))
8166            }
8167            _ => None,
8168        };
8169    coerced.ok_or(EngineError::Storage(StorageError::TypeMismatch {
8170        column: col_name.into(),
8171        expected,
8172        actual,
8173        position,
8174    }))
8175}
8176
8177#[cfg(test)]
8178mod tests {
8179    use super::*;
8180    use alloc::vec;
8181
8182    fn unwrap_command_ok(r: &QueryResult) -> usize {
8183        match r {
8184            QueryResult::CommandOk { affected, .. } => *affected,
8185            QueryResult::Rows { .. } => panic!("expected CommandOk, got Rows"),
8186        }
8187    }
8188
8189    #[test]
8190    fn create_table_registers_schema() {
8191        let mut e = Engine::new();
8192        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT)")
8193            .unwrap();
8194        assert_eq!(e.catalog().table_count(), 1);
8195        let t = e.catalog().get("foo").unwrap();
8196        assert_eq!(t.schema().columns.len(), 2);
8197        assert_eq!(t.schema().columns[0].ty, DataType::Int);
8198        assert!(!t.schema().columns[0].nullable);
8199        assert_eq!(t.schema().columns[1].ty, DataType::Text);
8200    }
8201
8202    #[test]
8203    fn create_table_vector_default_is_f32_encoded() {
8204        let mut e = Engine::new();
8205        e.execute("CREATE TABLE t (v VECTOR(8))").unwrap();
8206        let t = e.catalog().get("t").unwrap();
8207        assert_eq!(
8208            t.schema().columns[0].ty,
8209            DataType::Vector {
8210                dim: 8,
8211                encoding: VecEncoding::F32,
8212            },
8213        );
8214    }
8215
8216    #[test]
8217    fn create_table_vector_using_sq8_succeeds() {
8218        // v6.0.1 step 3: the step-1 fence in `column_def_to_schema`
8219        // is lifted. CREATE TABLE persists an SQ8 column type in
8220        // the catalog; INSERT (next test) quantises raw f32 input.
8221        let mut e = Engine::new();
8222        e.execute("CREATE TABLE t (v VECTOR(8) USING SQ8)").unwrap();
8223        let t = e.catalog().get("t").unwrap();
8224        assert_eq!(
8225            t.schema().columns[0].ty,
8226            DataType::Vector {
8227                dim: 8,
8228                encoding: VecEncoding::Sq8,
8229            },
8230        );
8231    }
8232
8233    #[test]
8234    fn insert_into_sq8_column_quantises_f32_payload() {
8235        // v6.0.1 step 3: INSERT-time `coerce_value` rewrites a raw
8236        // `Value::Vector(Vec<f32>)` literal into the column's
8237        // quantised representation. The row that lands in the
8238        // catalog must therefore hold a `Value::Sq8Vector`, not the
8239        // original f32 buffer — that's the bit that delivers the
8240        // 4× compression target.
8241        let mut e = Engine::new();
8242        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
8243        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
8244            .unwrap();
8245        let t = e.catalog().get("t").unwrap();
8246        assert_eq!(t.rows().len(), 1);
8247        match &t.rows()[0].values[0] {
8248            Value::Sq8Vector(q) => {
8249                assert_eq!(q.bytes.len(), 4);
8250                // min/max are derived from the payload: min=0.0, max=1.0.
8251                assert!((q.min - 0.0).abs() < 1e-6);
8252                assert!((q.max - 1.0).abs() < 1e-6);
8253            }
8254            other => panic!("expected Sq8Vector cell, got {other:?}"),
8255        }
8256    }
8257
8258    #[test]
8259    fn create_table_vector_using_half_succeeds_and_insert_converts_to_f16() {
8260        // v6.0.3: CREATE TABLE accepts USING HALF; INSERT path
8261        // converts the incoming `Value::Vector(Vec<f32>)` cell
8262        // into `Value::HalfVector(HalfVector)` via the new
8263        // `coerce_value` arm. The dequantised round-trip is
8264        // bit-exact for f16-representable values, so 0.0 / 0.25
8265        // / 0.5 / 1.0 hit their grid points exactly.
8266        let mut e = Engine::new();
8267        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
8268            .unwrap();
8269        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
8270            .unwrap();
8271        let t = e.catalog().get("t").unwrap();
8272        assert_eq!(t.rows().len(), 1);
8273        match &t.rows()[0].values[0] {
8274            Value::HalfVector(h) => {
8275                assert_eq!(h.dim(), 4);
8276                let back = h.to_f32_vec();
8277                let expected = alloc::vec![0.0_f32, 0.25, 0.5, 1.0];
8278                for (g, e) in back.iter().zip(expected.iter()) {
8279                    assert!(
8280                        (g - e).abs() < 1e-6,
8281                        "{g} vs {e} should be exact on f16 grid"
8282                    );
8283                }
8284            }
8285            other => panic!("expected HalfVector cell, got {other:?}"),
8286        }
8287    }
8288
8289    #[test]
8290    fn alter_index_rebuild_in_place_succeeds() {
8291        // v6.0.4: bare REBUILD (no encoding switch) walks every
8292        // row again to rebuild the NSW graph. Verifies the engine
8293        // dispatch + storage helper plumbing without changing any
8294        // cell encoding.
8295        let mut e = Engine::new();
8296        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
8297            .unwrap();
8298        for i in 0..8_i32 {
8299            #[allow(clippy::cast_precision_loss)]
8300            let base = (i as f32) * 0.1;
8301            e.execute(&alloc::format!(
8302                "INSERT INTO t VALUES ({i}, [{base}, {b1}, {b2}])",
8303                b1 = base + 0.01,
8304                b2 = base + 0.02,
8305            ))
8306            .unwrap();
8307        }
8308        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
8309        e.execute("ALTER INDEX t_idx REBUILD").unwrap();
8310        // Schema encoding stays F32 (no encoding clause).
8311        assert_eq!(
8312            e.catalog().get("t").unwrap().schema().columns[1].ty,
8313            DataType::Vector {
8314                dim: 3,
8315                encoding: VecEncoding::F32,
8316            },
8317        );
8318    }
8319
8320    #[test]
8321    fn alter_index_rebuild_with_encoding_switches_cell_type() {
8322        // v6.0.4: REBUILD WITH (encoding = SQ8) recodes every
8323        // stored cell from F32 → SQ8 + rebuilds the graph atop the
8324        // new encoding. Post-rebuild, cells must be Sq8Vector and
8325        // the schema must report encoding = Sq8.
8326        let mut e = Engine::new();
8327        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(4) NOT NULL)")
8328            .unwrap();
8329        e.execute("INSERT INTO t VALUES (1, [0.0, 0.25, 0.5, 1.0])")
8330            .unwrap();
8331        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
8332        e.execute("ALTER INDEX t_idx REBUILD WITH (encoding = SQ8)")
8333            .unwrap();
8334        let t = e.catalog().get("t").unwrap();
8335        assert_eq!(
8336            t.schema().columns[1].ty,
8337            DataType::Vector {
8338                dim: 4,
8339                encoding: VecEncoding::Sq8,
8340            },
8341        );
8342        assert!(matches!(t.rows()[0].values[1], Value::Sq8Vector(_)));
8343    }
8344
8345    #[test]
8346    fn alter_index_rebuild_unknown_index_errors() {
8347        let mut e = Engine::new();
8348        let err = e.execute("ALTER INDEX nope REBUILD").unwrap_err();
8349        assert!(
8350            matches!(
8351                &err,
8352                EngineError::Storage(StorageError::IndexNotFound { name }) if name == "nope"
8353            ),
8354            "got: {err}"
8355        );
8356    }
8357
8358    #[test]
8359    fn alter_index_rebuild_on_btree_index_errors() {
8360        // REBUILD on a B-tree index has no semantic meaning in
8361        // v6.0.4 — rejected at the storage layer with `Unsupported`.
8362        let mut e = Engine::new();
8363        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
8364        e.execute("INSERT INTO t VALUES (1)").unwrap();
8365        e.execute("CREATE INDEX t_idx ON t (id)").unwrap();
8366        let err = e.execute("ALTER INDEX t_idx REBUILD").unwrap_err();
8367        assert!(
8368            matches!(&err, EngineError::Storage(StorageError::Unsupported(_))),
8369            "got: {err}"
8370        );
8371    }
8372
8373    #[test]
8374    fn prepared_insert_substitutes_placeholders() {
8375        // v6.1.1: prepare() parses once; execute_prepared() walks the
8376        // AST and replaces $1/$2 with the param Values BEFORE the
8377        // dispatch sees them. Same logical result as a simple-query
8378        // INSERT, but parse happens once per *statement*, not per
8379        // execution.
8380        let mut e = Engine::new();
8381        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT NOT NULL)")
8382            .unwrap();
8383        let stmt = e.prepare("INSERT INTO t VALUES ($1, $2)").unwrap();
8384        for (id, name) in [(1, "alice"), (2, "bob"), (3, "carol")] {
8385            e.execute_prepared(
8386                stmt.clone(),
8387                &[Value::Int(id), Value::Text(name.into())],
8388            )
8389            .unwrap();
8390        }
8391        // Read back via simple-query SELECT.
8392        let rows_result = e.execute("SELECT id, name FROM t").unwrap();
8393        let QueryResult::Rows { rows, .. } = rows_result else {
8394            panic!("expected Rows")
8395        };
8396        assert_eq!(rows.len(), 3);
8397    }
8398
8399    #[test]
8400    fn prepared_select_with_placeholder_filters_rows() {
8401        let mut e = Engine::new();
8402        e.execute("CREATE TABLE t (id INT NOT NULL, v INT NOT NULL)")
8403            .unwrap();
8404        for i in 0..10_i32 {
8405            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, {})", i * 7))
8406                .unwrap();
8407        }
8408        let stmt = e
8409            .prepare("SELECT id FROM t WHERE v = $1")
8410            .unwrap();
8411        let QueryResult::Rows { rows, .. } = e
8412            .execute_prepared(stmt, &[Value::Int(35)])
8413            .unwrap()
8414        else {
8415            panic!("expected Rows")
8416        };
8417        // v = 35 means i*7 = 35 → i = 5.
8418        assert_eq!(rows.len(), 1);
8419        assert_eq!(rows[0].values[0], Value::Int(5));
8420    }
8421
8422    #[test]
8423    fn prepared_too_few_params_errors() {
8424        let mut e = Engine::new();
8425        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
8426        let stmt = e.prepare("INSERT INTO t VALUES ($1)").unwrap();
8427        let err = e.execute_prepared(stmt, &[]).unwrap_err();
8428        assert!(
8429            matches!(
8430                &err,
8431                EngineError::Eval(EvalError::PlaceholderOutOfRange { n: 1, bound: 0 })
8432            ),
8433            "got: {err}"
8434        );
8435    }
8436
8437    #[test]
8438    fn insert_into_half_column_dim_mismatch_errors() {
8439        let mut e = Engine::new();
8440        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
8441            .unwrap();
8442        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
8443        assert!(matches!(
8444            &err,
8445            EngineError::Storage(StorageError::TypeMismatch { .. })
8446        ));
8447    }
8448
8449    #[test]
8450    fn insert_into_sq8_column_dim_mismatch_errors() {
8451        // Dim mismatch falls through the `coerce_value` Vector→Sq8
8452        // arm's guard and surfaces as `TypeMismatch` — the same
8453        // error the F32 path produces today, so client error
8454        // handling stays uniform across encodings.
8455        let mut e = Engine::new();
8456        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
8457        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
8458        assert!(
8459            matches!(
8460                &err,
8461                EngineError::Storage(StorageError::TypeMismatch { .. })
8462            ),
8463            "got: {err}",
8464        );
8465    }
8466
8467    #[test]
8468    fn create_table_duplicate_errors() {
8469        let mut e = Engine::new();
8470        e.execute("CREATE TABLE foo (a INT)").unwrap();
8471        let err = e.execute("CREATE TABLE foo (a INT)").unwrap_err();
8472        assert!(matches!(
8473            err,
8474            EngineError::Storage(StorageError::DuplicateTable { ref name }) if name == "foo"
8475        ));
8476    }
8477
8478    #[test]
8479    fn insert_into_unknown_table_errors() {
8480        let mut e = Engine::new();
8481        let err = e.execute("INSERT INTO ghost VALUES (1)").unwrap_err();
8482        assert!(matches!(
8483            err,
8484            EngineError::Storage(StorageError::TableNotFound { ref name }) if name == "ghost"
8485        ));
8486    }
8487
8488    #[test]
8489    fn insert_happy_path_reports_one_affected() {
8490        let mut e = Engine::new();
8491        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
8492        let r = e.execute("INSERT INTO foo VALUES (42)").unwrap();
8493        assert_eq!(unwrap_command_ok(&r), 1);
8494        assert_eq!(e.catalog().get("foo").unwrap().row_count(), 1);
8495    }
8496
8497    #[test]
8498    fn insert_arity_mismatch_propagates() {
8499        let mut e = Engine::new();
8500        e.execute("CREATE TABLE foo (a INT, b TEXT)").unwrap();
8501        let err = e.execute("INSERT INTO foo VALUES (1)").unwrap_err();
8502        assert!(matches!(
8503            err,
8504            EngineError::Storage(StorageError::ArityMismatch { .. })
8505        ));
8506    }
8507
8508    #[test]
8509    fn insert_negative_integer_via_unary_minus() {
8510        let mut e = Engine::new();
8511        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
8512        e.execute("INSERT INTO foo VALUES (-7)").unwrap();
8513        let rows = e.catalog().get("foo").unwrap().rows();
8514        assert_eq!(rows[0].values[0], Value::Int(-7));
8515    }
8516
8517    #[test]
8518    fn insert_non_literal_expr_unsupported() {
8519        let mut e = Engine::new();
8520        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
8521        let err = e.execute("INSERT INTO foo VALUES (1 + 2)").unwrap_err();
8522        assert!(matches!(err, EngineError::Unsupported(_)));
8523    }
8524
8525    #[test]
8526    fn select_star_returns_all_rows_in_insertion_order() {
8527        let mut e = Engine::new();
8528        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT NOT NULL)")
8529            .unwrap();
8530        e.execute("INSERT INTO foo VALUES (1, 'one')").unwrap();
8531        e.execute("INSERT INTO foo VALUES (2, 'two')").unwrap();
8532        e.execute("INSERT INTO foo VALUES (3, 'three')").unwrap();
8533
8534        let r = e.execute("SELECT * FROM foo").unwrap();
8535        let QueryResult::Rows { columns, rows } = r else {
8536            panic!("expected Rows")
8537        };
8538        assert_eq!(columns.len(), 2);
8539        assert_eq!(columns[0].name, "a");
8540        assert_eq!(rows.len(), 3);
8541        assert_eq!(
8542            rows[1].values,
8543            vec![Value::Int(2), Value::Text("two".into())]
8544        );
8545    }
8546
8547    #[test]
8548    fn select_star_on_empty_table_returns_zero_rows() {
8549        let mut e = Engine::new();
8550        e.execute("CREATE TABLE foo (a INT)").unwrap();
8551        let r = e.execute("SELECT * FROM foo").unwrap();
8552        match r {
8553            QueryResult::Rows { rows, .. } => assert!(rows.is_empty()),
8554            QueryResult::CommandOk { .. } => panic!("expected Rows"),
8555        }
8556    }
8557
8558    // --- v0.4: WHERE + projection ------------------------------------------
8559
8560    fn make_three_row_users(e: &mut Engine) {
8561        e.execute("CREATE TABLE users (id INT NOT NULL, name TEXT NOT NULL, score INT)")
8562            .unwrap();
8563        e.execute("INSERT INTO users VALUES (1, 'alice', 90)")
8564            .unwrap();
8565        e.execute("INSERT INTO users VALUES (2, 'bob', NULL)")
8566            .unwrap();
8567        e.execute("INSERT INTO users VALUES (3, 'cara', 70)")
8568            .unwrap();
8569    }
8570
8571    fn unwrap_rows(r: QueryResult) -> (Vec<ColumnSchema>, Vec<Row>) {
8572        match r {
8573            QueryResult::Rows { columns, rows } => (columns, rows),
8574            QueryResult::CommandOk { .. } => panic!("expected Rows"),
8575        }
8576    }
8577
8578    #[test]
8579    fn where_filter_passes_only_true_rows() {
8580        let mut e = Engine::new();
8581        make_three_row_users(&mut e);
8582        let r = e.execute("SELECT * FROM users WHERE id > 1").unwrap();
8583        let (_, rows) = unwrap_rows(r);
8584        assert_eq!(rows.len(), 2);
8585        assert_eq!(rows[0].values[0], Value::Int(2));
8586        assert_eq!(rows[1].values[0], Value::Int(3));
8587    }
8588
8589    #[test]
8590    fn where_with_null_result_filters_out_row() {
8591        let mut e = Engine::new();
8592        make_three_row_users(&mut e);
8593        // score is NULL for bob → score > 80 is NULL → row excluded
8594        let r = e.execute("SELECT * FROM users WHERE score > 80").unwrap();
8595        let (_, rows) = unwrap_rows(r);
8596        assert_eq!(rows.len(), 1);
8597        assert_eq!(rows[0].values[1], Value::Text("alice".into()));
8598    }
8599
8600    #[test]
8601    fn projection_named_columns() {
8602        let mut e = Engine::new();
8603        make_three_row_users(&mut e);
8604        let r = e.execute("SELECT name, score FROM users").unwrap();
8605        let (cols, rows) = unwrap_rows(r);
8606        assert_eq!(cols.len(), 2);
8607        assert_eq!(cols[0].name, "name");
8608        assert_eq!(cols[1].name, "score");
8609        assert_eq!(rows.len(), 3);
8610        assert_eq!(
8611            rows[0].values,
8612            vec![Value::Text("alice".into()), Value::Int(90)]
8613        );
8614    }
8615
8616    #[test]
8617    fn projection_with_column_alias() {
8618        let mut e = Engine::new();
8619        make_three_row_users(&mut e);
8620        let r = e
8621            .execute("SELECT name AS who FROM users WHERE id = 1")
8622            .unwrap();
8623        let (cols, rows) = unwrap_rows(r);
8624        assert_eq!(cols[0].name, "who");
8625        assert_eq!(rows.len(), 1);
8626        assert_eq!(rows[0].values[0], Value::Text("alice".into()));
8627    }
8628
8629    #[test]
8630    fn qualified_column_with_table_alias_resolves() {
8631        let mut e = Engine::new();
8632        make_three_row_users(&mut e);
8633        let r = e
8634            .execute("SELECT u.id, u.name FROM users AS u WHERE u.id < 3")
8635            .unwrap();
8636        let (cols, rows) = unwrap_rows(r);
8637        assert_eq!(cols.len(), 2);
8638        assert_eq!(rows.len(), 2);
8639    }
8640
8641    #[test]
8642    fn qualified_column_with_wrong_alias_errors() {
8643        let mut e = Engine::new();
8644        make_three_row_users(&mut e);
8645        let err = e.execute("SELECT x.id FROM users AS u").unwrap_err();
8646        assert!(matches!(
8647            err,
8648            EngineError::Eval(EvalError::UnknownQualifier { ref qualifier }) if qualifier == "x"
8649        ));
8650    }
8651
8652    #[test]
8653    fn select_unknown_column_errors_in_projection() {
8654        let mut e = Engine::new();
8655        make_three_row_users(&mut e);
8656        let err = e.execute("SELECT ghost FROM users").unwrap_err();
8657        assert!(matches!(
8658            err,
8659            EngineError::Eval(EvalError::ColumnNotFound { ref name }) if name == "ghost"
8660        ));
8661    }
8662
8663    #[test]
8664    fn where_unknown_column_errors() {
8665        let mut e = Engine::new();
8666        make_three_row_users(&mut e);
8667        let err = e
8668            .execute("SELECT * FROM users WHERE ghost = 1")
8669            .unwrap_err();
8670        assert!(matches!(
8671            err,
8672            EngineError::Eval(EvalError::ColumnNotFound { .. })
8673        ));
8674    }
8675
8676    #[test]
8677    fn expression_projection_evaluates_and_renders() {
8678        // Compound expressions in the SELECT list are evaluated per row;
8679        // the output column is typed TEXT, name defaults to the expression.
8680        let mut e = Engine::new();
8681        e.execute("CREATE TABLE t (a INT NOT NULL)").unwrap();
8682        e.execute("INSERT INTO t VALUES (3)").unwrap();
8683        let (_, rows) = unwrap_rows(e.execute("SELECT 1 + 2 FROM t").unwrap());
8684        assert_eq!(rows.len(), 1);
8685        // The expression evaluates to integer 3; rendered as the cell value
8686        // (storage::Value::Int(3) since arithmetic kept ints).
8687        assert_eq!(rows[0].values[0], Value::Int(3));
8688    }
8689
8690    #[test]
8691    fn select_unknown_table_errors() {
8692        let mut e = Engine::new();
8693        let err = e.execute("SELECT * FROM ghost").unwrap_err();
8694        assert!(matches!(
8695            err,
8696            EngineError::Storage(StorageError::TableNotFound { .. })
8697        ));
8698    }
8699
8700    #[test]
8701    fn invalid_sql_returns_parse_error() {
8702        // v4.4: UPDATE is now real SQL, so use a true syntactic
8703        // garbage payload for the parse-error path.
8704        let mut e = Engine::new();
8705        let err = e.execute("THIS_IS_NOT_A_KEYWORD foo bar baz").unwrap_err();
8706        assert!(matches!(err, EngineError::Parse(_)));
8707    }
8708
8709    // --- v0.8 CREATE INDEX + index seek ------------------------------------
8710
8711    #[test]
8712    fn create_index_registers_on_table() {
8713        let mut e = Engine::new();
8714        make_three_row_users(&mut e);
8715        e.execute("CREATE INDEX by_name ON users (name)").unwrap();
8716        let t = e.catalog().get("users").unwrap();
8717        assert_eq!(t.indices().len(), 1);
8718        assert_eq!(t.indices()[0].name, "by_name");
8719    }
8720
8721    #[test]
8722    fn create_index_on_unknown_table_errors() {
8723        let mut e = Engine::new();
8724        let err = e.execute("CREATE INDEX i ON ghost (a)").unwrap_err();
8725        assert!(matches!(
8726            err,
8727            EngineError::Storage(StorageError::TableNotFound { .. })
8728        ));
8729    }
8730
8731    #[test]
8732    fn create_index_on_unknown_column_errors() {
8733        let mut e = Engine::new();
8734        make_three_row_users(&mut e);
8735        let err = e.execute("CREATE INDEX i ON users (ghost)").unwrap_err();
8736        assert!(matches!(
8737            err,
8738            EngineError::Storage(StorageError::ColumnNotFound { .. })
8739        ));
8740    }
8741
8742    #[test]
8743    fn select_eq_uses_index_returns_same_rows_as_scan() {
8744        // Build two engines: one with an index, one without. Same query →
8745        // same row set (index is a planner optimisation, not a semantic
8746        // change).
8747        let mut without = Engine::new();
8748        make_three_row_users(&mut without);
8749        let mut with = Engine::new();
8750        make_three_row_users(&mut with);
8751        with.execute("CREATE INDEX by_id ON users (id)").unwrap();
8752
8753        let q = "SELECT * FROM users WHERE id = 2";
8754        let (_, no_idx_rows) = unwrap_rows(without.execute(q).unwrap());
8755        let (_, idx_rows) = unwrap_rows(with.execute(q).unwrap());
8756        assert_eq!(no_idx_rows, idx_rows);
8757        assert_eq!(idx_rows.len(), 1);
8758    }
8759
8760    #[test]
8761    fn select_eq_with_no_matching_index_value_returns_empty() {
8762        let mut e = Engine::new();
8763        make_three_row_users(&mut e);
8764        e.execute("CREATE INDEX by_id ON users (id)").unwrap();
8765        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM users WHERE id = 999").unwrap());
8766        assert_eq!(rows.len(), 0);
8767    }
8768
8769    // --- v0.9 transactions -------------------------------------------------
8770
8771    #[test]
8772    fn begin_sets_in_transaction_flag() {
8773        let mut e = Engine::new();
8774        assert!(!e.in_transaction());
8775        e.execute("BEGIN").unwrap();
8776        assert!(e.in_transaction());
8777    }
8778
8779    #[test]
8780    fn double_begin_errors() {
8781        let mut e = Engine::new();
8782        e.execute("BEGIN").unwrap();
8783        let err = e.execute("BEGIN").unwrap_err();
8784        assert_eq!(err, EngineError::TransactionAlreadyOpen);
8785    }
8786
8787    #[test]
8788    fn commit_without_begin_errors() {
8789        let mut e = Engine::new();
8790        let err = e.execute("COMMIT").unwrap_err();
8791        assert_eq!(err, EngineError::NoActiveTransaction);
8792    }
8793
8794    #[test]
8795    fn rollback_without_begin_errors() {
8796        let mut e = Engine::new();
8797        let err = e.execute("ROLLBACK").unwrap_err();
8798        assert_eq!(err, EngineError::NoActiveTransaction);
8799    }
8800
8801    #[test]
8802    fn commit_applies_shadow_to_committed_catalog() {
8803        let mut e = Engine::new();
8804        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
8805        e.execute("BEGIN").unwrap();
8806        e.execute("INSERT INTO t VALUES (1)").unwrap();
8807        e.execute("INSERT INTO t VALUES (2)").unwrap();
8808        e.execute("COMMIT").unwrap();
8809        assert!(!e.in_transaction());
8810        assert_eq!(e.catalog().get("t").unwrap().row_count(), 2);
8811    }
8812
8813    #[test]
8814    fn rollback_discards_shadow() {
8815        let mut e = Engine::new();
8816        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
8817        e.execute("BEGIN").unwrap();
8818        e.execute("INSERT INTO t VALUES (1)").unwrap();
8819        e.execute("INSERT INTO t VALUES (2)").unwrap();
8820        e.execute("ROLLBACK").unwrap();
8821        assert!(!e.in_transaction());
8822        assert_eq!(e.catalog().get("t").unwrap().row_count(), 0);
8823    }
8824
8825    #[test]
8826    fn select_during_tx_sees_uncommitted_writes_own_session() {
8827        // The shadow catalog is read by SELECTs while a TX is open — the
8828        // session can see its own pending writes.
8829        let mut e = Engine::new();
8830        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
8831        e.execute("BEGIN").unwrap();
8832        e.execute("INSERT INTO t VALUES (42)").unwrap();
8833        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM t").unwrap());
8834        assert_eq!(rows.len(), 1);
8835        assert_eq!(rows[0].values[0], Value::Int(42));
8836    }
8837
8838    #[test]
8839    fn snapshot_with_no_users_is_bare_catalog_format() {
8840        let mut e = Engine::new();
8841        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
8842        let bytes = e.snapshot();
8843        assert_eq!(
8844            &bytes[..8],
8845            b"SPGDB001",
8846            "must be the bare v3.x catalog magic"
8847        );
8848        let e2 = Engine::restore_envelope(&bytes).unwrap();
8849        assert!(e2.users().is_empty());
8850        assert_eq!(e2.catalog().table_count(), 1);
8851    }
8852
8853    #[test]
8854    fn snapshot_with_users_round_trips_both_via_envelope() {
8855        let mut e = Engine::new();
8856        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
8857        e.create_user("alice", "pw1", Role::Admin, [9; 16]).unwrap();
8858        e.create_user("bob", "pw2", Role::ReadOnly, [5; 16])
8859            .unwrap();
8860        let bytes = e.snapshot();
8861        assert_eq!(&bytes[..8], b"SPGENV01", "must be the v4.1 envelope magic");
8862        let e2 = Engine::restore_envelope(&bytes).unwrap();
8863        assert_eq!(e2.users().len(), 2);
8864        assert_eq!(e2.verify_user("alice", "pw1"), Some(Role::Admin));
8865        assert_eq!(e2.verify_user("bob", "pw2"), Some(Role::ReadOnly));
8866        assert_eq!(e2.verify_user("alice", "wrong"), None);
8867        assert_eq!(e2.catalog().table_count(), 1);
8868    }
8869
8870    #[test]
8871    fn ddl_inside_tx_also_rolled_back() {
8872        let mut e = Engine::new();
8873        e.execute("BEGIN").unwrap();
8874        e.execute("CREATE TABLE t (v INT)").unwrap();
8875        // Visible inside the TX.
8876        e.execute("SELECT * FROM t").unwrap();
8877        e.execute("ROLLBACK").unwrap();
8878        // Gone after rollback.
8879        let err = e.execute("SELECT * FROM t").unwrap_err();
8880        assert!(matches!(
8881            err,
8882            EngineError::Storage(StorageError::TableNotFound { .. })
8883        ));
8884    }
8885
8886    // ── v6.1.2: CREATE / DROP PUBLICATION (engine-side) ──────
8887
8888    #[test]
8889    fn create_publication_lands_in_catalog() {
8890        let mut e = Engine::new();
8891        assert!(e.publications().is_empty());
8892        e.execute("CREATE PUBLICATION pub_a").unwrap();
8893        assert_eq!(e.publications().len(), 1);
8894        assert!(e.publications().contains("pub_a"));
8895    }
8896
8897    #[test]
8898    fn create_publication_duplicate_errors() {
8899        let mut e = Engine::new();
8900        e.execute("CREATE PUBLICATION pub_a").unwrap();
8901        let err = e.execute("CREATE PUBLICATION pub_a").unwrap_err();
8902        assert!(
8903            alloc::format!("{err:?}").contains("DuplicateName"),
8904            "got {err:?}"
8905        );
8906    }
8907
8908    #[test]
8909    fn drop_publication_silent_when_absent() {
8910        let mut e = Engine::new();
8911        // PG-compatible: DROP a publication that doesn't exist
8912        // succeeds (no-op) but reports zero affected.
8913        let r = e.execute("DROP PUBLICATION nope").unwrap();
8914        match r {
8915            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
8916            other => panic!("expected CommandOk, got {other:?}"),
8917        }
8918    }
8919
8920    #[test]
8921    fn drop_publication_present_reports_one_affected() {
8922        let mut e = Engine::new();
8923        e.execute("CREATE PUBLICATION pub_a").unwrap();
8924        let r = e.execute("DROP PUBLICATION pub_a").unwrap();
8925        match r {
8926            QueryResult::CommandOk {
8927                affected,
8928                modified_catalog,
8929            } => {
8930                assert_eq!(affected, 1);
8931                assert!(modified_catalog);
8932            }
8933            other => panic!("expected CommandOk, got {other:?}"),
8934        }
8935        assert!(e.publications().is_empty());
8936    }
8937
8938    #[test]
8939    fn publications_persist_across_snapshot_restore() {
8940        // The persist-across-restart ship-gate at the engine layer —
8941        // snapshot → restore_envelope round trip must preserve the
8942        // publication catalog. The spg-server e2e covers the
8943        // process-restart variant.
8944        let mut e = Engine::new();
8945        e.execute("CREATE PUBLICATION pub_a").unwrap();
8946        e.execute("CREATE PUBLICATION pub_b FOR ALL TABLES").unwrap();
8947        let snap = e.snapshot();
8948        let e2 = Engine::restore_envelope(&snap).unwrap();
8949        assert_eq!(e2.publications().len(), 2);
8950        assert!(e2.publications().contains("pub_a"));
8951        assert!(e2.publications().contains("pub_b"));
8952    }
8953
8954    #[test]
8955    fn create_publication_allowed_inside_transaction() {
8956        // v6.1.4 dropped the v6.1.2 in-TX guard — PG allows
8957        // CREATE PUBLICATION inside a TX and the auto-commit
8958        // wrap path needs the same allowance.
8959        let mut e = Engine::new();
8960        e.execute("BEGIN").unwrap();
8961        e.execute("CREATE PUBLICATION pub_a").unwrap();
8962        e.execute("COMMIT").unwrap();
8963        assert!(e.publications().contains("pub_a"));
8964    }
8965
8966    // ── v6.1.3: SHOW PUBLICATIONS + FOR-list variants ───────
8967
8968    #[test]
8969    fn create_publication_for_table_list_lands_with_scope() {
8970        let mut e = Engine::new();
8971        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
8972        e.execute("CREATE TABLE t2 (id INT NOT NULL)").unwrap();
8973        e.execute("CREATE PUBLICATION pub_a FOR TABLE t1, t2")
8974            .unwrap();
8975        let scope = e.publications().get("pub_a").cloned();
8976        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = scope else {
8977            panic!("expected ForTables scope, got {scope:?}")
8978        };
8979        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
8980    }
8981
8982    #[test]
8983    fn create_publication_all_tables_except_lands_with_scope() {
8984        let mut e = Engine::new();
8985        e.execute("CREATE PUBLICATION pub_a FOR ALL TABLES EXCEPT t3")
8986            .unwrap();
8987        let scope = e.publications().get("pub_a").cloned();
8988        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = scope else {
8989            panic!("expected AllTablesExcept scope, got {scope:?}")
8990        };
8991        assert_eq!(ts, alloc::vec!["t3".to_string()]);
8992    }
8993
8994    #[test]
8995    fn show_publications_empty_returns_zero_rows() {
8996        let e = Engine::new();
8997        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
8998        let QueryResult::Rows { rows, columns } = r else {
8999            panic!()
9000        };
9001        assert!(rows.is_empty());
9002        assert_eq!(columns.len(), 3);
9003        assert_eq!(columns[0].name, "name");
9004        assert_eq!(columns[1].name, "scope");
9005        assert_eq!(columns[2].name, "table_count");
9006    }
9007
9008    #[test]
9009    fn show_publications_returns_one_row_per_publication_ordered_by_name() {
9010        let mut e = Engine::new();
9011        e.execute("CREATE PUBLICATION z_pub").unwrap();
9012        e.execute("CREATE PUBLICATION a_pub FOR TABLE t1, t2")
9013            .unwrap();
9014        e.execute("CREATE PUBLICATION m_pub FOR ALL TABLES EXCEPT bad")
9015            .unwrap();
9016        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
9017        let QueryResult::Rows { rows, .. } = r else {
9018            panic!()
9019        };
9020        assert_eq!(rows.len(), 3);
9021        // Alphabetical order: a_pub, m_pub, z_pub.
9022        let names: Vec<&str> = rows
9023            .iter()
9024            .map(|r| {
9025                if let Value::Text(s) = &r.values[0] {
9026                    s.as_str()
9027                } else {
9028                    panic!()
9029                }
9030            })
9031            .collect();
9032        assert_eq!(names, alloc::vec!["a_pub", "m_pub", "z_pub"]);
9033        // Row 0 — a_pub scope summary + table_count = 2.
9034        match &rows[0].values[1] {
9035            Value::Text(s) => assert_eq!(s, "FOR TABLE t1, t2"),
9036            other => panic!("expected Text, got {other:?}"),
9037        }
9038        assert_eq!(rows[0].values[2], Value::Int(2));
9039        // Row 1 — m_pub.
9040        match &rows[1].values[1] {
9041            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES EXCEPT bad"),
9042            other => panic!("expected Text, got {other:?}"),
9043        }
9044        assert_eq!(rows[1].values[2], Value::Int(1));
9045        // Row 2 — z_pub (AllTables → NULL count).
9046        match &rows[2].values[1] {
9047            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES"),
9048            other => panic!("expected Text, got {other:?}"),
9049        }
9050        assert_eq!(rows[2].values[2], Value::Null);
9051    }
9052
9053    #[test]
9054    fn for_list_scopes_persist_across_snapshot() {
9055        // The v6.1.2 envelope-v3 round-trip exercised AllTables;
9056        // v6.1.3 needs the scope-1 / scope-2 tags to survive too.
9057        let mut e = Engine::new();
9058        e.execute("CREATE PUBLICATION p1 FOR TABLE t1, t2").unwrap();
9059        e.execute("CREATE PUBLICATION p2 FOR ALL TABLES EXCEPT bad, worse")
9060            .unwrap();
9061        let snap = e.snapshot();
9062        let e2 = Engine::restore_envelope(&snap).unwrap();
9063        assert_eq!(e2.publications().len(), 2);
9064        let p1 = e2.publications().get("p1").cloned();
9065        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = p1 else {
9066            panic!("p1 scope lost: {p1:?}")
9067        };
9068        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
9069        let p2 = e2.publications().get("p2").cloned();
9070        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = p2 else {
9071            panic!("p2 scope lost: {p2:?}")
9072        };
9073        assert_eq!(ts, alloc::vec!["bad".to_string(), "worse".to_string()]);
9074    }
9075
9076    // ── v6.1.4: CREATE / DROP SUBSCRIPTION + SHOW + envelope v4 ─
9077
9078    #[test]
9079    fn create_subscription_lands_in_catalog_with_defaults() {
9080        let mut e = Engine::new();
9081        e.execute(
9082            "CREATE SUBSCRIPTION sub_a CONNECTION 'host=127.0.0.1 port=20002' PUBLICATION pub_a",
9083        )
9084        .unwrap();
9085        let s = e.subscriptions().get("sub_a").cloned().expect("present");
9086        assert_eq!(s.conn_str, "host=127.0.0.1 port=20002");
9087        assert_eq!(s.publications, alloc::vec!["pub_a".to_string()]);
9088        assert!(s.enabled);
9089        assert_eq!(s.last_received_pos, 0);
9090    }
9091
9092    #[test]
9093    fn create_subscription_duplicate_name_errors() {
9094        let mut e = Engine::new();
9095        e.execute("CREATE SUBSCRIPTION s CONNECTION 'host=x' PUBLICATION p")
9096            .unwrap();
9097        let err = e
9098            .execute("CREATE SUBSCRIPTION s CONNECTION 'host=y' PUBLICATION p")
9099            .unwrap_err();
9100        assert!(
9101            alloc::format!("{err:?}").contains("DuplicateName"),
9102            "got {err:?}"
9103        );
9104    }
9105
9106    #[test]
9107    fn drop_subscription_silent_when_absent() {
9108        let mut e = Engine::new();
9109        let r = e.execute("DROP SUBSCRIPTION never").unwrap();
9110        match r {
9111            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
9112            other => panic!("expected CommandOk, got {other:?}"),
9113        }
9114    }
9115
9116    #[test]
9117    fn subscription_advance_updates_last_pos_monotone() {
9118        let mut e = Engine::new();
9119        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
9120            .unwrap();
9121        assert!(e.subscription_advance("s", 100));
9122        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
9123        assert!(e.subscription_advance("s", 50)); // stale → ignored
9124        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
9125        assert!(e.subscription_advance("s", 200));
9126        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 200);
9127        assert!(!e.subscription_advance("missing", 1));
9128    }
9129
9130    #[test]
9131    fn show_subscriptions_returns_rows_ordered_by_name() {
9132        let mut e = Engine::new();
9133        e.execute("CREATE SUBSCRIPTION z_sub CONNECTION 'h=x' PUBLICATION p1, p2")
9134            .unwrap();
9135        e.execute("CREATE SUBSCRIPTION a_sub CONNECTION 'h=y' PUBLICATION p3")
9136            .unwrap();
9137        let r = e.execute_readonly("SHOW SUBSCRIPTIONS").unwrap();
9138        let QueryResult::Rows { rows, columns } = r else {
9139            panic!()
9140        };
9141        assert_eq!(rows.len(), 2);
9142        assert_eq!(columns.len(), 5);
9143        assert_eq!(columns[0].name, "name");
9144        assert_eq!(columns[4].name, "last_received_pos");
9145        // Alphabetical: a_sub, z_sub.
9146        let names: Vec<&str> = rows
9147            .iter()
9148            .map(|r| {
9149                if let Value::Text(s) = &r.values[0] {
9150                    s.as_str()
9151                } else {
9152                    panic!()
9153                }
9154            })
9155            .collect();
9156        assert_eq!(names, alloc::vec!["a_sub", "z_sub"]);
9157        // Row 0: a_sub
9158        assert_eq!(rows[0].values[1], Value::Text("h=y".to_string()));
9159        assert_eq!(rows[0].values[2], Value::Text("p3".to_string()));
9160        assert_eq!(rows[0].values[3], Value::Bool(true));
9161        assert_eq!(rows[0].values[4], Value::BigInt(0));
9162        // Row 1: z_sub — publications join with ", "
9163        assert_eq!(rows[1].values[2], Value::Text("p1, p2".to_string()));
9164    }
9165
9166    #[test]
9167    fn subscriptions_persist_across_snapshot_envelope_v4() {
9168        let mut e = Engine::new();
9169        e.execute("CREATE SUBSCRIPTION s1 CONNECTION 'h=A' PUBLICATION p1, p2")
9170            .unwrap();
9171        e.execute("CREATE SUBSCRIPTION s2 CONNECTION 'h=B' PUBLICATION p3")
9172            .unwrap();
9173        e.subscription_advance("s2", 42);
9174        let snap = e.snapshot();
9175        let e2 = Engine::restore_envelope(&snap).unwrap();
9176        assert_eq!(e2.subscriptions().len(), 2);
9177        let s1 = e2.subscriptions().get("s1").unwrap();
9178        assert_eq!(s1.conn_str, "h=A");
9179        assert_eq!(s1.publications, alloc::vec!["p1".to_string(), "p2".to_string()]);
9180        assert_eq!(s1.last_received_pos, 0);
9181        let s2 = e2.subscriptions().get("s2").unwrap();
9182        assert_eq!(s2.last_received_pos, 42);
9183    }
9184
9185    #[test]
9186    fn v3_envelope_loads_with_empty_subscriptions() {
9187        // v3 snapshot (publications-only). Forge it by hand so we
9188        // verify v6.1.4 readers don't panic — they must surface
9189        // empty subscriptions and a populated publication table.
9190        let mut e = Engine::new();
9191        e.execute("CREATE PUBLICATION pub_legacy").unwrap();
9192        let catalog = e.catalog.serialize();
9193        let users = crate::users::serialize_users(&e.users);
9194        let pubs = e.publications.serialize();
9195        let mut buf = Vec::new();
9196        buf.extend_from_slice(b"SPGENV01");
9197        buf.push(3u8); // v3
9198        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
9199        buf.extend_from_slice(&catalog);
9200        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
9201        buf.extend_from_slice(&users);
9202        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
9203        buf.extend_from_slice(&pubs);
9204        let crc = spg_crypto::crc32::crc32(&buf);
9205        buf.extend_from_slice(&crc.to_le_bytes());
9206
9207        let e2 = Engine::restore_envelope(&buf).expect("v3 envelope restores under v4 reader");
9208        assert!(e2.subscriptions().is_empty());
9209        assert!(e2.publications().contains("pub_legacy"));
9210    }
9211
9212    #[test]
9213    fn create_subscription_allowed_inside_transaction() {
9214        let mut e = Engine::new();
9215        e.execute("BEGIN").unwrap();
9216        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
9217            .unwrap();
9218        e.execute("COMMIT").unwrap();
9219        assert!(e.subscriptions().contains("s"));
9220    }
9221
9222    #[test]
9223    // ── v6.2.0: ANALYZE + spg_statistic + envelope v5 ──────────
9224
9225    #[test]
9226    fn analyze_populates_histogram_bounds() {
9227        let mut e = Engine::new();
9228        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT)").unwrap();
9229        for i in 0..50 {
9230            e.execute(&alloc::format!(
9231                "INSERT INTO t VALUES ({i}, 'name{i}')"
9232            ))
9233            .unwrap();
9234        }
9235        e.execute("ANALYZE t").unwrap();
9236        let stats = e.statistics();
9237        let id_stats = stats.get("t", "id").unwrap();
9238        assert!(id_stats.histogram_bounds.len() >= 2);
9239        assert_eq!(id_stats.histogram_bounds.first().unwrap(), "0");
9240        assert_eq!(id_stats.histogram_bounds.last().unwrap(), "49");
9241        assert!((id_stats.null_frac - 0.0).abs() < 1e-6);
9242        assert_eq!(id_stats.n_distinct, 50);
9243    }
9244
9245    #[test]
9246    fn reanalyze_overwrites_prior_stats() {
9247        let mut e = Engine::new();
9248        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9249        for i in 0..10 {
9250            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9251        }
9252        e.execute("ANALYZE t").unwrap();
9253        let n1 = e.statistics().get("t", "id").unwrap().n_distinct;
9254        assert_eq!(n1, 10);
9255        for i in 10..30 {
9256            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9257        }
9258        e.execute("ANALYZE t").unwrap();
9259        let n2 = e.statistics().get("t", "id").unwrap().n_distinct;
9260        assert_eq!(n2, 30);
9261    }
9262
9263    #[test]
9264    fn analyze_unknown_table_errors() {
9265        let mut e = Engine::new();
9266        let err = e.execute("ANALYZE nonexistent").unwrap_err();
9267        assert!(matches!(err, EngineError::Storage(StorageError::TableNotFound { .. })));
9268    }
9269
9270    #[test]
9271    fn bare_analyze_covers_all_user_tables() {
9272        let mut e = Engine::new();
9273        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
9274        e.execute("CREATE TABLE t2 (name TEXT NOT NULL)").unwrap();
9275        e.execute("INSERT INTO t1 VALUES (1)").unwrap();
9276        e.execute("INSERT INTO t2 VALUES ('alice')").unwrap();
9277        let r = e.execute("ANALYZE").unwrap();
9278        match r {
9279            QueryResult::CommandOk { affected, modified_catalog } => {
9280                assert_eq!(affected, 2);
9281                assert!(modified_catalog);
9282            }
9283            other => panic!("expected CommandOk, got {other:?}"),
9284        }
9285        assert!(e.statistics().get("t1", "id").is_some());
9286        assert!(e.statistics().get("t2", "name").is_some());
9287    }
9288
9289    #[test]
9290    fn select_from_spg_statistic_returns_rows_per_column() {
9291        let mut e = Engine::new();
9292        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
9293            .unwrap();
9294        e.execute("INSERT INTO t VALUES (1, 'a')").unwrap();
9295        e.execute("INSERT INTO t VALUES (2, 'b')").unwrap();
9296        e.execute("ANALYZE t").unwrap();
9297        let r = e.execute_readonly("SELECT * FROM spg_statistic").unwrap();
9298        let QueryResult::Rows { rows, columns } = r else {
9299            panic!()
9300        };
9301        // v6.7.0 — spg_statistic gained a `cold_row_count` column.
9302        assert_eq!(columns.len(), 6);
9303        assert_eq!(columns[0].name, "table_name");
9304        assert_eq!(columns[4].name, "histogram_bounds");
9305        assert_eq!(columns[5].name, "cold_row_count");
9306        assert_eq!(rows.len(), 2, "one row per column of t");
9307        // Sorted by (table_name, column_name).
9308        match (&rows[0].values[0], &rows[0].values[1]) {
9309            (Value::Text(t), Value::Text(c)) => {
9310                assert_eq!(t, "t");
9311                // BTreeMap orders (table, column); columns "id" < "label".
9312                assert_eq!(c, "id");
9313            }
9314            _ => panic!(),
9315        }
9316    }
9317
#[test]
fn analyze_skips_vector_columns() {
    // ANALYZE must record statistics for the scalar `id` column while
    // leaving the VECTOR column `v` out of the statistics store.
    let mut engine = Engine::new();
    for sql in [
        "CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)",
        "INSERT INTO t VALUES (1, [1, 2, 3])",
        "ANALYZE t",
    ] {
        engine.execute(sql).unwrap();
    }

    let scalar_stats = engine.statistics().get("t", "id");
    assert!(scalar_stats.is_some());
    let vector_stats = engine.statistics().get("t", "v");
    assert!(vector_stats.is_none());
}
9330
#[test]
fn statistics_persist_across_envelope_v5_round_trip() {
    // Statistics gathered by ANALYZE must survive snapshot → restore.
    let mut source = Engine::new();
    source.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
    (0..20)
        .map(|id| alloc::format!("INSERT INTO t VALUES ({id})"))
        .for_each(|sql| {
            source.execute(&sql).unwrap();
        });
    source.execute("ANALYZE").unwrap();

    let bytes = source.snapshot();
    let restored = Engine::restore_envelope(&bytes).unwrap();

    // Twenty distinct ids went in, so the restored stats must report 20.
    let id_stats = restored.statistics().get("t", "id").unwrap();
    assert_eq!(id_stats.n_distinct, 20);
}
9344
9345    // ── v6.2.1 auto-analyze threshold ───────────────────────────
9346
#[test]
fn auto_analyze_threshold_fires_after_10pct_of_min_rows_on_small_table() {
    // Starting from an empty table, ten inserts give modified=10 and
    // row_count=10. With threshold = 0.1 × max(10, 100) = 10, the table
    // must be flagged on the 10th INSERT and not before.
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();

    // First nine inserts: still below the threshold.
    (0..9)
        .map(|id| alloc::format!("INSERT INTO t VALUES ({id})"))
        .for_each(|sql| {
            engine.execute(&sql).unwrap();
        });
    assert!(engine.tables_needing_analyze().is_empty(), "9 < threshold");

    // The tenth insert reaches the threshold.
    engine.execute("INSERT INTO t VALUES (9)").unwrap();
    let flagged = engine.tables_needing_analyze();
    assert_eq!(flagged, alloc::vec!["t".to_string()]);
}
9362
#[test]
fn auto_analyze_threshold_uses_10pct_of_row_count_for_large_tables() {
    // Once ANALYZE has run on 1000 rows the threshold is 0.1 × row_count.
    // Every INSERT raises both the modification counter and row_count, so
    // firing from N=1000 needs M ≥ 0.1 × (1000 + M), i.e. M ≥ 112.
    // Phase one adds 50 rows (must not fire); phase two adds 150 more
    // (200 modifications, row_count=1200, threshold=120 → must fire).
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();

    // Baseline: 1000 rows, then ANALYZE to zero the counter.
    (0..1000).for_each(|id| {
        engine
            .execute(&alloc::format!("INSERT INTO t VALUES ({id})"))
            .unwrap();
    });
    engine.execute("ANALYZE t").unwrap();
    assert!(engine.tables_needing_analyze().is_empty(), "fresh ANALYZE");

    // Phase one: 50 modifications stay under the threshold.
    (1000..1050).for_each(|id| {
        engine
            .execute(&alloc::format!("INSERT INTO t VALUES ({id})"))
            .unwrap();
    });
    let after_phase_one = engine.tables_needing_analyze();
    assert!(
        after_phase_one.is_empty(),
        "50 inserts < threshold of ~105"
    );

    // Phase two: 150 more modifications push past the threshold.
    (1050..1200).for_each(|id| {
        engine
            .execute(&alloc::format!("INSERT INTO t VALUES ({id})"))
            .unwrap();
    });
    let expected = alloc::vec!["t".to_string()];
    assert_eq!(
        engine.tables_needing_analyze(),
        expected,
        "200 inserts > 0.1 × 1200 threshold"
    );
}
9393
#[test]
fn auto_analyze_threshold_resets_after_analyze() {
    // 200 inserts flag the table; a bare ANALYZE must clear the flag.
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
    (0..200)
        .map(|id| alloc::format!("INSERT INTO t VALUES ({id})"))
        .for_each(|sql| {
            engine.execute(&sql).unwrap();
        });

    let before_analyze = engine.tables_needing_analyze();
    assert!(!before_analyze.is_empty());

    engine.execute("ANALYZE").unwrap();
    let after_analyze = engine.tables_needing_analyze();
    assert!(after_analyze.is_empty(), "ANALYZE must reset the counter");
}
9408
#[test]
fn auto_analyze_threshold_tracks_updates_and_deletes() {
    // UPDATE and DELETE must count towards the auto-analyze counter,
    // not only INSERT.
    let mut engine = Engine::new();
    engine
        .execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
        .unwrap();
    (0..50)
        .map(|id| alloc::format!("INSERT INTO t VALUES ({id}, 'x')"))
        .for_each(|sql| {
            engine.execute(&sql).unwrap();
        });
    engine.execute("ANALYZE t").unwrap();

    // The UPDATE touches ids 0..=19 (20 rows) and the DELETE removes ids
    // 45..=49 (5 rows), so modified=25. The table never exceeds 100 rows,
    // so threshold = 0.1 × max(45, 100) = 10, and 25 >= 10 → trigger.
    for sql in [
        "UPDATE t SET label = 'y' WHERE id < 20",
        "DELETE FROM t WHERE id >= 45",
    ] {
        engine.execute(sql).unwrap();
    }

    let expected = alloc::vec!["t".to_string()];
    assert_eq!(engine.tables_needing_analyze(), expected);
}
9427
#[test]
fn v4_envelope_loads_with_empty_statistics() {
    // Hand-build a version-4 envelope: magic, version byte, then four
    // length-prefixed sections (catalog, users, publications,
    // subscriptions) and a trailing CRC — with no statistics section.
    // Restoring it must succeed and expose an empty Statistics.
    let mut engine = Engine::new();
    engine
        .create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
        .unwrap();

    let catalog = engine.catalog.serialize();
    let users = crate::users::serialize_users(&engine.users);
    let pubs = engine.publications.serialize();
    let subs = engine.subscriptions.serialize();
    let sections: [&[u8]; 4] = [&catalog, &users, &pubs, &subs];

    let mut envelope = Vec::new();
    envelope.extend_from_slice(b"SPGENV01");
    envelope.push(4u8);
    for section in sections {
        // Each section is written as a little-endian u32 length + payload.
        let len = u32::try_from(section.len()).unwrap();
        envelope.extend_from_slice(&len.to_le_bytes());
        envelope.extend_from_slice(section);
    }
    // The checksum covers everything written so far.
    let checksum = spg_crypto::crc32::crc32(&envelope);
    envelope.extend_from_slice(&checksum.to_le_bytes());

    let restored = Engine::restore_envelope(&envelope).expect("v4 envelope restores");
    assert!(restored.statistics().is_empty());
}
9456
#[test]
fn v1_v2_envelope_loads_with_empty_publications() {
    // Back-compat lock: an envelope written without a publication
    // trailer (version 2) must still deserialise, and the restored
    // engine must report zero publications.
    let mut engine = Engine::new();
    // Create one user so the users section of the envelope is non-empty.
    engine
        .create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
        .unwrap();

    // Forge the v2 envelope by hand: magic, version byte, then only the
    // catalog and users sections, followed by the CRC.
    let catalog = engine.catalog.serialize();
    let users = crate::users::serialize_users(&engine.users);
    let sections: [&[u8]; 2] = [&catalog, &users];

    let mut envelope = Vec::new();
    envelope.extend_from_slice(b"SPGENV01");
    envelope.push(2u8); // v2
    for section in sections {
        // Little-endian u32 length prefix, then the payload itself.
        let len = u32::try_from(section.len()).unwrap();
        envelope.extend_from_slice(&len.to_le_bytes());
        envelope.extend_from_slice(section);
    }
    let checksum = spg_crypto::crc32::crc32(&envelope);
    envelope.extend_from_slice(&checksum.to_le_bytes());

    let restored = Engine::restore_envelope(&envelope).expect("v2 envelope restores");
    assert!(restored.publications().is_empty());
}
9496}