powdb_query/executor/
mod.rs

1//! PowDB query executor.
2
3// Submodules that don't use macros defined in this file.
4mod compiled;
5mod eval;
6pub mod mem_budget;
7
8use crate::ast::*;
9use crate::canonicalize::canonicalize;
10use crate::plan::*;
11use crate::plan_cache::PlanCache;
12use crate::planner;
13use crate::result::{QueryError, QueryResult};
14use powdb_storage::catalog::Catalog;
15use powdb_storage::row::{decode_row, RowLayout, ROW_MAGIC, ROW_PREFIX_SIZE};
16use powdb_storage::types::*;
17use powdb_storage::view::ViewRegistry;
18pub use powdb_storage::wal::{WalDurabilityTicket, WalSyncMode};
19
20use std::io;
21use std::path::Path;
22use std::sync::{Arc, Mutex};
23use std::time::Instant;
24use tracing::{error, info, Level};
25
26use self::compiled::*;
27use self::eval::*;
28
29/// Legacy sentinel string constant — kept for backward compatibility with
30/// any external code matching on the string representation. New code should
31/// match on `QueryError::ReadonlyNeedsWrite` directly.
32pub const READONLY_NEEDS_WRITE: &str = "__POWDB_READONLY_NEEDS_WRITE__";
33
34/// Return the byte offset where the row body starts.
35///
36/// v0.5 rows begin with the `PROW` magic/version prefix. Legacy rows start
37/// directly with the row body. Raw executor fast paths must add this base
38/// before reading body-relative bitmap/data offsets.
39#[inline]
40pub(crate) fn row_body_base(row: &[u8]) -> usize {
41    if row.len() >= ROW_PREFIX_SIZE && &row[0..4] == ROW_MAGIC {
42        ROW_PREFIX_SIZE
43    } else {
44        0
45    }
46}
47
/// Query frontend dialect. PowQL remains the default/native dialect; SQL is
/// an explicit frontend that lowers to the same AST before planning.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryDialect {
    /// The native PowQL frontend (the default dialect).
    PowQL,
    /// The SQL frontend; it lowers to the same AST as PowQL before planning.
    Sql,
}
55
56/// Plan cache capacity. Bench workloads fill ~15 slots; real apps will sit
57/// comfortably in 256. Lookup is O(1), collisions clear the cache (see
58/// `plan_cache::PlanCache::insert`).
59const PLAN_CACHE_CAPACITY: usize = 256;
/// Salt mixed into a hash by [`sql_raw_cache_hash`]. The eight bytes are the
/// ASCII tag `raw-sql` followed by `0x01`, i.e. `0x7261_772d_7371_6c01`.
const SQL_RAW_CACHE_SALT: u64 = u64::from_be_bytes(*b"raw-sql\x01");

/// XOR `hash` with [`SQL_RAW_CACHE_SALT`].
///
/// The mapping is its own inverse: applying it twice returns the input.
#[inline]
fn sql_raw_cache_hash(hash: u64) -> u64 {
    SQL_RAW_CACHE_SALT ^ hash
}
/// Shared, thread-safe (`Send + Sync`) callback taking a path and a slice of
/// WAL records and returning `io::Result<()>`. `Engine` keeps it as an optional
/// hook and forwards it to the catalog's WAL-archive entry points.
type WalArchiveHook =
    Arc<dyn Fn(&Path, &[powdb_storage::wal::WalRecord]) -> io::Result<()> + Send + Sync>;
68
/// Maximum number of rows a join may produce before the executor aborts.
/// Prevents Cartesian-product blowups (e.g. `T cross join T` on 10K rows
/// would produce 100M rows in memory without this cap).
/// Checked by `check_join_limit`, which returns
/// `QueryError::JoinLimitExceeded` once a row count is strictly above it.
pub(super) const MAX_JOIN_ROWS: usize = 1_000_000;

/// Maximum candidate pairs allowed for a fallback nested-loop join. This is
/// grounded in the release benchmark's conservative 250 ms evaluation budget.
/// Used as the initial value of `Engine::nested_loop_pair_limit`; a specific
/// engine can replace it through `Engine::set_nested_loop_pair_limit`.
pub(super) const MAX_NESTED_LOOP_PAIRS: usize = 6_400_000;

/// Maximum number of rows that may be materialized for sorting.
/// Queries that exceed this should add a LIMIT clause to narrow the input
/// before sorting.
pub(super) const MAX_SORT_ROWS: usize = 10_000_000;
82
83#[inline]
84pub(super) fn check_join_limit(row_count: usize) -> Result<(), QueryError> {
85    if row_count > MAX_JOIN_ROWS {
86        return Err(QueryError::JoinLimitExceeded);
87    }
88    Ok(())
89}
90
91// ─── Mission D11 Phase 1: scalar hot-loop helpers ─────────────────────────
92//
// These macros expand into the scan body of `agg_single_col_fast` and sit
// inside the `for_each_row_raw_cancellable` closure. They exist to:
95//
96//   1. Split the loop on presence of a predicate *outside* the hot body,
97//      so the no-predicate path (agg_sum/agg_min/agg_max bench workloads)
98//      never pays the `Option<CompiledPredicate>` branch per row.
99//   2. Drop two bounds checks per row by reading the null bitmap byte
100//      and the 8-byte value via raw pointer casts.
101//
102// SAFETY (shared across every call site below):
103//
104//   - `$bmp_byte` is `col_idx / 8` where `col_idx < n_cols`, and the row body
105//     encoding stores `bitmap_size = n_cols.div_ceil(8)` bytes of bitmap
106//     starting at body offset 2. So `bmp_off = row_body_base(row) + 2 +
107//     $bmp_byte < row_len`, and `get_unchecked(bmp_off)` is inside the
108//     row slice.
109//   - `$off = 2 + bitmap_size + fixed_offsets[col_idx]` is body-relative for a fixed-size
110//     column. Every fixed-size column contributes `fixed_size(type_id)`
111//     bytes to the fixed region, so the row always has
112//     `[data_off .. data_off + 8]` available for any i64/f64 column, where
113//     `data_off = row_body_base(row) + $off` — enforced by the row encoder
114//     (`storage/src/row.rs`) and the schema invariant that a row with a
115//     given schema has enough body bytes for `2 + bitmap_size + fixed_region_size`.
116//   - Both macros are only invoked from `agg_single_col_fast`, which
117//     early-returns if the column isn't Int/Float (8-byte fixed) and
118//     early-returns if `fast.fixed_offsets[col_idx]` is `None`.
// Scan `$table` and run `$body` once per row whose target column is present,
// with that column's value bound to `$v` as an `i64`.
//
// Arguments:
//   - `$self`: value whose `catalog` field is passed by reference to the scan.
//   - `$table`: table argument forwarded to `for_each_row_raw_cancellable`.
//   - `$pred`: an `Option` holding a row predicate. It is matched by reference;
//     when it is `Some`, rows for which the predicate returns `false` are
//     skipped. The `Some`/`None` split happens once, outside the per-row closure.
//   - `$bmp_byte` / `$bmp_bit`: byte index into the null bitmap and bit index
//     within that byte; a set bit makes the row be skipped.
//   - `$off`: body-relative byte offset of the 8-byte little-endian value.
//
// The three offset expressions are evaluated once, before the scan starts.
// The scan result is propagated with `?`, so the macro can only be expanded
// inside a function whose return type accepts that error.
macro_rules! agg_int_loop {
    (
        $self:expr, $table:expr, $pred:expr,
        $bmp_byte:expr, $bmp_bit:expr, $off:expr,
        |$v:ident : i64| $body:block
    ) => {{
        let bmp_byte = $bmp_byte;
        let bmp_bit = $bmp_bit;
        let off = $off;
        if let Some(pred) = &$pred {
            for_each_row_raw_cancellable(&$self.catalog, $table, |_rid, data| {
                if !pred(data) {
                    return;
                }
                let base = row_body_base(data);
                let bmp_off = base + 2 + bmp_byte;
                let data_off = base + off;
                // Bounds guard: skip corrupt/truncated rows that are too
                // short to contain the bitmap byte or the 8-byte value.
                if bmp_off >= data.len() || data_off + 8 > data.len() {
                    return;
                }
                // SAFETY: `bmp_off < data.len()` is checked above.
                // The bitmap byte lives at body offset 2..2+bitmap_size in
                // the row encoding, and bmp_byte = col_idx / 8 < bitmap_size.
                // Corrupt rows are rejected by the bounds guard.
                let bmp = unsafe { *data.get_unchecked(bmp_off) };
                // A set bit in the null bitmap: no value to feed to `$body`.
                if (bmp >> bmp_bit) & 1 == 1 {
                    return;
                }
                // SAFETY: `data_off + 8 <= data.len()` is checked above.
                // `data_off = base + 2 + bitmap_size + fixed_offsets[col_idx]`
                // points to an 8-byte i64 in the fixed-size region of the row.
                // The pointer cast is valid because we read exactly 8
                // bytes via from_le_bytes. Corrupt rows are rejected by
                // the bounds guard.
                let $v: i64 =
                    unsafe { i64::from_le_bytes(*(data.as_ptr().add(data_off) as *const [u8; 8])) };
                $body
            })?;
        } else {
            for_each_row_raw_cancellable(&$self.catalog, $table, |_rid, data| {
                let base = row_body_base(data);
                let bmp_off = base + 2 + bmp_byte;
                let data_off = base + off;
                // Bounds guard: skip corrupt/truncated rows.
                if bmp_off >= data.len() || data_off + 8 > data.len() {
                    return;
                }
                // SAFETY: `bmp_off < data.len()` is checked above.
                // See the predicate branch for the full invariant.
                let bmp = unsafe { *data.get_unchecked(bmp_off) };
                // A set bit in the null bitmap: no value to feed to `$body`.
                if (bmp >> bmp_bit) & 1 == 1 {
                    return;
                }
                // SAFETY: `data_off + 8 <= data.len()` is checked above.
                // See the predicate branch for the full invariant.
                let $v: i64 =
                    unsafe { i64::from_le_bytes(*(data.as_ptr().add(data_off) as *const [u8; 8])) };
                $body
            })?;
        }
    }};
}
183
// Scan `$table` and run `$body` once per row whose target column is present,
// with that column's value bound to `$v` as an `f64`.
//
// Arguments:
//   - `$self`: value whose `catalog` field is passed by reference to the scan.
//   - `$table`: table argument forwarded to `for_each_row_raw_cancellable`.
//   - `$pred`: an `Option` holding a row predicate. It is matched by reference;
//     when it is `Some`, rows for which the predicate returns `false` are
//     skipped. The `Some`/`None` split happens once, outside the per-row closure.
//   - `$bmp_byte` / `$bmp_bit`: byte index into the null bitmap and bit index
//     within that byte; a set bit makes the row be skipped.
//   - `$off`: body-relative byte offset of the 8-byte little-endian value.
//
// The three offset expressions are evaluated once, before the scan starts.
// The scan result is propagated with `?`, so the macro can only be expanded
// inside a function whose return type accepts that error.
macro_rules! agg_float_loop {
    (
        $self:expr, $table:expr, $pred:expr,
        $bmp_byte:expr, $bmp_bit:expr, $off:expr,
        |$v:ident : f64| $body:block
    ) => {{
        let bmp_byte = $bmp_byte;
        let bmp_bit = $bmp_bit;
        let off = $off;
        if let Some(pred) = &$pred {
            for_each_row_raw_cancellable(&$self.catalog, $table, |_rid, data| {
                if !pred(data) {
                    return;
                }
                let base = row_body_base(data);
                let bmp_off = base + 2 + bmp_byte;
                let data_off = base + off;
                // Bounds guard: skip corrupt/truncated rows that are too
                // short to contain the bitmap byte or the 8-byte value.
                if bmp_off >= data.len() || data_off + 8 > data.len() {
                    return;
                }
                // SAFETY: `bmp_off < data.len()` is checked above.
                // The bitmap byte lives at body offset 2..2+bitmap_size in
                // the row encoding, and bmp_byte = col_idx / 8 < bitmap_size.
                // Corrupt rows are rejected by the bounds guard.
                let bmp = unsafe { *data.get_unchecked(bmp_off) };
                // A set bit in the null bitmap: no value to feed to `$body`.
                if (bmp >> bmp_bit) & 1 == 1 {
                    return;
                }
                // SAFETY: `data_off + 8 <= data.len()` is checked above.
                // `data_off = base + 2 + bitmap_size + fixed_offsets[col_idx]`
                // points to an 8-byte f64 in the fixed-size region of the row.
                // The pointer cast is valid because we read exactly 8
                // bytes via from_le_bytes. Corrupt rows are rejected by
                // the bounds guard.
                let $v: f64 =
                    unsafe { f64::from_le_bytes(*(data.as_ptr().add(data_off) as *const [u8; 8])) };
                $body
            })?;
        } else {
            for_each_row_raw_cancellable(&$self.catalog, $table, |_rid, data| {
                let base = row_body_base(data);
                let bmp_off = base + 2 + bmp_byte;
                let data_off = base + off;
                // Bounds guard: skip corrupt/truncated rows.
                if bmp_off >= data.len() || data_off + 8 > data.len() {
                    return;
                }
                // SAFETY: `bmp_off < data.len()` is checked above.
                // See the predicate branch for the full invariant.
                let bmp = unsafe { *data.get_unchecked(bmp_off) };
                // A set bit in the null bitmap: no value to feed to `$body`.
                if (bmp >> bmp_bit) & 1 == 1 {
                    return;
                }
                // SAFETY: `data_off + 8 <= data.len()` is checked above.
                // See the predicate branch for the full invariant.
                let $v: f64 =
                    unsafe { f64::from_le_bytes(*(data.as_ptr().add(data_off) as *const [u8; 8])) };
                $body
            })?;
        }
    }};
}
248
249// Submodules that use the macros above — must be declared after macro_rules!.
250mod plan_exec;
251mod prepared;
252
253#[cfg(test)]
254mod tests;
255
256// Re-exports for the public API
257pub use self::prepared::PreparedQuery;
258
259use self::plan_exec::{
260    aggregate_rows, aggregate_rows_with_provenance, compare_order_values,
261    cooperative_stable_sort_by, exec_group_by, exec_group_by_with_provenance,
262    execute_materialized_join, execute_window, for_each_row_raw_cancellable, format_plan_tree,
263    lower_unindexed_scans, predicate_column_indices_json, range_matches,
264    synthesize_range_predicate, validate_json_path_types, validate_no_stray_aggregates,
265};
266
267/// Mission infra-1: classify a parsed statement as read-only vs. mutating.
268/// Used by [`Engine::execute_powql_readonly`] and by the server handler
269/// to decide between the RwLock reader and writer sides. `Union` recurses
270/// because each side can independently be read/write (though in practice
271/// both sides are reads — the parser only builds Union from query shapes).
272pub fn is_read_only_statement(stmt: &Statement) -> bool {
273    match stmt {
274        Statement::Query(_) => true,
275        Statement::ListTypes | Statement::Describe(_) => true,
276        Statement::Union(u) => is_read_only_statement(&u.left) && is_read_only_statement(&u.right),
277        Statement::Insert(_)
278        | Statement::Upsert(_)
279        | Statement::UpdateQuery(_)
280        | Statement::DeleteQuery(_)
281        | Statement::CreateType(_)
282        | Statement::AlterTable(_)
283        | Statement::DropTable(_)
284        | Statement::CreateView(_)
285        | Statement::RefreshView(_)
286        | Statement::DropView(_) => false,
287        Statement::Begin | Statement::Commit | Statement::Rollback => false,
288        Statement::Explain(inner) => is_read_only_statement(inner),
289    }
290}
291
292/// Map a read-only executor result into the read-only-engine surface: the
293/// internal "this statement writes" sentinel ([`QueryError::ReadonlyNeedsWrite`])
294/// becomes the terminal, operator-facing [`QueryError::ReadonlyMode`]. There is
295/// no writer to escalate to in read-only mode, so the sentinel never leaves the
296/// engine.
297fn to_readonly_terminal(
298    result: Result<QueryResult, QueryError>,
299) -> Result<QueryResult, QueryError> {
300    match result {
301        Err(QueryError::ReadonlyNeedsWrite) => Err(QueryError::ReadonlyMode),
302        other => other,
303    }
304}
305
306/// Return whether executing this read plan would have to refresh a dirty
307/// materialized view. This is intentionally a whole-plan preflight: the server
308/// may retry only this typed condition under exclusive admission, so it must be
309/// raised before any input branch performs row work.
310fn plan_reads_dirty_view(plan: &PlanNode, views: &ViewRegistry) -> bool {
311    match plan {
312        PlanNode::SeqScan { table }
313        | PlanNode::AliasScan { table, .. }
314        | PlanNode::IndexScan { table, .. }
315        | PlanNode::RangeScan { table, .. }
316        | PlanNode::ExprIndexScan { table, .. }
317        | PlanNode::ExprRangeScan { table, .. }
318        | PlanNode::OrderedExprIndexScan { table, .. } => views.is_dirty(table),
319
320        PlanNode::Filter { input, .. }
321        | PlanNode::Project { input, .. }
322        | PlanNode::Sort { input, .. }
323        | PlanNode::Limit { input, .. }
324        | PlanNode::Offset { input, .. }
325        | PlanNode::Aggregate { input, .. }
326        | PlanNode::Distinct { input }
327        | PlanNode::GroupBy { input, .. }
328        | PlanNode::Window { input, .. } => plan_reads_dirty_view(input, views),
329
330        PlanNode::NestedLoopJoin { left, right, .. } | PlanNode::Union { left, right, .. } => {
331            plan_reads_dirty_view(left, views) || plan_reads_dirty_view(right, views)
332        }
333
334        // EXPLAIN formats its input without executing it, so inspecting a plan
335        // that names a dirty view never requires a refresh.
336        PlanNode::Explain { .. }
337        | PlanNode::AlterTable { .. }
338        | PlanNode::DropTable { .. }
339        | PlanNode::Insert { .. }
340        | PlanNode::Upsert { .. }
341        | PlanNode::Update { .. }
342        | PlanNode::Delete { .. }
343        | PlanNode::CreateTable { .. }
344        | PlanNode::ListTypes
345        | PlanNode::Describe { .. }
346        | PlanNode::CreateView { .. }
347        | PlanNode::RefreshView { .. }
348        | PlanNode::DropView { .. }
349        | PlanNode::Begin
350        | PlanNode::Commit
351        | PlanNode::Rollback => false,
352    }
353}
354
/// The query engine for one data directory: it owns the storage catalog and
/// the per-engine execution state (directory lock, plan cache, view registry,
/// transaction flag, resource limits, and the read-only marker).
pub struct Engine {
    /// Storage catalog opened or created for the data directory. Scans and the
    /// WAL controls exposed by this type go through it.
    catalog: Catalog,
    /// Exclusive PID-based lock on the data directory, held for the engine's
    /// lifetime so two separate processes can't open the same dir and corrupt
    /// the heap/WAL. Released on clean drop; a `mem::forget` crash leaves a
    /// stale lock the next open takes over. Leading `_`: it does its work
    /// through `Drop`, never read directly.
    _dir_lock: powdb_storage::dir_lock::DirLock,
    /// Mission D9 — cached parsed+planned query trees keyed by canonical
    /// hash. Saves the ~3μs parse+plan cost on repeat queries that differ
    /// only in literal values.
    ///
    /// Mission infra-1: wrapped in `Mutex` so the read path can be driven
    /// by `&self`. The critical section is extremely short — a single
    /// hashmap lookup + plan clone on a hit, or a single insert on a miss.
    /// A full `RwLock` would be over-engineered here; the contention window
    /// is smaller than the read-path scan work it gates.
    plan_cache: Mutex<PlanCache>,
    /// Mission C Phase 13: reusable `Vec<Value>` scratch buffer for the
    /// prepared-insert fast path. `execute_prepared` used to allocate a
    /// fresh `vec![Value::Empty; n_cols]` on every insert; recycling this
    /// buffer shaves one heap alloc per row on `insert_batch_1k`.
    insert_values_scratch: Vec<Value>,
    /// Materialized view registry: tracks view definitions, dependencies,
    /// and dirty state. Views are backed by regular catalog tables; this
    /// registry adds the lifecycle metadata.
    view_registry: ViewRegistry,
    /// Whether an explicit transaction is currently active. Starts `false`;
    /// the rollback entry points refuse to run unless it is set and clear it
    /// once the catalog has been restored.
    in_transaction: bool,
    /// WS2 — per-query memory budget ceiling (bytes). The running total lives
    /// in a thread-local (see [`mem_budget`]) and is reset at every top-level
    /// query entry, so sort/join/GROUP BY/IN-list materialization can be capped
    /// without OOM-killing the process. This field holds only the *limit* (a
    /// plain `usize`, so `Engine` stays `Sync` for the concurrent read path).
    /// Default [`mem_budget::DEFAULT_QUERY_MEMORY_LIMIT`] (256 MB); overridable
    /// via `Engine::with_memory_limit` (server reads `POWDB_QUERY_MEMORY_LIMIT`).
    query_memory_limit: usize,
    /// Maximum candidate pairs a fallback nested-loop join may evaluate before
    /// it is rejected. Default [`MAX_NESTED_LOOP_PAIRS`], overridable via
    /// [`Engine::set_nested_loop_pair_limit`] (server reads
    /// `POWDB_MAX_NESTED_LOOP_PAIRS`). A plain `usize` so `Engine` stays `Sync`.
    nested_loop_pair_limit: usize,
    /// Optional WAL-archive callback supplied at construction. When present it
    /// is handed to the catalog on open and reused by
    /// `rollback_transaction_preserving_wal_archive`; `None` for engines built
    /// without one and always `None` for read-only engines.
    wal_archive_hook: Option<WalArchiveHook>,
    /// True when opened via [`Engine::open_read_only`] for snapshot serving. In
    /// this mode the catalog and its files are read-only, the `DirLock` is a
    /// shared reader lock, and every mutating execute path returns the terminal
    /// [`QueryError::ReadonlyMode`] instead of ever touching disk.
    read_only: bool,
}
403
404impl Engine {
    /// Open or create a PowDB engine rooted at `data_dir`.
    ///
    /// If the directory already contains a catalog, it is reopened.
    /// Otherwise a fresh empty database is created. No WAL-archive hook is
    /// installed; see [`Engine::new_with_wal_archive`] for that variant.
    ///
    /// # Errors
    ///
    /// Propagates any `io::Error` raised while preparing the directory,
    /// acquiring the directory lock, or opening/creating the catalog.
    ///
    /// # Examples
    ///
    /// ```
    /// use powdb_query::executor::Engine;
    ///
    /// let dir = tempfile::tempdir().unwrap();
    /// let engine = Engine::new(dir.path()).unwrap();
    /// // Engine is ready — the directory now contains a catalog.
    /// ```
    pub fn new(data_dir: &Path) -> io::Result<Self> {
        Self::new_inner(data_dir, None)
    }
422
423    /// Open or create an engine that archives WAL records before any recovery,
424    /// rollback, or drop checkpoint truncates them. This keeps the query crate
425    /// independent of replication metadata while giving sync-aware callers one
426    /// lifecycle boundary for retained-history preservation.
427    pub fn new_with_wal_archive<F>(data_dir: &Path, archive: F) -> io::Result<Self>
428    where
429        F: Fn(&Path, &[powdb_storage::wal::WalRecord]) -> io::Result<()> + Send + Sync + 'static,
430    {
431        Self::new_inner(data_dir, Some(Arc::new(archive)))
432    }
433
434    fn new_inner(data_dir: &Path, wal_archive_hook: Option<WalArchiveHook>) -> io::Result<Self> {
435        powdb_storage::create_data_dir_secure(data_dir)?;
436        // Refuse to open a directory another live process already holds, before
437        // touching any on-disk state (concurrent writers corrupt the heap/WAL).
438        let dir_lock = powdb_storage::dir_lock::DirLock::acquire(data_dir)?;
439        // Try to reopen an existing database first; only create a fresh
440        // catalog when there isn't one already on disk.
441        let catalog_result = match &wal_archive_hook {
442            Some(hook) => {
443                let hook = Arc::clone(hook);
444                Catalog::open_with_wal_archive(data_dir, move |dir, records| hook(dir, records))
445            }
446            None => Catalog::open(data_dir),
447        };
448        let catalog = match catalog_result {
449            Ok(c) => {
450                info!(data_dir = %data_dir.display(), "engine reopened existing database");
451                c
452            }
453            Err(e) if e.kind() == io::ErrorKind::NotFound => {
454                info!(data_dir = %data_dir.display(), "engine initialized fresh database");
455                Catalog::create(data_dir)?
456            }
457            Err(e) => return Err(e),
458        };
459        let view_registry =
460            ViewRegistry::open(data_dir).unwrap_or_else(|_| ViewRegistry::new(data_dir));
461        Ok(Engine {
462            catalog,
463            _dir_lock: dir_lock,
464            plan_cache: Mutex::new(PlanCache::new(PLAN_CACHE_CAPACITY)),
465            insert_values_scratch: Vec::new(),
466            view_registry,
467            in_transaction: false,
468            query_memory_limit: mem_budget::DEFAULT_QUERY_MEMORY_LIMIT,
469            nested_loop_pair_limit: MAX_NESTED_LOOP_PAIRS,
470            wal_archive_hook,
471            read_only: false,
472        })
473    }
474
475    /// Open an engine **read-only** over a quiescent data directory for snapshot
476    /// serving (tier 1 of the replica story). This is the supported way to serve
477    /// a restored backup or a checkpointed replica with no write gate at all:
478    ///
479    /// - the catalog and every heap/index/WAL file are opened read-only
480    ///   ([`Catalog::open_read_only`]), so nothing on disk is ever mutated;
481    /// - a **shared reader** [`DirLock`] is taken, so N read-only processes may
482    ///   serve the same directory concurrently, while a read-write open refuses
483    ///   to start against live readers (and readers refuse a live writer);
484    /// - a non-empty WAL is refused with an actionable error rather than replayed
485    ///   (the directory must be recovered by a read-write engine first);
486    /// - every mutating statement returns the terminal [`QueryError::ReadonlyMode`].
487    ///
488    /// Materialized-view refresh must happen before snapshotting: a query over a
489    /// stale (dirty) view is refused in this mode rather than silently escalating.
490    pub fn open_read_only(data_dir: &Path) -> io::Result<Self> {
491        // Validate readability without ever chmod-ing the directory (a read-only
492        // open must leave it byte-identical).
493        powdb_storage::validate_data_dir_read_only(data_dir)?;
494        // A shared reader lock: coexists with other readers, refuses a live writer.
495        let dir_lock = powdb_storage::dir_lock::DirLock::acquire_reader(data_dir)?;
496        let catalog = Catalog::open_read_only(data_dir)?;
497        info!(data_dir = %data_dir.display(), "engine opened read-only for snapshot serving");
498        let view_registry =
499            ViewRegistry::open(data_dir).unwrap_or_else(|_| ViewRegistry::new(data_dir));
500        Ok(Engine {
501            catalog,
502            _dir_lock: dir_lock,
503            plan_cache: Mutex::new(PlanCache::new(PLAN_CACHE_CAPACITY)),
504            insert_values_scratch: Vec::new(),
505            view_registry,
506            in_transaction: false,
507            query_memory_limit: mem_budget::DEFAULT_QUERY_MEMORY_LIMIT,
508            nested_loop_pair_limit: MAX_NESTED_LOOP_PAIRS,
509            // No WAL-archive hook: a read-only engine never writes, so its Drop
510            // must never checkpoint (the hook is what would drive that).
511            wal_archive_hook: None,
512            read_only: true,
513        })
514    }
515
516    /// Read-only open with an explicit per-query memory budget (bytes).
517    pub fn open_read_only_with_memory_limit(
518        data_dir: &Path,
519        limit_bytes: usize,
520    ) -> io::Result<Self> {
521        let mut engine = Engine::open_read_only(data_dir)?;
522        engine.set_query_memory_limit(limit_bytes);
523        Ok(engine)
524    }
525
    /// Whether this engine was opened read-only for snapshot serving.
    ///
    /// Returns the `read_only` flag fixed at construction: `true` for engines
    /// built by [`Engine::open_read_only`], `false` for those built through
    /// the read-write constructors.
    pub fn is_read_only(&self) -> bool {
        self.read_only
    }
530
531    /// Open or create an engine with an explicit per-query memory limit
532    /// (bytes). Used by the server to apply `POWDB_QUERY_MEMORY_LIMIT`, and by
533    /// tests that need a tiny limit to exercise the budget guard.
534    pub fn with_memory_limit(data_dir: &Path, limit_bytes: usize) -> io::Result<Self> {
535        let mut engine = Engine::new(data_dir)?;
536        engine.set_query_memory_limit(limit_bytes);
537        Ok(engine)
538    }
539
540    /// Open or create an archive-aware engine with an explicit per-query memory
541    /// limit.
542    pub fn with_memory_limit_and_wal_archive<F>(
543        data_dir: &Path,
544        limit_bytes: usize,
545        archive: F,
546    ) -> io::Result<Self>
547    where
548        F: Fn(&Path, &[powdb_storage::wal::WalRecord]) -> io::Result<()> + Send + Sync + 'static,
549    {
550        let mut engine = Engine::new_with_wal_archive(data_dir, archive)?;
551        engine.set_query_memory_limit(limit_bytes);
552        Ok(engine)
553    }
554
    /// Current per-query memory limit in bytes.
    ///
    /// This is only the ceiling; it is the value passed to
    /// `mem_budget::charge` when materialized data is charged.
    pub fn query_memory_limit(&self) -> usize {
        self.query_memory_limit
    }
559
    /// Override the per-query memory limit in bytes.
    ///
    /// This is an in-place setter: it mutates the engine and returns nothing.
    /// The value is stored as given, without clamping.
    pub fn set_query_memory_limit(&mut self, limit_bytes: usize) {
        self.query_memory_limit = limit_bytes;
    }
564
    /// Current fallback nested-loop join candidate-pair cap.
    ///
    /// Initialised to [`MAX_NESTED_LOOP_PAIRS`] by the constructors and
    /// changed only through [`Engine::set_nested_loop_pair_limit`].
    pub fn nested_loop_pair_limit(&self) -> usize {
        self.nested_loop_pair_limit
    }
569
570    /// Override the fallback nested-loop join candidate-pair cap. Used by the
571    /// server to apply `POWDB_MAX_NESTED_LOOP_PAIRS`, and by tests that need a
572    /// tiny cap to exercise the guard on a small join. A zero limit is clamped
573    /// to 1 so a valid single-pair join is never rejected outright.
574    pub fn set_nested_loop_pair_limit(&mut self, limit: usize) {
575        self.nested_loop_pair_limit = limit.max(1);
576    }
577
    /// Set the WAL durability mode (see [`WalSyncMode`]). `Full` (the default)
    /// fsyncs every commit; `Normal` moves the fsync to a background flusher
    /// with a bounded crash-loss window; `Off` is bench-only (no durability).
    /// Wired from the server's `POWDB_SYNC_MODE` / `--sync-mode` config.
    ///
    /// The mode is forwarded unchanged to the catalog; no engine-level state
    /// is kept.
    pub fn set_wal_sync_mode(&mut self, mode: WalSyncMode) {
        self.catalog.set_wal_sync_mode(mode);
    }
585
    /// Run `f` with commit durability deferred — the WAL group-commit entry
    /// point for callers that serialize writers behind an exclusive lock.
    ///
    /// Inside `f`, Full-mode commit points register the WAL generation they
    /// need durable instead of fsyncing inline. The returned ticket (if any)
    /// must be waited on before the statement's result is acknowledged; the
    /// caller should release its exclusive engine lock first, so other
    /// committers can append while the fsync runs. That overlap is what lets
    /// one fsync cover many commits. A lone committer's wait performs the
    /// fsync immediately — group commit never introduces a delay.
    ///
    /// `Normal`/`Off` sync modes return no ticket; their durability
    /// contracts are unchanged. If `f` panics the engine must not be reused
    /// (the deferral flag may still be set); lock poisoning enforces this
    /// for callers that share the engine behind a lock.
    ///
    /// Returns `f`'s own result paired with the durability ticket taken from
    /// the catalog after the deferral flag has been cleared.
    pub fn run_with_deferred_durability<T>(
        &mut self,
        f: impl FnOnce(&mut Engine) -> T,
    ) -> (T, Option<WalDurabilityTicket>) {
        // Turn deferral on only for the duration of `f`.
        self.catalog.set_wal_sync_deferred(true);
        let out = f(self);
        // Reached only when `f` returns normally; a panic skips this reset.
        self.catalog.set_wal_sync_deferred(false);
        // Collect whatever durability requirement `f` registered, if any.
        let ticket = self.catalog.take_wal_durability_ticket();
        (out, ticket)
    }
611
    /// Number of fsyncs issued against the WAL (test/metrics hook).
    ///
    /// The value is read straight from the catalog's counter.
    pub fn wal_fsync_count(&self) -> u64 {
        self.catalog.wal_fsync_count()
    }
616
617    /// Roll back the active explicit transaction while archiving any committed
618    /// pre-transaction WAL records that recovery must replay and truncate.
619    /// This is the sync-aware counterpart to the ordinary `rollback` statement;
620    /// callers provide the archive hook so the query crate stays independent of
621    /// replication metadata.
622    pub fn rollback_transaction_with_wal_archive<F>(
623        &mut self,
624        archive: F,
625    ) -> Result<QueryResult, QueryError>
626    where
627        F: FnMut(&Path, &[powdb_storage::wal::WalRecord]) -> io::Result<()>,
628    {
629        if !self.in_transaction {
630            return Err(QueryError::Execution(
631                "no active transaction to roll back".into(),
632            ));
633        }
634        self.catalog
635            .rollback_to_last_sync_with_wal_archive(archive)
636            .map_err(|e| QueryError::StorageError(e.to_string()))?;
637        self.finish_rollback_after_catalog_restore()
638    }
639
640    pub fn rollback_transaction_preserving_wal_archive(
641        &mut self,
642    ) -> Result<QueryResult, QueryError> {
643        let Some(hook) = self.wal_archive_hook.clone() else {
644            if !self.in_transaction {
645                return Err(QueryError::Execution(
646                    "no active transaction to roll back".into(),
647                ));
648            }
649            self.catalog
650                .rollback_to_last_sync()
651                .map_err(|e| QueryError::StorageError(e.to_string()))?;
652            return self.finish_rollback_after_catalog_restore();
653        };
654        self.rollback_transaction_with_wal_archive(move |dir, records| hook(dir, records))
655    }
656
657    fn finish_rollback_after_catalog_restore(&mut self) -> Result<QueryResult, QueryError> {
658        self.in_transaction = false;
659        if let Ok(mut cache) = self.plan_cache.lock() {
660            cache.clear();
661        }
662        self.view_registry = ViewRegistry::open(self.catalog.data_dir())
663            .unwrap_or_else(|_| ViewRegistry::new(self.catalog.data_dir()));
664        Ok(QueryResult::Executed {
665            message: "transaction rolled back".to_string(),
666        })
667    }
668
    /// Enter a budgeted-statement frame for the current query. The returned
    /// guard must be held for the duration of the statement; on its drop the
    /// reentrancy depth is decremented. Only the *outermost* statement entry
    /// zeroes this thread's running total, so a nested `execute_powql` (the
    /// source query of a `create_view`/`refresh_view`) does NOT discard the
    /// outer frame's accounting. The accumulator is thread-local, so this never
    /// touches another concurrent query's total.
    ///
    /// The engine itself is not consulted here: the call is a plain
    /// delegation to `mem_budget::enter`.
    #[must_use = "the budget guard must outlive the statement body"]
    pub(super) fn enter_memory_budget(&self) -> mem_budget::EnterGuard {
        mem_budget::enter()
    }
680
681    /// Charge the estimated footprint of a freshly materialized batch of rows
682    /// against the current per-query budget. Returns
683    /// [`QueryError::MemoryLimitExceeded`] cleanly if the batch would push the
684    /// query over its limit. Used at every full-materialization point (sort
685    /// buffer, join build side, GROUP BY hash table, IN-list).
686    pub(super) fn charge_rows(&self, rows: &[Vec<Value>]) -> Result<(), QueryError> {
687        let mut total = 0usize;
688        let mut cancel = crate::cancel::CancelCheck::new();
689        for row in rows {
690            cancel.tick()?;
691            total = total.saturating_add(mem_budget::estimate_row_size(row));
692        }
693        mem_budget::charge(total, self.query_memory_limit)
694    }
695
696    /// Charge a materialized IN-list (the literal expressions pulled out of an
697    /// uncorrelated `IN (subquery)`) against the current per-query budget.
698    /// Each item is conservatively sized at the `Expr` slot plus, for string
699    /// literals, the owned heap bytes.
700    pub(super) fn charge_in_list(&self, list: &[crate::ast::Expr]) -> Result<(), QueryError> {
701        let base = std::mem::size_of::<crate::ast::Expr>();
702        let mut total = std::mem::size_of::<Vec<crate::ast::Expr>>();
703        let mut cancel = crate::cancel::CancelCheck::new();
704        for item in list {
705            cancel.tick()?;
706            total = total.saturating_add(base);
707            if let crate::ast::Expr::Literal(crate::ast::Literal::String(s)) = item {
708                total = total.saturating_add(s.capacity());
709            }
710        }
711        mem_budget::charge(total, self.query_memory_limit)
712    }
713
714    /// Dispatch to the requested query frontend.
715    pub fn execute_with_dialect(
716        &mut self,
717        dialect: QueryDialect,
718        input: &str,
719    ) -> Result<QueryResult, QueryError> {
720        match dialect {
721            QueryDialect::PowQL => self.execute_powql(input),
722            QueryDialect::Sql => self.execute_sql(input),
723        }
724    }
725
726    /// Read-only variant of [`Engine::execute_with_dialect`].
727    pub fn execute_readonly_with_dialect(
728        &self,
729        dialect: QueryDialect,
730        input: &str,
731    ) -> Result<QueryResult, QueryError> {
732        match dialect {
733            QueryDialect::PowQL => self.execute_powql_readonly(input),
734            QueryDialect::Sql => self.execute_sql_readonly(input),
735        }
736    }
737
738    /// Parse + plan + execute a PowQL query.
739    ///
740    /// # Examples
741    ///
742    /// ```
743    /// use powdb_query::executor::Engine;
744    /// use powdb_query::result::QueryResult;
745    ///
746    /// let dir = tempfile::tempdir().unwrap();
747    /// let mut engine = Engine::new(dir.path()).unwrap();
748    ///
749    /// // Create a table and insert a row.
750    /// engine.execute_powql("type User { required name: str, age: int }").unwrap();
751    /// engine.execute_powql(r#"insert User { name := "Alice", age := 30 }"#).unwrap();
752    ///
753    /// // Query rows back.
754    /// let result = engine.execute_powql("User").unwrap();
755    /// assert_eq!(result.row_count(), 1);
756    /// ```
757    ///
758    /// Mission D6 — tracing collapse: the previous implementation ran 4
759    /// `Instant::now()` + 3 `elapsed().as_micros()` calls + formatted an
760    /// `info!` span on every query, even when tracing was disabled. On a
761    /// sub-microsecond `point_lookup_indexed` call that overhead was
762    /// 100-200ns — 20%+ of the whole query. We now measure time only when
763    /// INFO is actually enabled via `tracing::enabled!`, and we moved the
764    /// noisy `debug!(?plan)` line behind the same gate so the Debug
765    /// formatter can't run unconditionally either.
766    ///
767    /// Mission D9 — plan cache: on the hot path we canonicalise the query
768    /// text (lex + FNV-1a hash with literal values stripped), check the
769    /// cache, and on a hit substitute the new literals into a clone of the
770    /// cached plan. This skips re-lexing, re-parsing, and re-planning —
771    /// around 3μs per call on bench workloads. On a miss we plan as before
772    /// and insert the plan under its canonical hash.
773    pub fn execute_powql(&mut self, input: &str) -> Result<QueryResult, QueryError> {
774        if self.read_only {
775            // Snapshot-serving mode: run reads through the read-only executor and
776            // turn the "this statement writes" sentinel into the terminal
777            // ReadonlyMode error. No mutation ever reaches disk.
778            return to_readonly_terminal(self.execute_powql_readonly(input));
779        }
780        // WS2: each *outermost* statement starts with the full memory
781        // allowance. The guard holds the reentrancy depth so a nested
782        // `execute_powql` (e.g. a view's source query) does not reset the
783        // outer frame's accounting mid-statement.
784        let _budget = self.enter_memory_budget();
785        // A token may be cancelled before execution starts (for example, EOF
786        // detected while this job was waiting for the engine lock). Check once
787        // at the statement boundary so even point operations with no long loop
788        // honor cancellation before they can mutate state.
789        crate::cancel::check()?;
790        // Hot path: tracing disabled. Zero syscalls, zero formatting.
791        if !tracing::enabled!(Level::INFO) {
792            // D9: try the plan cache first. Canonicalisation lexes the
793            // query once; on a hit we skip the parser and planner entirely.
794            if let Ok((hash, literals)) = canonicalize(input) {
795                let cached = self
796                    .plan_cache
797                    .lock()
798                    .map_err(|e| QueryError::Execution(format!("plan cache lock poisoned: {e}")))?
799                    .get_with_substitution(hash, &literals);
800                if let Some(plan) = cached {
801                    let plan = lower_unindexed_scans(&self.catalog, &plan);
802                    let result = self.execute_plan(&plan);
803                    // Mission B (post-review): statement-boundary WAL
804                    // group commit. Catalog::wal_log now only appends;
805                    // the fsync happens here exactly once per statement.
806                    // `sync_wal` is a no-op when nothing was buffered
807                    // (pure reads pay zero fsync).
808                    if !self.in_transaction {
809                        self.catalog
810                            .commit_autocommit()
811                            .map_err(|e| QueryError::StorageError(e.to_string()))?;
812                    }
813                    return result;
814                }
815                // Miss — plan, insert, execute.
816                return match planner::plan(input) {
817                    Ok(plan) => {
818                        self.plan_cache
819                            .lock()
820                            .map_err(|e| {
821                                QueryError::Execution(format!("plan cache lock poisoned: {e}"))
822                            })?
823                            .insert(hash, plan.clone(), literals.len());
824                        let plan = lower_unindexed_scans(&self.catalog, &plan);
825                        let result = self.execute_plan(&plan);
826                        if !self.in_transaction {
827                            self.catalog
828                                .commit_autocommit()
829                                .map_err(|e| QueryError::StorageError(e.to_string()))?;
830                        }
831                        result
832                    }
833                    Err(e) => Err(QueryError::Parse(e.to_string())),
834                };
835            }
836            // Lex error — fall through to the planner so the caller gets a
837            // consistent error shape.
838            return match planner::plan(input) {
839                Ok(plan) => {
840                    let plan = lower_unindexed_scans(&self.catalog, &plan);
841                    let result = self.execute_plan(&plan);
842                    if !self.in_transaction {
843                        self.catalog
844                            .commit_autocommit()
845                            .map_err(|e| QueryError::StorageError(e.to_string()))?;
846                    }
847                    result
848                }
849                Err(e) => Err(QueryError::Parse(e.to_string())),
850            };
851        }
852
853        // Instrumented path — only taken under explicit tracing subscribers.
854        let total_start = Instant::now();
855        let plan_start = Instant::now();
856        let plan = planner::plan(input).map_err(|e| {
857            let msg = e.to_string();
858            error!(query = %input, error = %msg, "query plan failed");
859            QueryError::Parse(msg)
860        })?;
861        let plan_us = plan_start.elapsed().as_micros();
862
863        let exec_start = Instant::now();
864        let plan = lower_unindexed_scans(&self.catalog, &plan);
865        let result = self.execute_plan(&plan);
866        if !self.in_transaction {
867            self.catalog
868                .commit_autocommit()
869                .map_err(|e| QueryError::StorageError(e.to_string()))?;
870        }
871        let exec_us = exec_start.elapsed().as_micros();
872
873        let total_us = total_start.elapsed().as_micros();
874        match &result {
875            Ok(r) => {
876                info!(
877                    query = %input,
878                    plan_us = plan_us,
879                    exec_us = exec_us,
880                    total_us = total_us,
881                    rows = r.row_count(),
882                    "query ok"
883                );
884            }
885            Err(e) => {
886                error!(
887                    query = %input,
888                    plan_us = plan_us,
889                    exec_us = exec_us,
890                    error = %e,
891                    "query failed"
892                );
893            }
894        }
895        result
896    }
897
898    /// Parse + plan + execute a SQL query through the SQL frontend.
899    ///
900    /// SQL is lowered to the existing PowDB AST and to canonical PowQL text.
901    /// The canonical PowQL text is used as the plan-cache key, so equivalent
902    /// SQL and PowQL spellings share cached plans.
903    pub fn execute_sql(&mut self, input: &str) -> Result<QueryResult, QueryError> {
904        if self.read_only {
905            return to_readonly_terminal(self.execute_sql_readonly(input));
906        }
907        let _budget = self.enter_memory_budget();
908        crate::cancel::check()?;
909        let parsed = crate::sql::parse_sql_with_canonical(input)
910            .map_err(|e| QueryError::Parse(e.to_string()))?;
911
912        if !tracing::enabled!(Level::INFO) {
913            if let Ok((hash, literals)) = canonicalize(&parsed.canonical_powql) {
914                let hash = if crate::sql::statement_has_aggregate(&parsed.statement) {
915                    sql_raw_cache_hash(hash)
916                } else {
917                    hash
918                };
919                let cached = self
920                    .plan_cache
921                    .lock()
922                    .map_err(|e| QueryError::Execution(format!("plan cache lock poisoned: {e}")))?
923                    .get_with_substitution(hash, &literals);
924                if let Some(plan) = cached {
925                    let plan = lower_unindexed_scans(&self.catalog, &plan);
926                    let result = self.execute_plan(&plan);
927                    if !self.in_transaction {
928                        self.catalog
929                            .commit_autocommit()
930                            .map_err(|e| QueryError::StorageError(e.to_string()))?;
931                    }
932                    return result;
933                }
934
935                let plan = crate::planner::plan_statement(parsed.statement)
936                    .map_err(|e| QueryError::Parse(e.to_string()))?;
937                self.plan_cache
938                    .lock()
939                    .map_err(|e| QueryError::Execution(format!("plan cache lock poisoned: {e}")))?
940                    .insert(hash, plan.clone(), literals.len());
941                let plan = lower_unindexed_scans(&self.catalog, &plan);
942                let result = self.execute_plan(&plan);
943                if !self.in_transaction {
944                    self.catalog
945                        .commit_autocommit()
946                        .map_err(|e| QueryError::StorageError(e.to_string()))?;
947                }
948                return result;
949            }
950        }
951
952        let plan = crate::planner::plan_statement(parsed.statement)
953            .map_err(|e| QueryError::Parse(e.to_string()))?;
954        let plan = lower_unindexed_scans(&self.catalog, &plan);
955        let result = self.execute_plan(&plan);
956        if !self.in_transaction {
957            self.catalog
958                .commit_autocommit()
959                .map_err(|e| QueryError::StorageError(e.to_string()))?;
960        }
961        result
962    }
963
964    /// Read-only variant of [`Engine::execute_sql`].
965    pub fn execute_sql_readonly(&self, input: &str) -> Result<QueryResult, QueryError> {
966        let _budget = self.enter_memory_budget();
967        crate::cancel::check()?;
968        let parsed = crate::sql::parse_sql_with_canonical(input)
969            .map_err(|e| QueryError::Parse(e.to_string()))?;
970        if !is_read_only_statement(&parsed.statement) {
971            return Err(QueryError::ReadonlyNeedsWrite);
972        }
973
974        if let Ok((hash, literals)) = canonicalize(&parsed.canonical_powql) {
975            let hash = if crate::sql::statement_has_aggregate(&parsed.statement) {
976                sql_raw_cache_hash(hash)
977            } else {
978                hash
979            };
980            let cached = self
981                .plan_cache
982                .lock()
983                .map_err(|e| QueryError::Execution(format!("plan cache lock poisoned: {e}")))?
984                .get_with_substitution(hash, &literals);
985            if let Some(plan) = cached {
986                let plan = lower_unindexed_scans(&self.catalog, &plan);
987                return self.execute_plan_readonly(&plan);
988            }
989            let plan = crate::planner::plan_statement(parsed.statement)
990                .map_err(|e| QueryError::Parse(e.to_string()))?;
991            self.plan_cache
992                .lock()
993                .map_err(|e| QueryError::Execution(format!("plan cache lock poisoned: {e}")))?
994                .insert(hash, plan.clone(), literals.len());
995            let plan = lower_unindexed_scans(&self.catalog, &plan);
996            return self.execute_plan_readonly(&plan);
997        }
998
999        let plan = crate::planner::plan_statement(parsed.statement)
1000            .map_err(|e| QueryError::Parse(e.to_string()))?;
1001        let plan = lower_unindexed_scans(&self.catalog, &plan);
1002        self.execute_plan_readonly(&plan)
1003    }
1004
1005    /// Execute PowQL with `$N` placeholders bound to positional `params`.
1006    ///
1007    /// Task 4: parameters are substituted as literal *tokens* before
1008    /// parsing (see [`crate::parser::parse_with_params`]), so untrusted
1009    /// input can never change the query's shape. This path deliberately
1010    /// **bypasses the plan cache** — template caching is a follow-up — and
1011    /// otherwise mirrors the non-cached tail of [`Engine::execute_powql`].
1012    pub fn execute_powql_with_params(
1013        &mut self,
1014        input: &str,
1015        params: &[crate::ast::ParamValue],
1016    ) -> Result<QueryResult, QueryError> {
1017        if self.read_only {
1018            return to_readonly_terminal(self.execute_powql_readonly_with_params(input, params));
1019        }
1020        let _budget = self.enter_memory_budget();
1021        crate::cancel::check()?;
1022        let stmt = crate::parser::parse_with_params(input, params)
1023            .map_err(|e| QueryError::Parse(e.to_string()))?;
1024        let plan =
1025            crate::planner::plan_statement(stmt).map_err(|e| QueryError::Parse(e.to_string()))?;
1026        let plan = lower_unindexed_scans(&self.catalog, &plan);
1027        let result = self.execute_plan(&plan);
1028        if !self.in_transaction {
1029            self.catalog
1030                .commit_autocommit()
1031                .map_err(|e| QueryError::StorageError(e.to_string()))?;
1032        }
1033        result
1034    }
1035
1036    /// Read-only variant of [`Engine::execute_powql_with_params`].
1037    ///
1038    /// Mirrors [`Engine::execute_powql_readonly`]: parses with bound
1039    /// params, rejects any write statement with
1040    /// [`QueryError::ReadonlyNeedsWrite`] so the caller can escalate to the
1041    /// write lock, then executes under a shared borrow. No plan-cache
1042    /// interaction.
1043    pub fn execute_powql_readonly_with_params(
1044        &self,
1045        input: &str,
1046        params: &[crate::ast::ParamValue],
1047    ) -> Result<QueryResult, QueryError> {
1048        let _budget = self.enter_memory_budget();
1049        crate::cancel::check()?;
1050        let stmt = crate::parser::parse_with_params(input, params)
1051            .map_err(|e| QueryError::Parse(e.to_string()))?;
1052        if !is_read_only_statement(&stmt) {
1053            return Err(QueryError::ReadonlyNeedsWrite);
1054        }
1055        let plan =
1056            crate::planner::plan_statement(stmt).map_err(|e| QueryError::Parse(e.to_string()))?;
1057        let plan = lower_unindexed_scans(&self.catalog, &plan);
1058        self.execute_plan_readonly(&plan)
1059    }
1060
1061    /// Cancellation-aware variant of [`Engine::execute_powql`]. Installs
1062    /// `cancel` as the current thread's cancellation token for the duration of
1063    /// the statement, so cancellable read and mutation-target discovery loops
1064    /// poll it. Mutation application checks once before its first write, then
1065    /// finishes without polling because the engine has no statement savepoint
1066    /// with which to undo a written prefix. The base methods also honor an
1067    /// already installed token; a caller with no token (embedded/direct use)
1068    /// never cancels.
1069    pub fn execute_powql_with_cancel(
1070        &mut self,
1071        input: &str,
1072        cancel: Arc<crate::cancel::ExecCancel>,
1073    ) -> Result<QueryResult, QueryError> {
1074        let _cancel_guard = crate::cancel::install(cancel);
1075        self.execute_powql(input)
1076    }
1077
1078    /// Cancellation-aware variant of [`Engine::execute_sql`].
1079    pub fn execute_sql_with_cancel(
1080        &mut self,
1081        input: &str,
1082        cancel: Arc<crate::cancel::ExecCancel>,
1083    ) -> Result<QueryResult, QueryError> {
1084        let _cancel_guard = crate::cancel::install(cancel);
1085        self.execute_sql(input)
1086    }
1087
1088    /// Cancellation-aware variant of [`Engine::execute_powql_readonly`].
1089    pub fn execute_powql_readonly_with_cancel(
1090        &self,
1091        input: &str,
1092        cancel: Arc<crate::cancel::ExecCancel>,
1093    ) -> Result<QueryResult, QueryError> {
1094        let _cancel_guard = crate::cancel::install(cancel);
1095        self.execute_powql_readonly(input)
1096    }
1097
1098    /// Cancellation-aware variant of [`Engine::execute_sql_readonly`].
1099    pub fn execute_sql_readonly_with_cancel(
1100        &self,
1101        input: &str,
1102        cancel: Arc<crate::cancel::ExecCancel>,
1103    ) -> Result<QueryResult, QueryError> {
1104        let _cancel_guard = crate::cancel::install(cancel);
1105        self.execute_sql_readonly(input)
1106    }
1107
1108    /// Cancellation-aware variant of [`Engine::execute_powql_with_params`].
1109    pub fn execute_powql_with_params_and_cancel(
1110        &mut self,
1111        input: &str,
1112        params: &[crate::ast::ParamValue],
1113        cancel: Arc<crate::cancel::ExecCancel>,
1114    ) -> Result<QueryResult, QueryError> {
1115        let _cancel_guard = crate::cancel::install(cancel);
1116        self.execute_powql_with_params(input, params)
1117    }
1118
1119    /// Cancellation-aware variant of [`Engine::execute_powql_readonly_with_params`].
1120    pub fn execute_powql_readonly_with_params_and_cancel(
1121        &self,
1122        input: &str,
1123        params: &[crate::ast::ParamValue],
1124        cancel: Arc<crate::cancel::ExecCancel>,
1125    ) -> Result<QueryResult, QueryError> {
1126        let _cancel_guard = crate::cancel::install(cancel);
1127        self.execute_powql_readonly_with_params(input, params)
1128    }
1129
1130    /// Plan cache stats — useful for benches and debugging.
1131    pub fn plan_cache_stats(&self) -> (u64, u64, usize) {
1132        let cache = self.plan_cache.lock().unwrap_or_else(|e| e.into_inner());
1133        (cache.hits, cache.misses, cache.len())
1134    }
1135
1136    /// Mission infra-1: read-only entry point.
1137    ///
1138    /// Parses + plans + executes a PowQL query using only a shared borrow
1139    /// on the engine. Rejects any statement that would mutate state
1140    /// (Insert/Update/Delete/CreateTable/AlterTable/DropTable/CreateView/
1141    /// RefreshView/DropView) by returning [`READONLY_NEEDS_WRITE`] so the
1142    /// caller can escalate to the write lock.
1143    ///
1144    /// Also returns [`READONLY_NEEDS_WRITE`] if a materialized view in the
1145    /// query is dirty — refreshing one requires `&mut self`, so the caller
1146    /// must retake the write lock for the first refresh.
1147    ///
1148    /// This method is the concurrent-read fast path behind
1149    /// `Arc<RwLock<Engine>>`: multiple threads can call it simultaneously
1150    /// under a shared `.read()` lock and each will scan independently.
1151    pub fn execute_powql_readonly(&self, input: &str) -> Result<QueryResult, QueryError> {
1152        // WS2: each *outermost* statement starts with the full memory
1153        // allowance. The guard holds the reentrancy depth so a nested
1154        // `execute_powql*` does not reset the outer frame's accounting.
1155        let _budget = self.enter_memory_budget();
1156        crate::cancel::check()?;
1157        // Parse the statement first so we can classify read vs. write
1158        // without touching the catalog. This is the same lex+parse cost
1159        // the hot path would pay anyway.
1160        let stmt = crate::parser::parse(input).map_err(|e| QueryError::Parse(e.to_string()))?;
1161        if !is_read_only_statement(&stmt) {
1162            return Err(QueryError::ReadonlyNeedsWrite);
1163        }
1164
1165        // Try the plan cache first — identical hash scheme to
1166        // `execute_powql` so both paths share cache state. The mutex
1167        // section is just a hashmap lookup + plan clone.
1168        if let Ok((hash, literals)) = canonicalize(input) {
1169            let cached = self
1170                .plan_cache
1171                .lock()
1172                .map_err(|e| QueryError::Execution(format!("plan cache lock poisoned: {e}")))?
1173                .get_with_substitution(hash, &literals);
1174            if let Some(plan) = cached {
1175                let plan = lower_unindexed_scans(&self.catalog, &plan);
1176                return self.execute_plan_readonly(&plan);
1177            }
1178            // Miss: plan + insert + execute. The planner is pure, so this
1179            // is safe from `&self`.
1180            let plan = crate::planner::plan_statement(stmt)
1181                .map_err(|e| QueryError::Parse(e.to_string()))?;
1182            self.plan_cache
1183                .lock()
1184                .map_err(|e| QueryError::Execution(format!("plan cache lock poisoned: {e}")))?
1185                .insert(hash, plan.clone(), literals.len());
1186            let plan = lower_unindexed_scans(&self.catalog, &plan);
1187            return self.execute_plan_readonly(&plan);
1188        }
1189        // Lex error — fall through to the planner for a consistent error
1190        // shape (though `parse` above would usually have caught it).
1191        let plan =
1192            crate::planner::plan_statement(stmt).map_err(|e| QueryError::Parse(e.to_string()))?;
1193        let plan = lower_unindexed_scans(&self.catalog, &plan);
1194        self.execute_plan_readonly(&plan)
1195    }
1196
1197    /// Read-only version of [`Engine::execute_plan`]. Dispatches the
1198    /// read-path plan variants by calling `&self` helpers and errors with
1199    /// [`READONLY_NEEDS_WRITE`] on any write variant. This is the
1200    /// recursion target for composite read plans under the RwLock reader.
1201    ///
1202    /// The dispatch mirrors `execute_plan` for the read branches but does
1203    /// not carry any of the fast-paths that need `&mut self` (e.g. plan-
1204    /// cache mutation on inner subqueries is handled via the shared mutex
1205    /// in [`Engine::execute_powql_readonly`]; in-flight subquery
1206    /// materialisation uses [`Engine::materialize_subqueries_readonly`]).
1207    fn execute_plan_readonly(&self, plan: &PlanNode) -> Result<QueryResult, QueryError> {
1208        // Detect every dirty materialized-view source before executing any
1209        // branch of the plan. Without this preflight, a join could fully scan
1210        // its clean left input before discovering a dirty right input, then
1211        // repeat that work after the server upgrades to writer admission.
1212        // Alias scans also need this centralized check: they do not pass
1213        // through the SeqScan arm below.
1214        if plan_reads_dirty_view(plan, &self.view_registry) {
1215            return Err(QueryError::ReadonlyNeedsWrite);
1216        }
1217        // Mirror the mutable path: reject a stray aggregate FunctionCall before
1218        // evaluating any row (see execute_plan for the rationale).
1219        validate_no_stray_aggregates(plan)?;
1220        validate_json_path_types(&self.catalog, plan)?;
1221        match plan {
1222            PlanNode::ExprIndexScan { .. }
1223            | PlanNode::ExprRangeScan { .. }
1224            | PlanNode::OrderedExprIndexScan { .. } => {
1225                if let Some(result) = self.execute_expression_index_plan(plan, None)? {
1226                    return Ok(result);
1227                }
1228                let fallback = lower_unindexed_scans(&self.catalog, plan);
1229                self.execute_plan_readonly(&fallback)
1230            }
1231            PlanNode::SeqScan { table } => {
1232                // Dirty view means we'd need to refresh it — can't do that
1233                // under `&self`. Escalate to the write path.
1234                if self.view_registry.is_dirty(table) {
1235                    return Err(QueryError::ReadonlyNeedsWrite);
1236                }
1237                let schema = self
1238                    .catalog
1239                    .schema(table)
1240                    .ok_or_else(|| QueryError::TableNotFound(table.clone()))?
1241                    .clone();
1242                let columns: Vec<String> = schema.columns.iter().map(|c| c.name.clone()).collect();
1243                // Cooperative cancellation: a full-table scan of a huge table
1244                // must stay stoppable.
1245                let mut cancel = crate::cancel::CancelCheck::new();
1246                let mut rows: Vec<Vec<Value>> = Vec::new();
1247                for (_, row) in self.catalog.scan(table).map_err(|e| e.to_string())? {
1248                    cancel.tick()?;
1249                    rows.push(row);
1250                }
1251                Ok(QueryResult::Rows { columns, rows })
1252            }
1253
1254            PlanNode::AliasScan { table, alias } => {
1255                let schema = self
1256                    .catalog
1257                    .schema(table)
1258                    .ok_or_else(|| QueryError::TableNotFound(table.clone()))?
1259                    .clone();
1260                let columns: Vec<String> = schema
1261                    .columns
1262                    .iter()
1263                    .map(|c| format!("{alias}.{}", c.name))
1264                    .collect();
1265                let mut cancel = crate::cancel::CancelCheck::new();
1266                let mut rows: Vec<Vec<Value>> = Vec::new();
1267                for (_, row) in self.catalog.scan(table).map_err(|e| e.to_string())? {
1268                    cancel.tick()?;
1269                    rows.push(row);
1270                }
1271                Ok(QueryResult::Rows { columns, rows })
1272            }
1273
1274            PlanNode::IndexScan { table, column, key } => {
1275                let schema = self
1276                    .catalog
1277                    .schema(table)
1278                    .ok_or_else(|| QueryError::TableNotFound(table.clone()))?
1279                    .clone();
1280                let columns: Vec<String> = schema.columns.iter().map(|c| c.name.clone()).collect();
1281                let key_value = literal_to_value(key)?;
1282                let tbl = self
1283                    .catalog
1284                    .get_table(table)
1285                    .ok_or_else(|| QueryError::TableNotFound(table.clone()))?;
1286
1287                if tbl.has_index(column) {
1288                    // Use index_lookup_all to handle both unique and
1289                    // non-unique indexes — returns all matching RowIds.
1290                    let rids = tbl.index_lookup_all(column, &key_value);
1291                    let mut rows: Vec<Vec<Value>> = Vec::with_capacity(rids.len());
1292                    let mut cancel = crate::cancel::CancelCheck::new();
1293                    for rid in rids {
1294                        cancel.tick()?;
1295                        // Overflow safety (P0-3/P0-4): `tbl.get` reassembles
1296                        // spilled columns (the old `heap.get` + `decode_row`
1297                        // returned Empty / wrapped a >= 64KB value).
1298                        if let Some(row) = tbl.get(rid) {
1299                            rows.push(row);
1300                        }
1301                    }
1302                    return Ok(QueryResult::Rows { columns, rows });
1303                }
1304
1305                // No index: synthetic eq predicate + compiled scan.
1306                // Overflow safety (P0-4/P1): v2-capable tables use the decoded
1307                // last-resort scan below (raw scan drops/mis-reads spilled cols).
1308                let fast = FastLayout::new(&schema);
1309                let synth_pred = Expr::BinaryOp(
1310                    Box::new(Expr::Field(column.clone())),
1311                    BinOp::Eq,
1312                    Box::new(key.clone()),
1313                );
1314                if !tbl.has_overflow_rows() {
1315                    if let Some(compiled) = compile_predicate(&synth_pred, &columns, &fast, &schema)
1316                    {
1317                        let mut rows: Vec<Vec<Value>> = Vec::with_capacity(64);
1318                        for_each_row_raw_cancellable(&self.catalog, table, |_rid, data| {
1319                            if compiled(data) {
1320                                rows.push(decode_row(&schema, data));
1321                            }
1322                        })?;
1323                        return Ok(QueryResult::Rows { columns, rows });
1324                    }
1325                }
1326
1327                // Last resort: slow eq-check.
1328                let col_idx =
1329                    schema
1330                        .column_index(column)
1331                        .ok_or_else(|| QueryError::ColumnNotFound {
1332                            table: String::new(),
1333                            column: column.clone(),
1334                        })?;
1335                let mut cancel = crate::cancel::CancelCheck::new();
1336                let mut rows: Vec<Vec<Value>> = Vec::new();
1337                for (_, row) in tbl.scan() {
1338                    cancel.tick()?;
1339                    if row[col_idx] == key_value {
1340                        rows.push(row);
1341                    }
1342                }
1343                Ok(QueryResult::Rows { columns, rows })
1344            }
1345
1346            PlanNode::RangeScan {
1347                table,
1348                column,
1349                start,
1350                end,
1351            } => {
1352                let tbl = self
1353                    .catalog
1354                    .get_table(table)
1355                    .ok_or_else(|| QueryError::TableNotFound(table.clone()))?;
1356                let columns: Vec<String> = tbl
1357                    .schema()
1358                    .columns
1359                    .iter()
1360                    .map(|c| c.name.clone())
1361                    .collect();
1362                let schema = tbl.schema().clone();
1363
1364                let start_val = match start {
1365                    Some((expr, _)) => Some(literal_to_value(expr)?),
1366                    None => None,
1367                };
1368                let end_val = match end {
1369                    Some((expr, _)) => Some(literal_to_value(expr)?),
1370                    None => None,
1371                };
1372                let start_inclusive = start.as_ref().map(|(_, inc)| *inc).unwrap_or(true);
1373                let end_inclusive = end.as_ref().map(|(_, inc)| *inc).unwrap_or(true);
1374
1375                // Range scans only use the btree fast path for unique indexes.
1376                // Non-unique indexes store composite keys that don't compare
1377                // directly against raw column values.
1378                if tbl.is_index_unique(column) == Some(true) {
1379                    if let Some(btree) = tbl.index(column) {
1380                        let hits: Vec<(Value, RowId)> = match (&start_val, &end_val) {
1381                            (Some(s), Some(e)) => btree.range(s, e).collect(),
1382                            (Some(s), None) => btree.range_from(s),
1383                            (None, Some(e)) => btree.range_to(e),
1384                            (None, None) => {
1385                                // Unbounded both sides — equivalent to seq scan.
1386                                let mut cancel = crate::cancel::CancelCheck::new();
1387                                let mut rows: Vec<Vec<Value>> = Vec::new();
1388                                for (_, row) in tbl.scan() {
1389                                    cancel.tick()?;
1390                                    rows.push(row);
1391                                }
1392                                return Ok(QueryResult::Rows { columns, rows });
1393                            }
1394                        };
1395                        let mut rows: Vec<Vec<Value>> = Vec::with_capacity(hits.len());
1396                        let mut cancel = crate::cancel::CancelCheck::new();
1397                        for (key, rid) in hits {
1398                            cancel.tick()?;
1399                            // Filter for exclusive bounds.
1400                            if !start_inclusive {
1401                                if let Some(ref s) = start_val {
1402                                    if &key == s {
1403                                        continue;
1404                                    }
1405                                }
1406                            }
1407                            if !end_inclusive {
1408                                if let Some(ref e) = end_val {
1409                                    if &key == e {
1410                                        continue;
1411                                    }
1412                                }
1413                            }
1414                            // Overflow safety (P0-3): reassemble spilled cols.
1415                            if let Some(row) = tbl.get(rid) {
1416                                rows.push(row);
1417                            }
1418                        }
1419                        return Ok(QueryResult::Rows { columns, rows });
1420                    }
1421                }
1422
1423                // Fallback: no index — synthesize the range predicate and scan.
1424                // Overflow safety (P0-4): v2-capable tables use the decoded
1425                // last-resort scan below.
1426                let fast = FastLayout::new(&schema);
1427                let synth = synthesize_range_predicate(column, start, end);
1428                if !tbl.has_overflow_rows() {
1429                    if let Some(compiled) = compile_predicate(&synth, &columns, &fast, &schema) {
1430                        let mut rows: Vec<Vec<Value>> = Vec::with_capacity(64);
1431                        for_each_row_raw_cancellable(&self.catalog, table, |_rid, data| {
1432                            if compiled(data) {
1433                                rows.push(decode_row(&schema, data));
1434                            }
1435                        })?;
1436                        return Ok(QueryResult::Rows { columns, rows });
1437                    }
1438                }
1439
1440                // Last resort: decoded row eval.
1441                let col_idx =
1442                    schema
1443                        .column_index(column)
1444                        .ok_or_else(|| QueryError::ColumnNotFound {
1445                            table: String::new(),
1446                            column: column.clone(),
1447                        })?;
1448                let mut cancel = crate::cancel::CancelCheck::new();
1449                let mut rows: Vec<Vec<Value>> = Vec::new();
1450                for (_, row) in tbl.scan() {
1451                    cancel.tick()?;
1452                    if range_matches(
1453                        &row[col_idx],
1454                        &start_val,
1455                        start_inclusive,
1456                        &end_val,
1457                        end_inclusive,
1458                    ) {
1459                        rows.push(row);
1460                    }
1461                }
1462                Ok(QueryResult::Rows { columns, rows })
1463            }
1464
1465            PlanNode::Filter { input, predicate } => {
1466                // Materialise subqueries using the `&self` variant.
1467                // Uncorrelated subqueries are replaced with InList/Bool;
1468                // correlated ones are left as InSubquery/ExistsSubquery
1469                // for per-row materialisation below.
1470                let materialized;
1471                let predicate = if contains_subquery(predicate) {
1472                    materialized = self.materialize_subqueries_readonly(predicate)?;
1473                    &materialized
1474                } else {
1475                    predicate
1476                };
1477
1478                // Correlated subquery path: per-row materialisation.
1479                if contains_subquery(predicate) {
1480                    let result = self.execute_plan_readonly(input)?;
1481                    return match result {
1482                        QueryResult::Rows { columns, rows } => {
1483                            let mut filtered = Vec::new();
1484                            // Cooperative cancellation: this runs a subquery per
1485                            // outer row, so a large outer scan must stay stoppable.
1486                            let mut cancel = crate::cancel::CancelCheck::new();
1487                            for row in rows {
1488                                cancel.tick()?;
1489                                let row_pred = self.materialize_correlated_for_row_readonly(
1490                                    predicate, &row, &columns,
1491                                )?;
1492                                if eval_predicate(&row_pred, &row, &columns) {
1493                                    filtered.push(row);
1494                                }
1495                            }
1496                            Ok(QueryResult::Rows {
1497                                columns,
1498                                rows: filtered,
1499                            })
1500                        }
1501                        _ => Err("filter requires row input".into()),
1502                    };
1503                }
1504
1505                // Lane A fast path: Filter over an equality-driven index scan
1506                // (mirrors the mutable path). Pure `&self`, so it is shared.
1507                if matches!(
1508                    input.as_ref(),
1509                    PlanNode::IndexScan { .. } | PlanNode::ExprIndexScan { .. }
1510                ) {
1511                    if let Some(result) = self.try_filter_index_residual_fast(input, predicate)? {
1512                        return Ok(result);
1513                    }
1514                }
1515
1516                // Fused Filter+SeqScan fast path.
1517                // Overflow safety (P0-4/P1): v2-capable tables fall through to
1518                // the decoded general path below.
1519                if let PlanNode::SeqScan { table } = input.as_ref() {
1520                    if !self.catalog.table_has_overflow(table) {
1521                        if self.view_registry.is_dirty(table) {
1522                            return Err(QueryError::ReadonlyNeedsWrite);
1523                        }
1524                        let schema = self
1525                            .catalog
1526                            .schema(table)
1527                            .ok_or_else(|| QueryError::TableNotFound(table.clone()))?
1528                            .clone();
1529                        let columns: Vec<String> =
1530                            schema.columns.iter().map(|c| c.name.clone()).collect();
1531                        let fast = FastLayout::new(&schema);
1532                        let row_layout = RowLayout::new(&schema);
1533                        let mut rows: Vec<Vec<Value>> = Vec::with_capacity(64);
1534
1535                        // Cooperative cancellation: full-table compiled/selective
1536                        // predicate scan must stay stoppable (see the write-path
1537                        // Filter fast path for the same pattern).
1538                        let mut cancel = crate::cancel::CancelCheck::new();
1539                        let mut cancel_err: Option<QueryError> = None;
1540                        if let Some(compiled) =
1541                            compile_predicate(predicate, &columns, &fast, &schema)
1542                        {
1543                            self.catalog
1544                                .try_for_each_row_raw(table, |_rid, data| {
1545                                    if let Err(e) = cancel.tick() {
1546                                        cancel_err = Some(e);
1547                                        return std::ops::ControlFlow::Break(());
1548                                    }
1549                                    if compiled(data) {
1550                                        rows.push(decode_row(&schema, data));
1551                                    }
1552                                    std::ops::ControlFlow::Continue(())
1553                                })
1554                                .map_err(|e| QueryError::StorageError(e.to_string()))?;
1555                        } else {
1556                            let pred_cols = predicate_column_indices_json(predicate, &columns);
1557                            self.catalog
1558                                .try_for_each_row_raw(table, |_rid, data| {
1559                                    if let Err(e) = cancel.tick() {
1560                                        cancel_err = Some(e);
1561                                        return std::ops::ControlFlow::Break(());
1562                                    }
1563                                    let pred_row =
1564                                        decode_selective(&schema, &row_layout, data, &pred_cols);
1565                                    if eval_predicate(predicate, &pred_row, &columns) {
1566                                        rows.push(decode_row(&schema, data));
1567                                    }
1568                                    std::ops::ControlFlow::Continue(())
1569                                })
1570                                .map_err(|e| QueryError::StorageError(e.to_string()))?;
1571                        }
1572                        if let Some(e) = cancel_err {
1573                            return Err(e);
1574                        }
1575
1576                        return Ok(QueryResult::Rows { columns, rows });
1577                    }
1578                }
1579
1580                // General path.
1581                let result = self.execute_plan_readonly(input)?;
1582                match result {
1583                    QueryResult::Rows { columns, rows } => {
1584                        let mut cancel = crate::cancel::CancelCheck::new();
1585                        let mut filtered: Vec<Vec<Value>> = Vec::new();
1586                        for row in rows {
1587                            cancel.tick()?;
1588                            if eval_predicate(predicate, &row, &columns) {
1589                                filtered.push(row);
1590                            }
1591                        }
1592                        Ok(QueryResult::Rows {
1593                            columns,
1594                            rows: filtered,
1595                        })
1596                    }
1597                    _ => Err("filter requires row input".into()),
1598                }
1599            }
1600
1601            PlanNode::Project { input, fields } => {
1602                if matches!(
1603                    input.as_ref(),
1604                    PlanNode::ExprIndexScan { .. }
1605                        | PlanNode::ExprRangeScan { .. }
1606                        | PlanNode::OrderedExprIndexScan { .. }
1607                ) {
1608                    if let Some(result) = self.execute_expression_index_plan(input, Some(fields))? {
1609                        return Ok(result);
1610                    }
1611                }
1612                // Fast path: Project over IndexScan. Avoids full-row decode
1613                // by calling decode_column only for projected fields.
1614                if let PlanNode::IndexScan { table, column, key } = input.as_ref() {
1615                    let key_value = literal_to_value(key)?;
1616                    let tbl = self
1617                        .catalog
1618                        .get_table(table)
1619                        .ok_or_else(|| QueryError::TableNotFound(table.clone()))?;
1620                    let schema = tbl.schema();
1621
1622                    let proj_columns: Vec<String> = fields
1623                        .iter()
1624                        .map(|f| {
1625                            f.alias.clone().unwrap_or_else(|| match &f.expr {
1626                                Expr::Field(name) => name.clone(),
1627                                _ => "?".into(),
1628                            })
1629                        })
1630                        .collect();
1631
1632                    let proj_indices: Vec<usize> = fields
1633                        .iter()
1634                        .filter_map(|f| {
1635                            if let Expr::Field(name) = &f.expr {
1636                                schema.column_index(name)
1637                            } else {
1638                                None
1639                            }
1640                        })
1641                        .collect();
1642
1643                    // Plain-field projections only; a computed projection
1644                    // (e.g. `length(.v)`) falls through to the generic
1645                    // expression-evaluating path (its column is otherwise
1646                    // dropped — proj_indices only collects Fields).
1647                    let all_plain_fields = fields.iter().all(|f| matches!(f.expr, Expr::Field(_)));
1648                    if tbl.has_index(column) && all_plain_fields {
1649                        let rids = tbl.index_lookup_all(column, &key_value);
1650                        let mut rows: Vec<Vec<Value>> = Vec::with_capacity(rids.len());
1651                        let mut cancel = crate::cancel::CancelCheck::new();
1652                        for rid in rids {
1653                            cancel.tick()?;
1654                            // Overflow safety (P0-3/P0-4): reassemble via
1655                            // `tbl.get` so spilled projected columns return
1656                            // their value, not Empty / a wrapped >= 64KB blob.
1657                            if let Some(full) = tbl.get(rid) {
1658                                let row: Vec<Value> =
1659                                    proj_indices.iter().map(|&ci| full[ci].clone()).collect();
1660                                rows.push(row);
1661                            }
1662                        }
1663                        return Ok(QueryResult::Rows {
1664                            columns: proj_columns,
1665                            rows,
1666                        });
1667                    }
1668                }
1669
1670                // Fast paths over Limit(Sort(...)) / Limit(Filter(...)) / Limit(SeqScan).
1671                if let PlanNode::Limit {
1672                    input: inner,
1673                    count: limit_expr,
1674                } = input.as_ref()
1675                {
1676                    if let PlanNode::Sort {
1677                        input: sort_input,
1678                        keys,
1679                    } = inner.as_ref()
1680                    {
1681                        if keys.len() == 1 {
1682                            if let Expr::Field(sort_field) = &keys[0].expr {
1683                                let descending = keys[0].descending;
1684                                let limit = match limit_expr {
1685                                    Expr::Literal(Literal::Int(v)) if *v >= 0 => *v as usize,
1686                                    _ => usize::MAX,
1687                                };
1688                                let (table_opt, pred_opt): (Option<&str>, Option<&Expr>) =
1689                                    match sort_input.as_ref() {
1690                                        PlanNode::SeqScan { table } => (Some(table.as_str()), None),
1691                                        PlanNode::Filter {
1692                                            input: fi,
1693                                            predicate,
1694                                        } => {
1695                                            if let PlanNode::SeqScan { table } = fi.as_ref() {
1696                                                (Some(table.as_str()), Some(predicate))
1697                                            } else {
1698                                                (None, None)
1699                                            }
1700                                        }
1701                                        _ => (None, None),
1702                                    };
1703                                if let Some(table) = table_opt {
1704                                    if let Some(result) = self.project_filter_sort_limit_fast(
1705                                        table, fields, sort_field, descending, limit, pred_opt,
1706                                    )? {
1707                                        return Ok(result);
1708                                    }
1709                                }
1710                            }
1711                        }
1712                    }
1713                    if let PlanNode::Filter {
1714                        input: fi,
1715                        predicate,
1716                    } = inner.as_ref()
1717                    {
1718                        if let PlanNode::SeqScan { table } = fi.as_ref() {
1719                            let limit = match limit_expr {
1720                                Expr::Literal(Literal::Int(v)) if *v >= 0 => *v as usize,
1721                                _ => usize::MAX,
1722                            };
1723                            if let Some(result) = self.project_filter_limit_fast(
1724                                table,
1725                                fields,
1726                                limit,
1727                                Some(predicate),
1728                            )? {
1729                                return Ok(result);
1730                            }
1731                        }
1732                    }
1733                    if let PlanNode::SeqScan { table } = inner.as_ref() {
1734                        let limit = match limit_expr {
1735                            Expr::Literal(Literal::Int(v)) if *v >= 0 => *v as usize,
1736                            _ => usize::MAX,
1737                        };
1738                        if let Some(result) =
1739                            self.project_filter_limit_fast(table, fields, limit, None)?
1740                        {
1741                            return Ok(result);
1742                        }
1743                    }
1744                }
1745
1746                // Project(Filter(SeqScan)) without Limit.
1747                if let PlanNode::Filter {
1748                    input: fi,
1749                    predicate,
1750                } = input.as_ref()
1751                {
1752                    if let PlanNode::SeqScan { table } = fi.as_ref() {
1753                        if let Some(result) = self.project_filter_limit_fast(
1754                            table,
1755                            fields,
1756                            usize::MAX,
1757                            Some(predicate),
1758                        )? {
1759                            return Ok(result);
1760                        }
1761                    }
1762                }
1763
1764                // Project(SeqScan) without Filter or Limit.
1765                if let PlanNode::SeqScan { table } = input.as_ref() {
1766                    if let Some(result) =
1767                        self.project_filter_limit_fast(table, fields, usize::MAX, None)?
1768                    {
1769                        return Ok(result);
1770                    }
1771                }
1772
1773                // Generic path.
1774                let result = self.execute_plan_readonly(input)?;
1775                match result {
1776                    QueryResult::Rows { columns, rows } => {
1777                        let proj_columns: Vec<String> = fields
1778                            .iter()
1779                            .map(|f| {
1780                                f.alias.clone().unwrap_or_else(|| match &f.expr {
1781                                    Expr::Field(name) => name.clone(),
1782                                    Expr::QualifiedField { qualifier, field } => {
1783                                        format!("{qualifier}.{field}")
1784                                    }
1785                                    _ => "?".into(),
1786                                })
1787                            })
1788                            .collect();
1789                        let mut cancel = crate::cancel::CancelCheck::new();
1790                        let mut proj_rows: Vec<Vec<Value>> = Vec::with_capacity(rows.len());
1791                        for row in &rows {
1792                            cancel.tick()?;
1793                            proj_rows.push(
1794                                fields
1795                                    .iter()
1796                                    .map(|f| eval_expr(&f.expr, row, &columns))
1797                                    .collect(),
1798                            );
1799                        }
1800                        Ok(QueryResult::Rows {
1801                            columns: proj_columns,
1802                            rows: proj_rows,
1803                        })
1804                    }
1805                    _ => Err("project requires row input".into()),
1806                }
1807            }
1808
1809            PlanNode::Sort { input, keys } => {
1810                let result = self.execute_plan_readonly(input)?;
1811                match result {
1812                    QueryResult::Rows { columns, mut rows } => {
1813                        if rows.len() > MAX_SORT_ROWS {
1814                            return Err(QueryError::SortLimitExceeded);
1815                        }
1816                        // WS2: byte-budget guard on the sort buffer.
1817                        self.charge_rows(&rows)?;
1818                        let key_specs: Vec<(Option<usize>, &Expr, bool)> = keys
1819                            .iter()
1820                            .map(|k| {
1821                                let stored_name = match &k.expr {
1822                                    Expr::Field(name) => Some(name.clone()),
1823                                    Expr::QualifiedField { qualifier, field } => {
1824                                        Some(format!("{qualifier}.{field}"))
1825                                    }
1826                                    _ => None,
1827                                };
1828                                let index = stored_name
1829                                    .as_ref()
1830                                    .and_then(|name| columns.iter().position(|c| c == name));
1831                                if let Some(name) = stored_name {
1832                                    if index.is_none() {
1833                                        return Err(QueryError::ColumnNotFound {
1834                                            table: String::new(),
1835                                            column: name,
1836                                        });
1837                                    }
1838                                }
1839                                Ok((index, &k.expr, k.descending))
1840                            })
1841                            .collect::<Result<_, QueryError>>()?;
1842                        cooperative_stable_sort_by(&mut rows, self.query_memory_limit, |a, b| {
1843                            for &(col_idx, expr, descending) in &key_specs {
1844                                let (left_value, right_value) = match col_idx {
1845                                    Some(index) => (&a[index], &b[index]),
1846                                    None => {
1847                                        let left = eval_expr(expr, a, &columns);
1848                                        let right = eval_expr(expr, b, &columns);
1849                                        let cmp = compare_order_values(&left, &right, descending);
1850                                        if cmp != std::cmp::Ordering::Equal {
1851                                            return cmp;
1852                                        }
1853                                        continue;
1854                                    }
1855                                };
1856                                let cmp = compare_order_values(left_value, right_value, descending);
1857                                if cmp != std::cmp::Ordering::Equal {
1858                                    return cmp;
1859                                }
1860                            }
1861                            std::cmp::Ordering::Equal
1862                        })?;
1863                        Ok(QueryResult::Rows { columns, rows })
1864                    }
1865                    _ => Err("sort requires row input".into()),
1866                }
1867            }
1868
1869            PlanNode::Limit { input, count } => {
1870                let result = self.execute_plan_readonly(input)?;
1871                let n = match count {
1872                    Expr::Literal(Literal::Int(v)) => *v as usize,
1873                    _ => return Err("limit must be integer literal".into()),
1874                };
1875                match result {
1876                    QueryResult::Rows { columns, rows } => {
1877                        let mut cancel = crate::cancel::CancelCheck::new();
1878                        let mut limited = Vec::with_capacity(n.min(rows.len()));
1879                        for row in rows.into_iter().take(n) {
1880                            cancel.tick()?;
1881                            limited.push(row);
1882                        }
1883                        Ok(QueryResult::Rows {
1884                            columns,
1885                            rows: limited,
1886                        })
1887                    }
1888                    _ => Err("limit requires row input".into()),
1889                }
1890            }
1891
1892            PlanNode::Offset { input, count } => {
1893                let result = self.execute_plan_readonly(input)?;
1894                let n = match count {
1895                    Expr::Literal(Literal::Int(v)) => *v as usize,
1896                    _ => return Err("offset must be integer literal".into()),
1897                };
1898                match result {
1899                    QueryResult::Rows { columns, rows } => {
1900                        let mut cancel = crate::cancel::CancelCheck::new();
1901                        let mut offset = Vec::with_capacity(rows.len().saturating_sub(n));
1902                        for (index, row) in rows.into_iter().enumerate() {
1903                            cancel.tick()?;
1904                            if index >= n {
1905                                offset.push(row);
1906                            }
1907                        }
1908                        Ok(QueryResult::Rows {
1909                            columns,
1910                            rows: offset,
1911                        })
1912                    }
1913                    _ => Err("offset requires row input".into()),
1914                }
1915            }
1916
1917            PlanNode::Aggregate {
1918                input,
1919                function,
1920                argument,
1921                mode: _,
1922                provenance_alias,
1923            } => {
1924                if let Some(provenance_alias) = provenance_alias {
1925                    let input = self.materialize_rows_with_provenance(input)?;
1926                    self.charge_rows(&input.rows)?;
1927                    return aggregate_rows_with_provenance(
1928                        *function,
1929                        argument.as_ref(),
1930                        &input,
1931                        provenance_alias,
1932                        self.query_memory_limit,
1933                    );
1934                }
1935                // Fast path: count() over SeqScan.
1936                // Overflow safety (P0-4): v2-capable tables use the decoded
1937                // generic path (raw count drops >= 64KB rows).
1938                if *function == AggFunc::Count {
1939                    if let PlanNode::SeqScan { table } = input.as_ref() {
1940                        if !self.catalog.table_has_overflow(table) {
1941                            // A dirty materialized view must be refreshed before
1942                            // it can be counted, which needs `&mut self`. Escalate
1943                            // to the write path (F3: count(View) returned stale).
1944                            if self.view_registry.is_dirty(table) {
1945                                return Err(QueryError::ReadonlyNeedsWrite);
1946                            }
1947                            let mut count: i64 = 0;
1948                            for_each_row_raw_cancellable(&self.catalog, table, |_rid, _data| {
1949                                count += 1;
1950                            })?;
1951                            return Ok(QueryResult::Scalar(Value::Int(count)));
1952                        }
1953                    }
1954                    if let PlanNode::Filter {
1955                        input: inner,
1956                        predicate,
1957                    } = input.as_ref()
1958                    {
1959                        // Only take the fast path for a plain Filter(SeqScan)
1960                        // with no subquery in the predicate. A subquery
1961                        // predicate (`count(T filter .x in (...))`) must be
1962                        // resolved first; the fast path evaluates the raw
1963                        // predicate with no subquery materialisation, which
1964                        // silently yields 0 (F1). Falling through routes it to
1965                        // the generic path that runs the subquery correctly.
1966                        if let PlanNode::SeqScan { table } = inner.as_ref() {
1967                            if self.view_registry.is_dirty(table) {
1968                                // F3: count(View filter ...) over a dirty view.
1969                                return Err(QueryError::ReadonlyNeedsWrite);
1970                            }
1971                        }
1972                        if let (PlanNode::SeqScan { table }, false) =
1973                            (inner.as_ref(), contains_subquery(predicate))
1974                        {
1975                            if !self.catalog.table_has_overflow(table) {
1976                                let schema = self
1977                                    .catalog
1978                                    .schema(table)
1979                                    .ok_or_else(|| QueryError::TableNotFound(table.clone()))?
1980                                    .clone();
1981                                let columns: Vec<String> =
1982                                    schema.columns.iter().map(|c| c.name.clone()).collect();
1983                                let fast = FastLayout::new(&schema);
1984                                let row_layout = RowLayout::new(&schema);
1985
1986                                if let Some(compiled) =
1987                                    compile_predicate(predicate, &columns, &fast, &schema)
1988                                {
1989                                    let mut count: i64 = 0;
1990                                    for_each_row_raw_cancellable(
1991                                        &self.catalog,
1992                                        table,
1993                                        |_rid, data| {
1994                                            if compiled(data) {
1995                                                count += 1;
1996                                            }
1997                                        },
1998                                    )?;
1999                                    return Ok(QueryResult::Scalar(Value::Int(count)));
2000                                }
2001
2002                                let pred_cols = predicate_column_indices_json(predicate, &columns);
2003                                let mut count: i64 = 0;
2004                                for_each_row_raw_cancellable(
2005                                    &self.catalog,
2006                                    table,
2007                                    |_rid, data| {
2008                                        let pred_row = decode_selective(
2009                                            &schema,
2010                                            &row_layout,
2011                                            data,
2012                                            &pred_cols,
2013                                        );
2014                                        if eval_predicate(predicate, &pred_row, &columns) {
2015                                            count += 1;
2016                                        }
2017                                    },
2018                                )?;
2019                                return Ok(QueryResult::Scalar(Value::Int(count)));
2020                            }
2021                        }
2022                    }
2023                }
2024
2025                // Fast path: sum/avg/min/max over single fixed-size numeric.
2026                if matches!(
2027                    function,
2028                    AggFunc::Sum
2029                        | AggFunc::Avg
2030                        | AggFunc::Min
2031                        | AggFunc::Max
2032                        | AggFunc::CountDistinct
2033                ) {
2034                    if let Some(Expr::Field(col)) = argument.as_ref() {
2035                        let (table_opt, pred_opt): (Option<&str>, Option<&Expr>) =
2036                            match input.as_ref() {
2037                                PlanNode::SeqScan { table } => (Some(table.as_str()), None),
2038                                PlanNode::Filter {
2039                                    input: inner,
2040                                    predicate,
2041                                } => {
2042                                    if let PlanNode::SeqScan { table } = inner.as_ref() {
2043                                        (Some(table.as_str()), Some(predicate))
2044                                    } else {
2045                                        (None, None)
2046                                    }
2047                                }
2048                                _ => (None, None),
2049                            };
2050                        if let Some(table) = table_opt {
2051                            if let Some(result) =
2052                                self.agg_single_col_fast(table, col, *function, pred_opt)?
2053                            {
2054                                return Ok(result);
2055                            }
2056                        }
2057                    }
2058                }
2059
2060                // Generic path.
2061                let result = self.execute_plan_readonly(input)?;
2062                match result {
2063                    QueryResult::Rows { columns, rows } => {
2064                        aggregate_rows(*function, argument.as_ref(), &columns, &rows)
2065                    }
2066                    _ => Err("aggregate requires row input".into()),
2067                }
2068            }
2069
2070            PlanNode::Distinct { input } => {
2071                let result = self.execute_plan_readonly(input)?;
2072                match result {
2073                    QueryResult::Rows { columns, rows } => {
2074                        let mut seen = std::collections::HashSet::new();
2075                        let mut unique_rows = Vec::new();
2076                        let mut cancel = crate::cancel::CancelCheck::new();
2077                        for row in rows {
2078                            cancel.tick()?;
2079                            if seen.insert(row.clone()) {
2080                                unique_rows.push(row);
2081                            }
2082                        }
2083                        Ok(QueryResult::Rows {
2084                            columns,
2085                            rows: unique_rows,
2086                        })
2087                    }
2088                    other => Ok(other),
2089                }
2090            }
2091
2092            PlanNode::GroupBy {
2093                input,
2094                keys,
2095                aggregates,
2096                having,
2097            } => {
2098                if aggregates
2099                    .iter()
2100                    .any(|aggregate| aggregate.provenance_alias.is_some())
2101                {
2102                    let input = self.materialize_rows_with_provenance(input)?;
2103                    self.charge_rows(&input.rows)?;
2104                    return exec_group_by_with_provenance(
2105                        input,
2106                        keys,
2107                        aggregates,
2108                        having,
2109                        self.query_memory_limit,
2110                    );
2111                }
2112                let result = self.execute_plan_readonly(input)?;
2113                match result {
2114                    QueryResult::Rows { columns, rows } => {
2115                        // WS2: byte-budget guard on the GROUP BY input buffer
2116                        // (the hash table is bounded by the input it groups).
2117                        self.charge_rows(&rows)?;
2118                        exec_group_by(columns, rows, keys, aggregates, having)
2119                    }
2120                    _ => Err("group by requires row input".into()),
2121                }
2122            }
2123
2124            PlanNode::NestedLoopJoin {
2125                left,
2126                right,
2127                on,
2128                kind,
2129            } => {
2130                let left_result = self.execute_plan_readonly(left)?;
2131                let right_result = self.execute_plan_readonly(right)?;
2132                let (left_columns, left_rows) = match left_result {
2133                    QueryResult::Rows { columns, rows } => (columns, rows),
2134                    _ => return Err("join left side must produce rows".into()),
2135                };
2136                let (right_columns, right_rows) = match right_result {
2137                    QueryResult::Rows { columns, rows } => (columns, rows),
2138                    _ => return Err("join right side must produce rows".into()),
2139                };
2140
2141                // WS2: byte-budget guard on the join build side.
2142                self.charge_rows(&left_rows)?;
2143                self.charge_rows(&right_rows)?;
2144
2145                execute_materialized_join(
2146                    left_columns,
2147                    left_rows,
2148                    right_columns,
2149                    right_rows,
2150                    on.as_ref(),
2151                    *kind,
2152                    self.nested_loop_pair_limit,
2153                )
2154            }
2155
2156            PlanNode::Window { input, windows } => {
2157                let result = self.execute_plan_readonly(input)?;
2158                execute_window(result, windows, self.query_memory_limit)
2159            }
2160
2161            PlanNode::Union { left, right, all } => {
2162                let left_result = self.execute_plan_readonly(left)?;
2163                let right_result = self.execute_plan_readonly(right)?;
2164                let (left_cols, left_rows) = match left_result {
2165                    QueryResult::Rows { columns, rows } => (columns, rows),
2166                    _ => return Err("UNION requires query results on left side".into()),
2167                };
2168                let (_, right_rows) = match right_result {
2169                    QueryResult::Rows { columns, rows } => (columns, rows),
2170                    _ => return Err("UNION requires query results on right side".into()),
2171                };
2172                let mut combined = left_rows;
2173                let mut cancel = crate::cancel::CancelCheck::new();
2174                if *all {
2175                    for row in right_rows {
2176                        cancel.tick()?;
2177                        combined.push(row);
2178                    }
2179                } else {
2180                    let mut seen = std::collections::HashSet::new();
2181                    for row in &combined {
2182                        cancel.tick()?;
2183                        seen.insert(row.clone());
2184                    }
2185                    for row in right_rows {
2186                        cancel.tick()?;
2187                        if seen.insert(row.clone()) {
2188                            combined.push(row);
2189                        }
2190                    }
2191                }
2192                Ok(QueryResult::Rows {
2193                    columns: left_cols,
2194                    rows: combined,
2195                })
2196            }
2197
2198            PlanNode::Explain { input } => {
2199                // Every execute entry point runs lower_unindexed_scans before
2200                // dispatch and lowering recurses into Explain, so `input` is
2201                // already the plan that will actually run.
2202                let text = format_plan_tree(&self.catalog, input, 0);
2203                Ok(QueryResult::Rows {
2204                    columns: vec!["plan".to_string()],
2205                    rows: text
2206                        .lines()
2207                        .map(|line| vec![Value::Str(line.to_string())])
2208                        .collect(),
2209                })
2210            }
2211
2212            PlanNode::ListTypes => self.introspect_list_types(),
2213
2214            PlanNode::Describe { table } => self.introspect_describe(table),
2215
2216            // All write variants — caller must escalate to the write lock.
2217            PlanNode::Insert { .. }
2218            | PlanNode::Update { .. }
2219            | PlanNode::Delete { .. }
2220            | PlanNode::Upsert { .. }
2221            | PlanNode::CreateTable { .. }
2222            | PlanNode::AlterTable { .. }
2223            | PlanNode::DropTable { .. }
2224            | PlanNode::CreateView { .. }
2225            | PlanNode::RefreshView { .. }
2226            | PlanNode::DropView { .. }
2227            | PlanNode::Begin
2228            | PlanNode::Commit
2229            | PlanNode::Rollback => Err(QueryError::ReadonlyNeedsWrite),
2230        }
2231    }
2232
2233    /// `&self` variant of [`Engine::materialize_subqueries`]. Used by the
2234    /// read path so `Filter` predicates with `InSubquery`/`ExistsSubquery`
2235    /// children can evaluate their inner queries without taking the write
2236    /// lock. Inner queries that would themselves need a write (e.g. dirty
2237    /// view) escalate via [`READONLY_NEEDS_WRITE`] just like the top-level
2238    /// read path does.
2239    fn materialize_subqueries_readonly(&self, expr: &Expr) -> Result<Expr, QueryError> {
2240        match expr {
2241            Expr::InSubquery {
2242                expr: inner,
2243                subquery,
2244                negated,
2245            } => {
2246                if is_correlated_subquery(subquery, &self.catalog) {
2247                    // Pass through — will be materialized per-row in the
2248                    // Filter handler's correlated subquery path.
2249                    let inner = self.materialize_subqueries_readonly(inner)?;
2250                    return Ok(Expr::InSubquery {
2251                        expr: Box::new(inner),
2252                        subquery: subquery.clone(),
2253                        negated: *negated,
2254                    });
2255                }
2256                let inner = self.materialize_subqueries_readonly(inner)?;
2257                let sub_plan = crate::planner::plan_statement(Statement::Query(*subquery.clone()))
2258                    .map_err(|e| QueryError::StorageError(e.to_string()))?;
2259                let result = self.execute_plan_readonly(&sub_plan)?;
2260                let values = match result {
2261                    QueryResult::Rows { rows, .. } => {
2262                        let mut values = Vec::with_capacity(rows.len());
2263                        let mut cancel = crate::cancel::CancelCheck::new();
2264                        for mut row in rows {
2265                            cancel.tick()?;
2266                            if !row.is_empty() {
2267                                values.push(value_to_expr(row.swap_remove(0)));
2268                            }
2269                        }
2270                        values
2271                    }
2272                    _ => Vec::new(),
2273                };
2274                // WS2: byte-budget guard on the materialized IN-list.
2275                self.charge_in_list(&values)?;
2276                Ok(Expr::InList {
2277                    expr: Box::new(inner),
2278                    list: values,
2279                    negated: *negated,
2280                })
2281            }
2282            Expr::ExistsSubquery { subquery, negated } => {
2283                if is_correlated_subquery(subquery, &self.catalog) {
2284                    return Ok(expr.clone());
2285                }
2286                let sub_plan = crate::planner::plan_statement(Statement::Query(*subquery.clone()))
2287                    .map_err(|e| QueryError::StorageError(e.to_string()))?;
2288                let result = self.execute_plan_readonly(&sub_plan)?;
2289                let has_rows = match result {
2290                    QueryResult::Rows { rows, .. } => !rows.is_empty(),
2291                    _ => false,
2292                };
2293                let truth = if *negated { !has_rows } else { has_rows };
2294                Ok(Expr::Literal(Literal::Bool(truth)))
2295            }
2296            Expr::BinaryOp(l, op, r) => {
2297                let l = self.materialize_subqueries_readonly(l)?;
2298                let r = self.materialize_subqueries_readonly(r)?;
2299                Ok(Expr::BinaryOp(Box::new(l), *op, Box::new(r)))
2300            }
2301            Expr::UnaryOp(op, inner) => {
2302                let inner = self.materialize_subqueries_readonly(inner)?;
2303                Ok(Expr::UnaryOp(*op, Box::new(inner)))
2304            }
2305            Expr::Case { whens, else_expr } => {
2306                let whens = whens
2307                    .iter()
2308                    .map(|(c, r)| {
2309                        let c = self.materialize_subqueries_readonly(c)?;
2310                        let r = self.materialize_subqueries_readonly(r)?;
2311                        Ok((Box::new(c), Box::new(r)))
2312                    })
2313                    .collect::<Result<Vec<_>, QueryError>>()?;
2314                let else_expr = match else_expr {
2315                    Some(e) => Some(Box::new(self.materialize_subqueries_readonly(e)?)),
2316                    None => None,
2317                };
2318                Ok(Expr::Case { whens, else_expr })
2319            }
2320            other => Ok(other.clone()),
2321        }
2322    }
2323
2324    /// Per-row materialisation of correlated subqueries. For each row in the
2325    /// outer query, substitute outer column references in the subquery's
2326    /// filter with the current row's literal values, execute the modified
2327    /// subquery, and return the result as an InList or Bool literal.
2328    fn materialize_correlated_for_row_readonly(
2329        &self,
2330        expr: &Expr,
2331        outer_row: &[Value],
2332        outer_columns: &[String],
2333    ) -> Result<Expr, QueryError> {
2334        match expr {
2335            Expr::InSubquery {
2336                expr: inner,
2337                subquery,
2338                negated,
2339            } => {
2340                let inner =
2341                    self.materialize_correlated_for_row_readonly(inner, outer_row, outer_columns)?;
2342                let mut sub = *subquery.clone();
2343                if let Some(ref filter) = sub.filter {
2344                    sub.filter = Some(substitute_outer_refs(
2345                        filter,
2346                        &sub.source,
2347                        &self.catalog,
2348                        outer_row,
2349                        outer_columns,
2350                    ));
2351                }
2352                let sub_plan = crate::planner::plan_statement(Statement::Query(sub))
2353                    .map_err(|e| QueryError::StorageError(e.to_string()))?;
2354                let result = self.execute_plan_readonly(&sub_plan)?;
2355                let values = match result {
2356                    QueryResult::Rows { rows, .. } => {
2357                        let mut values = Vec::with_capacity(rows.len());
2358                        let mut cancel = crate::cancel::CancelCheck::new();
2359                        for mut row in rows {
2360                            cancel.tick()?;
2361                            if !row.is_empty() {
2362                                values.push(value_to_expr(row.swap_remove(0)));
2363                            }
2364                        }
2365                        values
2366                    }
2367                    _ => Vec::new(),
2368                };
2369                // WS2: byte-budget guard on the per-row materialized IN-list.
2370                self.charge_in_list(&values)?;
2371                Ok(Expr::InList {
2372                    expr: Box::new(inner),
2373                    list: values,
2374                    negated: *negated,
2375                })
2376            }
2377            Expr::ExistsSubquery { subquery, negated } => {
2378                let mut sub = *subquery.clone();
2379                if let Some(ref filter) = sub.filter {
2380                    sub.filter = Some(substitute_outer_refs(
2381                        filter,
2382                        &sub.source,
2383                        &self.catalog,
2384                        outer_row,
2385                        outer_columns,
2386                    ));
2387                }
2388                let sub_plan = crate::planner::plan_statement(Statement::Query(sub))
2389                    .map_err(|e| QueryError::StorageError(e.to_string()))?;
2390                let result = self.execute_plan_readonly(&sub_plan)?;
2391                let has_rows = match result {
2392                    QueryResult::Rows { rows, .. } => !rows.is_empty(),
2393                    _ => false,
2394                };
2395                let truth = if *negated { !has_rows } else { has_rows };
2396                Ok(Expr::Literal(Literal::Bool(truth)))
2397            }
2398            Expr::BinaryOp(l, op, r) => {
2399                let l =
2400                    self.materialize_correlated_for_row_readonly(l, outer_row, outer_columns)?;
2401                let r =
2402                    self.materialize_correlated_for_row_readonly(r, outer_row, outer_columns)?;
2403                Ok(Expr::BinaryOp(Box::new(l), *op, Box::new(r)))
2404            }
2405            Expr::UnaryOp(op, inner) => {
2406                let inner =
2407                    self.materialize_correlated_for_row_readonly(inner, outer_row, outer_columns)?;
2408                Ok(Expr::UnaryOp(*op, Box::new(inner)))
2409            }
2410            other => Ok(other.clone()),
2411        }
2412    }
2413
2414    pub fn catalog(&self) -> &Catalog {
2415        &self.catalog
2416    }
2417
2418    pub fn catalog_mut(&mut self) -> &mut Catalog {
2419        &mut self.catalog
2420    }
2421}
2422
2423impl Drop for Engine {
2424    fn drop(&mut self) {
2425        let Some(hook) = self.wal_archive_hook.clone() else {
2426            return;
2427        };
2428        if let Err(err) = self
2429            .catalog
2430            .checkpoint_with_wal_archive(move |dir, records| hook(dir, records))
2431        {
2432            error!(error = %err, "sync-aware engine checkpoint on drop failed");
2433        }
2434    }
2435}
powdb_query/executor/mod.rs

powdb_query/executor/
mod.rs