// reddb_server/runtime/impl_core.rs
1use super::*;
2use crate::application::entity::metadata_to_json;
3use crate::auth::column_policy_gate::ColumnAccessRequest;
4use crate::auth::UserId;
5use crate::replication::cdc::ChangeRecord;
6use crate::replication::logical::{ApplyMode, LogicalChangeApplier};
7use crate::storage::query::ast::TableSource;
8
thread_local! {
    /// Current connection id for the executing statement. Set by the
    /// per-connection wrapper (stdio/gRPC handlers) before dispatching
    /// into `execute_query`; falls back to `0` for embedded callers.
    static CURRENT_CONN_ID: std::cell::Cell<u64> = const { std::cell::Cell::new(0) };

    /// Authenticated user + role for the executing statement (Phase 2.5.2
    /// RLS enforcement). Set by the transport middleware after validating
    /// credentials (password / cert / oauth); unset means "anonymous" /
    /// "embedded" — RLS policies degrade to the role-agnostic subset.
    ///
    /// `None` skips RLS injection entirely; `Some((username, role))`
    /// passes `role` to `matching_rls_policies(table, Some(role), action)`.
    static CURRENT_AUTH_IDENTITY: std::cell::RefCell<Option<(String, crate::auth::Role)>> =
        const { std::cell::RefCell::new(None) };

    /// MVCC snapshot scoped to the currently-executing statement (Phase
    /// 2.3.2d PG parity). `execute_query` captures it on entry and drops
    /// it on exit; every scan consults it via
    /// `entity_visible_under_current_snapshot` to hide tuples whose xmin
    /// hasn't committed or whose xmax already has.
    ///
    /// `None` means "pre-MVCC semantics" — the read path returns every
    /// tuple regardless of xmin/xmax. All embedded callers that bypass
    /// `execute_query` see this default.
    static CURRENT_SNAPSHOT: std::cell::RefCell<Option<SnapshotContext>> =
        const { std::cell::RefCell::new(None) };

    /// Cheap presence flag for `CURRENT_SNAPSHOT`. Scan hot paths
    /// poll this instead of `borrow()`-ing the RefCell on every
    /// row — the common case (autocommit / no MVCC session) reads
    /// one `Cell<bool>` and short-circuits, saving ~10ns × N
    /// rows on aggregate_group / select_range scans.
    ///
    /// Invariant: every writer of `CURRENT_SNAPSHOT` must update this
    /// flag in the same call (`set_current_snapshot`,
    /// `clear_current_snapshot`, the guard `Drop` impls).
    static HAS_SNAPSHOT: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };

    /// Session-scoped tenant id for the current connection (Phase 2.5.3
    /// multi-tenancy). Populated by `SET TENANT 'id'` or by transport
    /// middleware after resolving tenant from auth claims. Read by the
    /// `CURRENT_TENANT()` scalar function — RLS policies typically
    /// combine it as `USING (tenant_id = CURRENT_TENANT())` to scope
    /// every query to one tenant.
    ///
    /// `None` means "no tenant bound" — `CURRENT_TENANT()` returns
    /// NULL, and RLS policies that gate on it hide every row.
    static CURRENT_TENANT_ID: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local config resolver. SQL expressions materialize the
    /// `red_config` snapshot lazily on the first `$config.*`/`CONFIG()`
    /// access, keeping ordinary statements on the zero-scan path.
    static CURRENT_CONFIG_RESOLVER: std::cell::RefCell<Option<ConfigResolver>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local secret resolver. SQL expressions materialize the
    /// vault KV snapshot lazily on first `$secret.*` access, then use
    /// lock-free map reads for the rest of the statement.
    static CURRENT_SECRET_RESOLVER: std::cell::RefCell<Option<SecretResolver>> =
        const { std::cell::RefCell::new(None) };
}
68
69fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
70    match value {
71        Value::Text(s) => Ok(s.to_string()),
72        Value::Integer(n) => Ok(n.to_string()),
73        Value::UnsignedInteger(n) => Ok(n.to_string()),
74        Value::Float(n) => Ok(n.to_string()),
75        Value::Boolean(b) => Ok(b.to_string()),
76        Value::Null => Err(RedDBError::Query(
77            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
78                .to_string(),
79        )),
80        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
81            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
82                .to_string(),
83        )),
84        _ => Err(RedDBError::Query(format!(
85            "SET SECRET does not support value type {:?} yet",
86            value.data_type()
87        ))),
88    }
89}
90
91fn system_keyed_collection_contract(
92    name: &str,
93    model: crate::catalog::CollectionModel,
94) -> crate::physical::CollectionContract {
95    let now = crate::utils::now_unix_millis() as u128;
96    crate::physical::CollectionContract {
97        name: name.to_string(),
98        declared_model: model,
99        schema_mode: crate::catalog::SchemaMode::Dynamic,
100        origin: crate::physical::ContractOrigin::Implicit,
101        version: 1,
102        created_at_unix_ms: now,
103        updated_at_unix_ms: now,
104        default_ttl_ms: None,
105        vector_dimension: None,
106        vector_metric: None,
107        context_index_fields: Vec::new(),
108        declared_columns: Vec::new(),
109        table_def: None,
110        timestamps_enabled: false,
111        context_index_enabled: false,
112        append_only: false,
113        subscriptions: Vec::new(),
114    }
115}
116
/// Snapshot + manager pair used for read-path visibility checks.
///
/// The manager is needed in addition to the snapshot because `aborted`
/// state mutates after the snapshot is captured — a ROLLBACK by a
/// committed-at-capture-time writer must still hide its tuples. Keeping
/// the Arc around is O(pointer) and the RwLock reads on `is_aborted`
/// are cheap (HashSet lookup under a parking_lot read guard).
///
/// `own_xids` (Phase 2.3.2e) lists the xids belonging to the current
/// connection's transaction — the parent xid plus open and released
/// savepoint sub-xids. The visibility rule promotes rows stamped with
/// these xids to "always visible (unless aborted)" so the writer sees
/// its own nested-savepoint writes even though their xids exceed
/// `snapshot.xid`.
#[derive(Clone)]
pub struct SnapshotContext {
    /// Frozen xid horizon captured at statement start.
    pub snapshot: crate::storage::transaction::snapshot::Snapshot,
    /// Live manager, consulted for aborts that land after capture.
    pub manager: Arc<crate::storage::transaction::snapshot::SnapshotManager>,
    /// Xids stamped by this connection's own transaction (see above).
    pub own_xids: std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
    /// When true, index-only access paths must fall back to heap scans
    /// because the snapshot may need historical tuple versions the
    /// indexes cannot prove (read via
    /// `current_snapshot_requires_index_fallback`).
    pub requires_index_fallback: bool,
}
138
139/// Install a connection id on the current thread for the duration of a
140/// statement. Transaction state (`RuntimeInner::tx_contexts`) is keyed
141/// by this id so different connections can hold independent BEGINs.
142///
143/// Pub so transports (PG wire, gRPC, HTTP per-request spawners) and
144/// tests can emulate per-connection isolation. Call it once when
145/// binding the connection's worker thread; pair with
146/// `clear_current_connection_id` on teardown.
147pub fn set_current_connection_id(id: u64) {
148    CURRENT_CONN_ID.with(|c| c.set(id));
149}
150
151/// Reset the thread's connection id back to `0` (autocommit).
152pub fn clear_current_connection_id() {
153    CURRENT_CONN_ID.with(|c| c.set(0));
154}
155
156/// Read the connection id set by `set_current_connection_id`. Returns
157/// `0` when no wrapper installed one — auto-commit path.
158pub fn current_connection_id() -> u64 {
159    CURRENT_CONN_ID.with(|c| c.get())
160}
161
162/// Install the authenticated identity for the current thread (Phase 2.5.2
163/// RLS enforcement). Transport layers call this right after resolving
164/// auth so the query dispatch can fold RLS policies into the filter.
165pub fn set_current_auth_identity(username: String, role: crate::auth::Role) {
166    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = Some((username, role)));
167}
168
169/// Clear the thread-local auth identity. Transports call this after the
170/// statement completes so pooled threads don't leak identities across
171/// requests.
172pub fn clear_current_auth_identity() {
173    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = None);
174}
175
176/// Read the current-thread auth identity. `None` when no transport
177/// installed one (embedded mode / anonymous access).
178pub(crate) fn current_auth_identity() -> Option<(String, crate::auth::Role)> {
179    CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone())
180}
181
182/// Install the session tenant id for the current thread (Phase 2.5.3
183/// multi-tenancy). Called by `SET TENANT 'id'` dispatch and by
184/// transport middleware that resolves tenant from auth claims (e.g.
185/// JWT `tenant` claim, HTTP header, subdomain).
186pub fn set_current_tenant(tenant_id: String) {
187    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = Some(tenant_id));
188}
189
190/// Clear the current-thread tenant — `CURRENT_TENANT()` will then
191/// return NULL and any RLS policy gated on it will hide every row.
192pub fn clear_current_tenant() {
193    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = None);
194}
195
196/// Read the current-thread tenant id, applying overrides in priority order:
197///   1. `WITHIN TENANT '<id>' …` per-statement override (highest)
198///   2. `SET LOCAL TENANT '<id>'` transaction-local override (consulted
199///      only when the current connection has an open transaction)
200///   3. `SET TENANT '<id>'` session-level thread-local
201///   4. `None` (deny-default for RLS).
202///
203/// The transaction-local layer is read through the runtime; an embedded
204/// helper crate that has no `RedDBRuntime` access still gets correct
205/// behaviour for layers 1, 3, and 4.
206pub fn current_tenant() -> Option<String> {
207    let inherited = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
208    if let Some(over) = current_scope_override() {
209        if over.tenant.is_active() {
210            return over.tenant.resolve(inherited);
211        }
212    }
213    if let Some(tx_local) = current_tx_local_tenant() {
214        return tx_local;
215    }
216    inherited
217}
218
thread_local! {
    /// Snapshot of the active connection's `tx_local_tenants` entry for
    /// the current `execute_query` call. Outer `Some(_)` means "a
    /// transaction-local tenant override is active for this call";
    /// inner is the override's value (`Some(s)` overrides to `s`,
    /// `None` overrides to NULL/cleared). Refreshed at the top of every
    /// `execute_query` invocation and cleared by the RAII guard
    /// (`TxLocalTenantGuard`) on return so pooled connections cannot
    /// leak the override past the statement that owns it.
    static TX_LOCAL_TENANT: std::cell::RefCell<Option<Option<String>>> =
        const { std::cell::RefCell::new(None) };
}
231
232fn current_tx_local_tenant() -> Option<Option<String>> {
233    TX_LOCAL_TENANT.with(|cell| cell.borrow().clone())
234}
235
236/// Recognise `SET LOCAL TENANT '<id>'` / `SET LOCAL TENANT NULL` —
237/// returns `Ok(Some(Some(id)))` for an explicit value, `Ok(Some(None))`
238/// for an explicit NULL clear, `Ok(None)` when the input is not a
239/// `SET LOCAL TENANT` statement at all, and `Err` when the prefix
240/// matches but the value is malformed.
241fn parse_set_local_tenant(query: &str) -> RedDBResult<Option<Option<String>>> {
242    let mut tokens = query.split_ascii_whitespace();
243    let Some(w1) = tokens.next() else {
244        return Ok(None);
245    };
246    if !w1.eq_ignore_ascii_case("SET") {
247        return Ok(None);
248    }
249    let Some(w2) = tokens.next() else {
250        return Ok(None);
251    };
252    if !w2.eq_ignore_ascii_case("LOCAL") {
253        return Ok(None);
254    }
255    let Some(w3) = tokens.next() else {
256        return Ok(None);
257    };
258    if !w3.eq_ignore_ascii_case("TENANT") {
259        return Ok(None);
260    }
261    let rest: String = tokens.collect::<Vec<_>>().join(" ");
262    let rest = rest.trim().trim_end_matches(';').trim();
263    let value_str = rest.strip_prefix('=').map(|s| s.trim()).unwrap_or(rest);
264    if value_str.is_empty() {
265        return Err(RedDBError::Query(
266            "SET LOCAL TENANT expects a string literal or NULL".to_string(),
267        ));
268    }
269    if value_str.eq_ignore_ascii_case("NULL") {
270        return Ok(Some(None));
271    }
272    if value_str.starts_with('\'') && value_str.ends_with('\'') && value_str.len() >= 2 {
273        let inner = &value_str[1..value_str.len() - 1];
274        return Ok(Some(Some(inner.to_string())));
275    }
276    Err(RedDBError::Query(format!(
277        "SET LOCAL TENANT expects a string literal or NULL, got `{value_str}`"
278    )))
279}
280
/// RAII guard owning the `TX_LOCAL_TENANT` thread-local for one
/// `execute_query` call: `install` writes the override, `Drop` clears
/// it unconditionally so early returns / panics cannot leak a
/// transaction-local tenant into the next statement on a pooled thread.
pub(crate) struct TxLocalTenantGuard;

impl TxLocalTenantGuard {
    /// Install `value` as the call-scoped override (see the
    /// `TX_LOCAL_TENANT` docs for the double-`Option` encoding).
    pub fn install(value: Option<Option<String>>) -> Self {
        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = value);
        Self
    }
}

impl Drop for TxLocalTenantGuard {
    fn drop(&mut self) {
        // Clear rather than restore: the override is refreshed at the
        // top of every execute_query, so there is no outer value to keep.
        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = None);
    }
}
295
thread_local! {
    /// Stack of `WITHIN ... <stmt>` overrides active on the current
    /// thread. Every entry corresponds to one in-flight `execute_query`
    /// call that started with a `WITHIN` prefix; the entry is pushed
    /// before dispatch and popped before the call returns. The stack
    /// shape supports nested invocations (e.g. a view body that itself
    /// re-enters execute_query). Readers only consult the top entry
    /// (`current_scope_override`).
    static SCOPE_OVERRIDES: std::cell::RefCell<Vec<crate::runtime::within_clause::ScopeOverride>> =
        const { std::cell::RefCell::new(Vec::new()) };
}
306
307pub(crate) fn push_scope_override(over: crate::runtime::within_clause::ScopeOverride) {
308    SCOPE_OVERRIDES.with(|cell| cell.borrow_mut().push(over));
309}
310
311pub(crate) fn pop_scope_override() {
312    SCOPE_OVERRIDES.with(|cell| {
313        cell.borrow_mut().pop();
314    });
315}
316
317pub(crate) fn current_scope_override() -> Option<crate::runtime::within_clause::ScopeOverride> {
318    SCOPE_OVERRIDES.with(|cell| cell.borrow().last().cloned())
319}
320
321/// Cheap probe: is any `WITHIN …` scope override active on this
322/// thread? The fast-path needs to know without paying for the full
323/// `.last().cloned()` allocation — just peek at stack length.
324pub(crate) fn has_scope_override_active() -> bool {
325    SCOPE_OVERRIDES.with(|cell| !cell.borrow().is_empty())
326}
327
/// RAII guard pairing `push_scope_override` with the matching pop, so
/// the stack stays balanced even when the inner `execute_query` returns
/// early via `?`.
pub(crate) struct ScopeOverrideGuard;

impl ScopeOverrideGuard {
    /// Push `over` and return the guard whose `Drop` pops it.
    pub fn install(over: crate::runtime::within_clause::ScopeOverride) -> Self {
        push_scope_override(over);
        Self
    }
}

impl Drop for ScopeOverrideGuard {
    fn drop(&mut self) {
        pop_scope_override();
    }
}
345
346/// Read the current-thread auth identity, honouring per-statement
347/// `WITHIN ... USER '<u>' AS ROLE '<r>'` overrides. The override only
348/// supplies projected strings — it never grants additional privilege —
349/// so callers that need to make authorisation decisions must read from
350/// the underlying `current_auth_identity()` directly.
351pub(crate) fn current_user_projected() -> Option<String> {
352    let inherited = current_auth_identity().map(|(u, _)| u);
353    if let Some(over) = current_scope_override() {
354        if over.user.is_active() {
355            return over.user.resolve(inherited);
356        }
357    }
358    inherited
359}
360
361pub(crate) fn current_role_projected() -> Option<String> {
362    let inherited = current_auth_identity().map(|(_, r)| format!("{r:?}").to_lowercase());
363    if let Some(over) = current_scope_override() {
364        if over.role.is_active() {
365            return over.role.resolve(inherited);
366        }
367    }
368    inherited
369}
370
371pub(crate) fn current_secret_value(path: &str) -> Option<String> {
372    let key = path.to_ascii_lowercase();
373    CURRENT_SECRET_RESOLVER.with(|cell| {
374        let mut resolver = cell.borrow_mut();
375        let resolver = resolver.as_mut()?;
376        if resolver.values.is_none() {
377            resolver.values = resolver
378                .store
379                .as_ref()
380                .map(|store| store.vault_kv_snapshot());
381        }
382        let values = resolver.values.as_ref()?;
383        values.get(&key).cloned().or_else(|| {
384            key.strip_prefix("red.vault/").and_then(|rest| {
385                values
386                    .get(rest)
387                    .cloned()
388                    .or_else(|| values.get(&format!("red.secret.{rest}")).cloned())
389            })
390        })
391    })
392}
393
/// Lazy per-statement secret source: `store` is the handle captured at
/// statement start, `values` is the vault KV snapshot materialized on
/// the first `$secret.*` access (see `current_secret_value`).
struct SecretResolver {
    store: Option<Arc<crate::auth::store::AuthStore>>,
    values: Option<HashMap<String, String>>,
}
398
/// RAII guard that installs a fresh `SecretResolver` for one statement
/// and restores whatever resolver the caller had on drop — nesting-safe
/// for re-entrant `execute_query` calls.
pub(super) struct SecretStoreGuard {
    // Resolver that was active before `install`; put back by `Drop`.
    previous: Option<SecretResolver>,
}

impl SecretStoreGuard {
    /// Swap in a resolver over `store` (snapshot not yet materialized)
    /// and remember the previous one for restoration.
    pub(super) fn install(store: Option<Arc<crate::auth::store::AuthStore>>) -> Self {
        let previous = CURRENT_SECRET_RESOLVER.with(|cell| {
            cell.replace(Some(SecretResolver {
                store,
                values: None,
            }))
        });
        Self { previous }
    }
}

impl Drop for SecretStoreGuard {
    fn drop(&mut self) {
        let previous = self.previous.take();
        CURRENT_SECRET_RESOLVER.with(|cell| {
            cell.replace(previous);
        });
    }
}
423
424pub(crate) fn current_config_value(path: &str) -> Option<Value> {
425    let key = path.to_ascii_lowercase();
426    CURRENT_CONFIG_RESOLVER.with(|cell| {
427        let mut resolver = cell.borrow_mut();
428        let resolver = resolver.as_mut()?;
429        if resolver.values.is_none() {
430            resolver.values = Some(latest_config_snapshot(&resolver.db));
431        }
432        let values = resolver.values.as_ref()?;
433        values.get(&key).cloned().or_else(|| {
434            key.strip_prefix("red.config/")
435                .and_then(|rest| values.get(&format!("red.config.{rest}")).cloned())
436        })
437    })
438}
439
440fn update_current_config_value(path: &str, value: Value) {
441    let key = path.to_ascii_lowercase();
442    CURRENT_CONFIG_RESOLVER.with(|cell| {
443        if let Some(resolver) = cell.borrow_mut().as_mut() {
444            if let Some(values) = resolver.values.as_mut() {
445                values.insert(key, value);
446            }
447        }
448    });
449}
450
451fn update_current_secret_value(path: &str, value: Option<String>) {
452    let key = path.to_ascii_lowercase();
453    CURRENT_SECRET_RESOLVER.with(|cell| {
454        if let Some(resolver) = cell.borrow_mut().as_mut() {
455            let Some(values) = resolver.values.as_mut() else {
456                return;
457            };
458            match value {
459                Some(value) => {
460                    values.insert(key, value);
461                }
462                None => {
463                    values.remove(&key);
464                }
465            }
466        }
467    });
468}
469
/// Build the per-statement config map by merging two physical
/// collections, keeping the value with the highest entity id per key
/// (entity id serves as the recency tiebreaker — see
/// `insert_latest_config_value`).
///
/// Sources, in scan order:
///   * `red_config`  — keys stored verbatim (lowercased); a key of the
///     form `red.config.<rest>` is additionally aliased under
///     `red.config/<rest>` so both spellings resolve.
///   * `red.config`  — keys stored under `red.config/<key>`; rows with
///     `tombstone = true` are skipped.
///
/// NOTE(review): only the `red.config` scan honours a tombstone flag;
/// the `red_config` scan does not — presumably that collection never
/// carries tombstones. Confirm before relying on deletes there.
fn latest_config_snapshot(db: &RedDB) -> HashMap<String, Value> {
    // key → (entity id, value); the id decides which write wins.
    let mut latest: HashMap<String, (u64, Value)> = HashMap::new();

    if let Some(manager) = db.store().get_collection("red_config") {
        manager.for_each_entity(|entity| {
            // Non-row entities and rows without a text `key` are skipped;
            // returning `true` continues the iteration.
            let Some(row) = entity.data.as_row() else {
                return true;
            };
            let Some(Value::Text(key)) = row.get_field("key") else {
                return true;
            };
            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
            let id = entity.id.raw();
            let key = key.to_ascii_lowercase();
            insert_latest_config_value(&mut latest, key.clone(), id, value.clone());
            // Alias `red.config.<rest>` under the slash spelling too.
            if let Some(rest) = key.strip_prefix("red.config.") {
                insert_latest_config_value(&mut latest, format!("red.config/{rest}"), id, value);
            }
            true
        });
    }

    if let Some(manager) = db.store().get_collection("red.config") {
        manager.for_each_entity(|entity| {
            let Some(row) = entity.data.as_row() else {
                return true;
            };
            // Tombstoned rows are deletes — leave whatever the other
            // collection (or an earlier version) contributed.
            if matches!(row.get_field("tombstone"), Some(Value::Boolean(true))) {
                return true;
            }
            let Some(Value::Text(key)) = row.get_field("key") else {
                return true;
            };
            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
            insert_latest_config_value(
                &mut latest,
                format!("red.config/{}", key.to_ascii_lowercase()),
                entity.id.raw(),
                value,
            );
            true
        });
    }

    // Strip the recency ids; callers only need key → value.
    latest
        .into_iter()
        .map(|(key, (_, value))| (key, value))
        .collect()
}
519
520fn insert_latest_config_value(
521    latest: &mut HashMap<String, (u64, Value)>,
522    key: String,
523    id: u64,
524    value: Value,
525) {
526    match latest.get(&key) {
527        Some((prev_id, _)) if *prev_id > id => {}
528        _ => {
529            latest.insert(key, (id, value));
530        }
531    }
532}
533
/// Lazy per-statement config source: `db` is the handle captured at
/// statement start, `values` is the config snapshot materialized on the
/// first `$config.*` access (see `current_config_value`).
struct ConfigResolver {
    db: Arc<RedDB>,
    values: Option<HashMap<String, Value>>,
}
538
/// RAII guard that installs a fresh `ConfigResolver` for one statement
/// and restores whatever resolver the caller had on drop — nesting-safe
/// for re-entrant `execute_query` calls.
pub(super) struct ConfigSnapshotGuard {
    // Resolver that was active before `install`; put back by `Drop`.
    previous: Option<ConfigResolver>,
}

impl ConfigSnapshotGuard {
    /// Swap in a resolver over `db` (snapshot not yet materialized)
    /// and remember the previous one for restoration.
    pub(super) fn install(db: Arc<RedDB>) -> Self {
        let previous = CURRENT_CONFIG_RESOLVER
            .with(|cell| cell.replace(Some(ConfigResolver { db, values: None })));
        Self { previous }
    }
}

impl Drop for ConfigSnapshotGuard {
    fn drop(&mut self) {
        let previous = self.previous.take();
        CURRENT_CONFIG_RESOLVER.with(|cell| {
            cell.replace(previous);
        });
    }
}
559
560/// Install the MVCC snapshot used by the current thread for the duration
561/// of one statement. Paired with `clear_current_snapshot()` — callers
562/// should prefer the `CurrentSnapshotGuard` RAII wrapper so early returns
563/// still clean up.
564pub fn set_current_snapshot(ctx: SnapshotContext) {
565    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = Some(ctx));
566    HAS_SNAPSHOT.with(|c| c.set(true));
567}
568
569pub fn clear_current_snapshot() {
570    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = None);
571    HAS_SNAPSHOT.with(|c| c.set(false));
572}
573
/// Drop-guard that restores the previous snapshot on scope exit. Safe to
/// nest — each statement saves the caller's snapshot and puts it back
/// instead of blindly clearing, so a top-level `execute_query` called
/// from inside another statement dispatch (e.g. vector source subqueries)
/// doesn't strip visibility from the outer scan.
pub(crate) struct CurrentSnapshotGuard {
    // Snapshot that was active before `install`; restored by `Drop`.
    previous: Option<SnapshotContext>,
}

impl CurrentSnapshotGuard {
    /// Save the caller's snapshot, then install `ctx` (flag included,
    /// via `set_current_snapshot`).
    pub(crate) fn install(ctx: SnapshotContext) -> Self {
        let previous = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
        set_current_snapshot(ctx);
        Self { previous }
    }
}
590
591impl Drop for CurrentSnapshotGuard {
592    fn drop(&mut self) {
593        let prev = self.previous.take();
594        let has = prev.is_some();
595        CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = prev);
596        HAS_SNAPSHOT.with(|c| c.set(has));
597    }
598}
599
/// Is this entity visible under the current thread's MVCC snapshot?
///
/// With no snapshot installed (embedded callers, autocommit reads,
/// operations that intentionally bypass MVCC — VACUUM, snapshot export,
/// admin introspection) the check degrades to `xmax == 0`: superseded
/// or deleted physical versions stay hidden, everything else passes
/// unfiltered.
///
/// When a snapshot is installed the result is
///   `snapshot.sees(xmin, xmax) && !mgr.is_aborted(xmin) && !xmax_half_abort`
/// where `xmax_half_abort` re-grants visibility for tuples whose
/// deleting transaction rolled back.
#[inline]
pub fn entity_visible_under_current_snapshot(
    entity: &crate::storage::unified::entity::UnifiedEntity,
) -> bool {
    // Fast path — one `Cell<bool>` read, no RefCell borrow. Autocommit
    // reads (no active MVCC transaction) still hide superseded physical
    // versions while avoiding a full snapshot-context lookup.
    // This runs on every row of every scan; the slow path only fires
    // inside an explicit transaction.
    if !HAS_SNAPSHOT.with(|c| c.get()) {
        return entity.xmax == 0;
    }
    CURRENT_SNAPSHOT.with(|cell| {
        let guard = cell.borrow();
        // Flag said a snapshot exists but the cell is empty — treat as
        // unfiltered rather than hiding rows.
        let Some(ctx) = guard.as_ref() else {
            return true;
        };
        visibility_check(ctx, entity.xmin, entity.xmax)
    })
}
630
631/// Direct visibility check from raw `(xmin, xmax)` — bypasses the
632/// entity borrow for callers that already decomposed the tuple (e.g.
633/// pre-materialized scan caches). Same semantics as
634/// `entity_visible_under_current_snapshot`.
635#[inline]
636pub(crate) fn xids_visible_under_current_snapshot(xmin: u64, xmax: u64) -> bool {
637    if !HAS_SNAPSHOT.with(|c| c.get()) {
638        return true;
639    }
640    CURRENT_SNAPSHOT.with(|cell| {
641        let guard = cell.borrow();
642        let Some(ctx) = guard.as_ref() else {
643            return true;
644        };
645        visibility_check(ctx, xmin, xmax)
646    })
647}
648
649/// Clone the current thread's snapshot context. Parallel scan paths
650/// (`query_all_zoned` with `std::thread::scope`) call this on the main
651/// thread *before* spawning workers so the captured `SnapshotContext`
652/// can be moved into every worker closure. Worker threads do not
653/// inherit thread-locals, so calling `entity_visible_under_current_snapshot`
654/// from inside a spawned closure would silently skip the filter.
655pub fn capture_current_snapshot() -> Option<SnapshotContext> {
656    CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone())
657}
658
659/// Whether the active read snapshot may need historical tuple versions
660/// that the current secondary indexes cannot prove. Index paths can still
661/// recheck visible candidates, but only a heap scan can discover versions
662/// whose indexed value was changed or deleted after this snapshot.
663pub(crate) fn current_snapshot_requires_index_fallback() -> bool {
664    if !HAS_SNAPSHOT.with(|c| c.get()) {
665        return false;
666    }
667    CURRENT_SNAPSHOT.with(|cell| {
668        cell.borrow()
669            .as_ref()
670            .is_some_and(|ctx| ctx.requires_index_fallback)
671    })
672}
673
/// Frozen MVCC + identity context for callers that need to reinstall
/// the same view across thread-local boundaries — long-lived cursors,
/// background batchers, anything that detaches from the dispatch path
/// and re-enters later.
///
/// The bundle bakes in the three thread-locals every read path
/// consults: `SnapshotContext` (MVCC visibility), the auth identity
/// (RLS policy gate), and the tenant id (RLS scope). A FETCH that
/// reinstalls the bundle sees exactly the same rows as the DECLARE
/// would have, regardless of writes that landed in between.
///
/// Cheap to clone — `SnapshotContext` is a clone of three
/// `Arc`-backed fields, identity is a `(String, Role)`, tenant is a
/// `String`. None of these contend with the read path.
#[derive(Clone, Default)]
pub struct SnapshotBundle {
    /// MVCC visibility context; `None` reinstalls "no snapshot".
    pub snapshot: Option<SnapshotContext>,
    /// `(username, role)` for RLS; `None` reinstalls anonymous.
    pub auth: Option<(String, crate::auth::Role)>,
    /// Session tenant id; `None` reinstalls "no tenant bound".
    pub tenant: Option<String>,
}
694
695/// Capture the three read-path thread-locals into a `SnapshotBundle`.
696/// Pairs with `with_snapshot_bundle` for re-entry.
697pub fn snapshot_bundle() -> SnapshotBundle {
698    SnapshotBundle {
699        snapshot: capture_current_snapshot(),
700        auth: current_auth_identity(),
701        tenant: CURRENT_TENANT_ID.with(|cell| cell.borrow().clone()),
702    }
703}
704
/// Reinstall a captured `SnapshotBundle` for the duration of `f`.
/// Restores the caller's previous thread-locals on exit (panic-safe via
/// the explicit guard struct so a panic in `f` cannot leak the
/// installed identity into the worker's next request).
pub fn with_snapshot_bundle<R>(bundle: &SnapshotBundle, f: impl FnOnce() -> R) -> R {
    // Saved thread-local state; `Drop` restores all three even if `f`
    // panics or returns early.
    struct Guard {
        prev_snapshot: Option<SnapshotContext>,
        prev_auth: Option<(String, crate::auth::Role)>,
        prev_tenant: Option<String>,
    }
    impl Drop for Guard {
        fn drop(&mut self) {
            let snap = self.prev_snapshot.take();
            // Keep the HAS_SNAPSHOT fast-path flag in lock-step with
            // the restored cell contents.
            let has = snap.is_some();
            CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = snap);
            HAS_SNAPSHOT.with(|c| c.set(has));
            CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = self.prev_auth.take());
            CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = self.prev_tenant.take());
        }
    }

    let _guard = {
        // Save the caller's state *before* overwriting any of it.
        let prev_snapshot = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
        let prev_auth = CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone());
        let prev_tenant = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());

        // Install the bundle; set/clear also maintain HAS_SNAPSHOT.
        match bundle.snapshot.clone() {
            Some(ctx) => set_current_snapshot(ctx),
            None => clear_current_snapshot(),
        }
        CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = bundle.auth.clone());
        CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = bundle.tenant.clone());

        Guard {
            prev_snapshot,
            prev_auth,
            prev_tenant,
        }
    };
    f()
}
746
747/// Apply the same visibility rules used by the thread-local helpers
748/// against a caller-provided context. Intended for parallel workers
749/// that captured the snapshot with `capture_current_snapshot()`.
750#[inline]
751pub fn entity_visible_with_context(
752    ctx: Option<&SnapshotContext>,
753    entity: &crate::storage::unified::entity::UnifiedEntity,
754) -> bool {
755    match ctx {
756        Some(ctx) => visibility_check(ctx, entity.xmin, entity.xmax),
757        None => true,
758    }
759}
760
761fn table_row_index_fields(
762    entity: &crate::storage::unified::entity::UnifiedEntity,
763) -> Vec<(String, crate::storage::schema::Value)> {
764    let crate::storage::EntityData::Row(row) = &entity.data else {
765        return Vec::new();
766    };
767    if let Some(named) = &row.named {
768        return named
769            .iter()
770            .map(|(name, value)| (name.clone(), value.clone()))
771            .collect();
772    }
773    if let Some(schema) = &row.schema {
774        return schema
775            .iter()
776            .zip(row.columns.iter())
777            .map(|(name, value)| (name.clone(), value.clone()))
778            .collect();
779    }
780    Vec::new()
781}
782
/// Core MVCC visibility rule shared by every read-path helper.
///
/// Order matters: abort checks run before the own-xid fast path so a
/// rolled-back savepoint's writes stay hidden even from the parent
/// transaction, and own-xmax (self-delete) is checked before own-xmin
/// so a row this connection both wrote and deleted is hidden.
#[inline]
fn visibility_check(ctx: &SnapshotContext, xmin: u64, xmax: u64) -> bool {
    // Writer aborted → tuple never existed from any future reader's view.
    // Checked *before* the own-xids fast path so an aborted own-sub-xid
    // (rolled-back savepoint) stays hidden from the parent.
    if xmin != 0 && ctx.manager.is_aborted(xmin) {
        return false;
    }
    // Deleter aborted → treat xmax as unset; fall back to xmin-only check.
    let effective_xmax = if xmax != 0 && ctx.manager.is_aborted(xmax) {
        0
    } else {
        xmax
    };
    // Phase 2.3.2e: own-tx writes are always visible to the connection
    // that stamped them, even when xmin/xmax exceed `snapshot.xid` (as
    // happens for sub-xids allocated by SAVEPOINT after BEGIN).
    let own_xmin = xmin != 0 && ctx.own_xids.contains(&xmin);
    let own_xmax = effective_xmax != 0 && ctx.own_xids.contains(&effective_xmax);
    if own_xmax {
        // This connection deleted the row via this xid — hide it from self.
        return false;
    }
    if own_xmin {
        return true;
    }
    // Neither xid is ours: defer to the frozen snapshot's horizon rule.
    ctx.snapshot.sees(xmin, effective_xmax)
}
811
812fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
813    runtime
814        .inner
815        .pool
816        .lock()
817        .unwrap_or_else(|poisoned| poisoned.into_inner())
818}
819
820fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
821    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
822        return;
823    }
824    scopes.insert(name.to_string());
825}
826
827fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
828    match query.source.as_ref() {
829        Some(crate::storage::query::ast::TableSource::Name(name)) => {
830            cache_scope_insert(scopes, name)
831        }
832        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
833            collect_query_expr_result_cache_scopes(scopes, subquery);
834        }
835        None => cache_scope_insert(scopes, &query.table),
836    }
837}
838
839fn collect_vector_source_scopes(
840    scopes: &mut HashSet<String>,
841    source: &crate::storage::query::ast::VectorSource,
842) {
843    match source {
844        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
845            cache_scope_insert(scopes, collection);
846        }
847        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
848            collect_query_expr_result_cache_scopes(scopes, subquery);
849        }
850        crate::storage::query::ast::VectorSource::Literal(_)
851        | crate::storage::query::ast::VectorSource::Text(_) => {}
852    }
853}
854
855fn collect_path_selector_scopes(
856    scopes: &mut HashSet<String>,
857    selector: &crate::storage::query::ast::NodeSelector,
858) {
859    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
860        cache_scope_insert(scopes, table);
861    }
862}
863
/// Collect, into `scopes`, the name of every concrete data source a
/// `QueryExpr` references — the invalidation scopes for the runtime
/// result cache.
///
/// Read shapes recurse through nested sources (subqueries, join sides,
/// hybrid parts); DML/DDL variants contribute their target name;
/// session/control statements contribute nothing. Anonymous and
/// universal source names are filtered out by `cache_scope_insert`.
fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
    match expr {
        // Read shapes — recurse into nested sources.
        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
        QueryExpr::Join(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.left);
            collect_query_expr_result_cache_scopes(scopes, &query.right);
        }
        QueryExpr::Path(query) => {
            collect_path_selector_scopes(scopes, &query.from);
            collect_path_selector_scopes(scopes, &query.to);
        }
        QueryExpr::Vector(query) => {
            cache_scope_insert(scopes, &query.collection);
            collect_vector_source_scopes(scopes, &query.query_vector);
        }
        QueryExpr::Hybrid(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.structured);
            cache_scope_insert(scopes, &query.vector.collection);
            collect_vector_source_scopes(scopes, &query.vector.query_vector);
        }
        // DML / DDL — the statement's target is the scope.
        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
        // Index statements scope to the table they index, not the index name.
        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
        QueryExpr::QueueCommand(query) => match query {
            QueueCommand::Push { queue, .. }
            | QueueCommand::Pop { queue, .. }
            | QueueCommand::Peek { queue, .. }
            | QueueCommand::Len { queue }
            | QueueCommand::Purge { queue }
            | QueueCommand::GroupCreate { queue, .. }
            | QueueCommand::GroupRead { queue, .. }
            | QueueCommand::Pending { queue, .. }
            | QueueCommand::Claim { queue, .. }
            | QueueCommand::Ack { queue, .. }
            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
            // Move touches two queues — both sides become scopes.
            QueueCommand::Move {
                source,
                destination,
                ..
            } => {
                cache_scope_insert(scopes, source);
                cache_scope_insert(scopes, destination);
            }
        },
        // Backfill reads the collection and writes the target queue.
        QueryExpr::EventsBackfill(query) => {
            cache_scope_insert(scopes, &query.collection);
            cache_scope_insert(scopes, &query.target_queue);
        }
        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::TreeCommand(query) => match query {
            TreeCommand::Insert { collection, .. }
            | TreeCommand::Move { collection, .. }
            | TreeCommand::Delete { collection, .. }
            | TreeCommand::Validate { collection, .. }
            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
        },
        QueryExpr::SearchCommand(query) => match query {
            // These search forms always carry a collection name.
            SearchCommand::Similar { collection, .. }
            | SearchCommand::Hybrid { collection, .. }
            | SearchCommand::SpatialRadius { collection, .. }
            | SearchCommand::SpatialBbox { collection, .. }
            | SearchCommand::SpatialNearest { collection, .. } => {
                cache_scope_insert(scopes, collection);
            }
            // These forms have an optional collection.
            SearchCommand::Text { collection, .. }
            | SearchCommand::Multimodal { collection, .. }
            | SearchCommand::Index { collection, .. }
            | SearchCommand::Context { collection, .. } => {
                if let Some(collection) = collection.as_deref() {
                    cache_scope_insert(scopes, collection);
                }
            }
        },
        QueryExpr::Ask(query) => {
            if let Some(collection) = query.collection.as_deref() {
                cache_scope_insert(scopes, collection);
            }
        }
        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
        QueryExpr::MaintenanceCommand(cmd) => match cmd {
            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
                if let Some(t) = target {
                    cache_scope_insert(scopes, t);
                }
            }
        },
        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::CreateView(cmd) => {
            cache_scope_insert(scopes, &cmd.name);
            // Invalidating the view should also invalidate its dependencies.
            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
        }
        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        // Session / control / admin statements reference no cacheable
        // data source, so they contribute no scope.
        QueryExpr::Graph(_)
        | QueryExpr::GraphCommand(_)
        | QueryExpr::ProbabilisticCommand(_)
        | QueryExpr::SetConfig { .. }
        | QueryExpr::ShowConfig { .. }
        | QueryExpr::SetSecret { .. }
        | QueryExpr::DeleteSecret { .. }
        | QueryExpr::ShowSecrets { .. }
        | QueryExpr::SetTenant(_)
        | QueryExpr::ShowTenant
        | QueryExpr::TransactionControl(_)
        | QueryExpr::CreateSchema(_)
        | QueryExpr::DropSchema(_)
        | QueryExpr::CreateSequence(_)
        | QueryExpr::DropSequence(_)
        | QueryExpr::Grant(_)
        | QueryExpr::Revoke(_)
        | QueryExpr::AlterUser(_)
        | QueryExpr::CreateIamPolicy { .. }
        | QueryExpr::DropIamPolicy { .. }
        | QueryExpr::AttachPolicy { .. }
        | QueryExpr::DetachPolicy { .. }
        | QueryExpr::ShowPolicies { .. }
        | QueryExpr::ShowEffectivePermissions { .. }
        | QueryExpr::SimulatePolicy { .. }
        | QueryExpr::CreateMigration(_)
        | QueryExpr::ApplyMigration(_)
        | QueryExpr::RollbackMigration(_)
        | QueryExpr::ExplainMigration(_)
        | QueryExpr::EventsBackfillStatus { .. } => {}
        QueryExpr::KvCommand(cmd) => {
            use crate::storage::query::ast::KvCommand;
            match cmd {
                KvCommand::Put { collection, .. }
                | KvCommand::InvalidateTags { collection, .. }
                | KvCommand::Get { collection, .. }
                | KvCommand::Unseal { collection, .. }
                | KvCommand::Rotate { collection, .. }
                | KvCommand::History { collection, .. }
                | KvCommand::List { collection, .. }
                | KvCommand::Purge { collection, .. }
                | KvCommand::Watch { collection, .. }
                | KvCommand::Delete { collection, .. }
                | KvCommand::Incr { collection, .. }
                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
            }
        }
        QueryExpr::ConfigCommand(cmd) => {
            use crate::storage::query::ast::ConfigCommand;
            match cmd {
                ConfigCommand::Put { collection, .. }
                | ConfigCommand::Get { collection, .. }
                | ConfigCommand::Resolve { collection, .. }
                | ConfigCommand::Rotate { collection, .. }
                | ConfigCommand::Delete { collection, .. }
                | ConfigCommand::History { collection, .. }
                | ConfigCommand::List { collection, .. }
                | ConfigCommand::Watch { collection, .. }
                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
                    cache_scope_insert(scopes, collection)
                }
            }
        }
    }
}
1049
1050/// Combine matching RLS policies for a table + action into a single
1051/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1052///
1053/// Returns `None` when RLS is disabled or no policy admits the caller's
1054/// role — callers use that to short-circuit the mutation (for DELETE /
1055/// UPDATE we simply skip the operation, which PG expresses as "no rows
1056/// match the policy + predicate combination").
1057pub(crate) fn rls_policy_filter(
1058    runtime: &RedDBRuntime,
1059    table: &str,
1060    action: crate::storage::query::ast::PolicyAction,
1061) -> Option<crate::storage::query::ast::Filter> {
1062    rls_policy_filter_for_kind(
1063        runtime,
1064        table,
1065        action,
1066        crate::storage::query::ast::PolicyTargetKind::Table,
1067    )
1068}
1069
1070/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1071/// Graph / vector / queue / timeseries scans pass the concrete kind;
1072/// policies targeting other kinds are ignored. Legacy Table-scoped
1073/// policies still apply cross-kind — callers register auto-tenancy
1074/// policies as Table today.
1075pub(crate) fn rls_policy_filter_for_kind(
1076    runtime: &RedDBRuntime,
1077    table: &str,
1078    action: crate::storage::query::ast::PolicyAction,
1079    kind: crate::storage::query::ast::PolicyTargetKind,
1080) -> Option<crate::storage::query::ast::Filter> {
1081    use crate::storage::query::ast::Filter;
1082
1083    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1084        return None;
1085    }
1086    let role = current_auth_identity().map(|(_, role)| role);
1087    let role_str = role.map(|r| r.as_str().to_string());
1088    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1089    if policies.is_empty() {
1090        return None;
1091    }
1092    policies
1093        .into_iter()
1094        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1095}
1096
1097/// Returns true when the table has RLS enforcement enabled. Convenience
1098/// shortcut so DML paths can gate the AND-combine work without reaching
1099/// into `runtime.inner.rls_enabled_tables` directly.
1100pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1101    runtime.inner.rls_enabled_tables.read().contains(table)
1102}
1103
1104/// Per-entity gate used by the graph materialiser for `GraphNode`
1105/// entities. RLS is checked against the source collection with
1106/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1107/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1108/// (for back-compat with auto-tenancy declarations). Cached per
1109/// collection so big graphs only resolve the policy chain once.
1110fn node_passes_rls(
1111    runtime: &RedDBRuntime,
1112    collection: &str,
1113    role: Option<&str>,
1114    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1115    entity: &crate::storage::unified::entity::UnifiedEntity,
1116) -> bool {
1117    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1118
1119    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1120        return true;
1121    }
1122    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1123        let policies = runtime.matching_rls_policies_for_kind(
1124            collection,
1125            role,
1126            PolicyAction::Select,
1127            PolicyTargetKind::Nodes,
1128        );
1129        if policies.is_empty() {
1130            None
1131        } else {
1132            policies
1133                .into_iter()
1134                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1135        }
1136    });
1137    let Some(filter) = filter else {
1138        return false;
1139    };
1140    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1141        Some(&runtime.inner.db),
1142        entity,
1143        filter,
1144        collection,
1145        collection,
1146    )
1147}
1148
1149/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1150/// `kind = Edges`.
1151fn edge_passes_rls(
1152    runtime: &RedDBRuntime,
1153    collection: &str,
1154    role: Option<&str>,
1155    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1156    entity: &crate::storage::unified::entity::UnifiedEntity,
1157) -> bool {
1158    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1159
1160    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1161        return true;
1162    }
1163    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1164        let policies = runtime.matching_rls_policies_for_kind(
1165            collection,
1166            role,
1167            PolicyAction::Select,
1168            PolicyTargetKind::Edges,
1169        );
1170        if policies.is_empty() {
1171            None
1172        } else {
1173            policies
1174                .into_iter()
1175                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1176        }
1177    });
1178    let Some(filter) = filter else {
1179        return false;
1180    };
1181    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1182        Some(&runtime.inner.db),
1183        entity,
1184        filter,
1185        collection,
1186        collection,
1187    )
1188}
1189
1190/// RLS policy injection (Phase 2.5.2 PG parity).
1191///
1192/// Fetch every matching policy for the current thread-local role and
1193/// fold them into the query's filter. Semantics mirror PostgreSQL:
1194///
1195/// * Multiple policies on the same table combine with **OR** — a row is
1196///   visible if *any* policy admits it.
1197/// * The combined policy predicate is **AND**-ed into the caller's
1198///   existing `WHERE` clause so explicit predicates continue to trim
1199///   the policy-allowed set.
1200/// * No matching policies + RLS enabled = zero rows (PG's
1201///   restrictive-default). Callers get `None` and return an empty
1202///   `UnifiedResult` without ever dispatching the scan.
1203///
1204/// This runs only when `RuntimeInner::rls_enabled_tables` already
1205/// contains the table name — callers gate the hot path upfront to
1206/// avoid the lock acquisition on tables without RLS.
1207///
1208/// Returns `None` when no policy admits the current role; returns
1209/// `Some(mutated_table)` with policy filters folded in otherwise.
1210fn inject_rls_filters(
1211    runtime: &RedDBRuntime,
1212    frame: &dyn super::statement_frame::ReadFrame,
1213    mut table: crate::storage::query::ast::TableQuery,
1214) -> Option<crate::storage::query::ast::TableQuery> {
1215    use crate::storage::query::ast::{Filter, PolicyAction};
1216
1217    // `None` role falls through to policies with no `TO role` clause.
1218    let role = frame.identity().map(|(_, role)| role);
1219    let role_str = role.map(|r| r.as_str().to_string());
1220    let policies =
1221        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1222
1223    if policies.is_empty() {
1224        // RLS enabled + no policy match = deny everything. Signal the
1225        // caller to short-circuit with an empty result set.
1226        return None;
1227    }
1228
1229    // Combine policy predicates with OR (PG's permissive default).
1230    let combined = policies
1231        .into_iter()
1232        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1233        .expect("policies non-empty");
1234
1235    // AND into the caller's existing filter.
1236    table.filter = Some(match table.filter.take() {
1237        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1238        None => combined,
1239    });
1240    Some(table)
1241}
1242
1243/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1244/// predicate into the join's outer filter. Walking the merged record
1245/// at the join layer (rather than mutating the per-side scan filter)
1246/// keeps the planner's strategy choice and per-side index selection
1247/// undisturbed — the policy predicate uses the qualified `t.col` form
1248/// that resolves cleanly against the merged record's keys.
1249///
1250/// Returns `None` when any leaf has RLS enabled and no policy admits
1251/// the caller — the join short-circuits to an empty result.
1252fn inject_rls_into_join(
1253    runtime: &RedDBRuntime,
1254    frame: &dyn super::statement_frame::ReadFrame,
1255    mut join: crate::storage::query::ast::JoinQuery,
1256) -> Option<crate::storage::query::ast::JoinQuery> {
1257    use crate::storage::query::ast::Filter;
1258
1259    let mut policy_filters: Vec<Filter> = Vec::new();
1260    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1261        return None;
1262    }
1263    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1264        return None;
1265    }
1266
1267    if policy_filters.is_empty() {
1268        return Some(join);
1269    }
1270
1271    let combined = policy_filters
1272        .into_iter()
1273        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1274        .expect("policy_filters non-empty");
1275
1276    join.filter = Some(match join.filter.take() {
1277        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1278        None => combined,
1279    });
1280
1281    Some(join)
1282}
1283
1284/// For each `Table` leaf reachable through nested joins, append the
1285/// RLS-policy filter (combined with OR across that side's matching
1286/// policies) into `out`. Returns `false` when a side has RLS enabled
1287/// but no policy admits the caller — the join must short-circuit.
1288fn collect_join_side_policy(
1289    runtime: &RedDBRuntime,
1290    frame: &dyn super::statement_frame::ReadFrame,
1291    expr: &crate::storage::query::ast::QueryExpr,
1292    out: &mut Vec<crate::storage::query::ast::Filter>,
1293) -> bool {
1294    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1295    match expr {
1296        QueryExpr::Table(t) => {
1297            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1298                return true;
1299            }
1300            let role = frame.identity().map(|(_, role)| role);
1301            let role_str = role.map(|r| r.as_str().to_string());
1302            let policies =
1303                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1304            if policies.is_empty() {
1305                return false;
1306            }
1307            let combined = policies
1308                .into_iter()
1309                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1310                .expect("policies non-empty");
1311            out.push(combined);
1312            true
1313        }
1314        QueryExpr::Join(inner) => {
1315            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1316                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1317        }
1318        _ => true,
1319    }
1320}
1321
/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
///
/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
/// applies `WHERE` / `LIMIT` / `OFFSET` after the wrapper materialises
/// all rows. NOTE(review): `ORDER BY` is *not* applied in this function —
/// rows keep the wrapper's emission order and LIMIT/OFFSET slice that
/// order; confirm ordering is handled (or rejected) upstream for foreign
/// tables. Projections are best-effort — when the query lists explicit
/// columns we keep only those; a `SELECT *` keeps every wrapper-emitted
/// field verbatim.
///
/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
/// the runtime will pass the compiled filter down instead of post-filtering.
fn apply_foreign_table_filters(
    records: Vec<crate::storage::query::unified::UnifiedRecord>,
    query: &crate::storage::query::ast::TableQuery,
) -> crate::storage::query::unified::UnifiedResult {
    use crate::storage::query::sql_lowering::{
        effective_table_filter, effective_table_projections,
    };
    use crate::storage::query::unified::UnifiedResult;

    let filter = effective_table_filter(query);
    let projections = effective_table_projections(query);

    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
    // match native-collection queries (same operators, same NULL handling).
    let mut filtered: Vec<_> = records
        .into_iter()
        .filter(|record| match &filter {
            Some(f) => {
                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
            }
            None => true,
        })
        .collect();

    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
    if let Some(offset) = query.offset {
        let offset = offset as usize;
        if offset >= filtered.len() {
            filtered.clear();
        } else {
            filtered.drain(0..offset);
        }
    }
    if let Some(limit) = query.limit {
        filtered.truncate(limit as usize);
    }

    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
    // the wrapper's column set; an explicit list trims to those names.
    let columns: Vec<String> = if projections.is_empty() {
        filtered
            .first()
            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
            .unwrap_or_default()
    } else {
        projections
            .iter()
            .map(super::join_filter::projection_name)
            .collect()
    };

    let mut result = UnifiedResult::empty();
    result.columns = columns;
    result.records = filtered;
    result
}
1388
1389/// Collect every concrete table reference inside a `QueryExpr`.
1390///
1391/// Used by view bookkeeping (dependency tracking for materialised
1392/// invalidation) and any other rewriter that needs to know the base
1393/// tables a query pulls from. Does not descend into projections/filters;
1394/// only the `FROM` side.
1395pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1396    let mut scopes: HashSet<String> = HashSet::new();
1397    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1398    scopes.into_iter().collect()
1399}
1400
/// Set-valued form of the scope walk: gather the result-cache
/// invalidation scopes for `expr` into a fresh `HashSet` (deduplicated).
fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
    let mut scopes = HashSet::new();
    collect_query_expr_result_cache_scopes(&mut scopes, expr);
    scopes
}
1406
/// Config key selecting the result-cache backend
/// (see `RuntimeResultCacheBackend`).
const RESULT_CACHE_BACKEND_KEY: &str = "runtime.result_cache.backend";
/// Backend used when no explicit configuration is present.
const RESULT_CACHE_DEFAULT_BACKEND: &str = "legacy";
/// Namespace for blob-cache-backed result-cache entries.
const RESULT_CACHE_BLOB_NAMESPACE: &str = "runtime.result_cache";
/// Cache entry time-to-live, in seconds (enforcement happens outside
/// this chunk).
const RESULT_CACHE_TTL_SECS: u64 = 30;
/// Cap on cached entries; `trim_result_cache` evicts oldest-inserted
/// entries past this bound.
const RESULT_CACHE_MAX_ENTRIES: usize = 1000;
/// Magic prefix identifying (and versioning) serialized cache payloads.
const RESULT_CACHE_PAYLOAD_MAGIC: &[u8; 8] = b"RDRC0001";
1413
/// Storage backend for the runtime result cache, selected via
/// `RESULT_CACHE_BACKEND_KEY` (default `"legacy"`).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RuntimeResultCacheBackend {
    /// In-process map cache (the historical default).
    Legacy,
    /// Blob-cache-backed entries — presumably stored under
    /// `RESULT_CACHE_BLOB_NAMESPACE`; confirm against the read/write path.
    BlobCache,
    /// NOTE(review): likely a dual-write/compare mode between the two
    /// backends — verify against the dispatch code (not in this chunk).
    Shadow,
}
1420
1421fn trim_result_cache(
1422    map: &mut HashMap<String, RuntimeResultCacheEntry>,
1423    order: &mut std::collections::VecDeque<String>,
1424) {
1425    while map.len() > RESULT_CACHE_MAX_ENTRIES {
1426        if let Some(oldest) = order.pop_front() {
1427            map.remove(&oldest);
1428        } else {
1429            break;
1430        }
1431    }
1432}
1433
1434fn result_cache_fingerprint(result: &RuntimeQueryResult) -> String {
1435    format!(
1436        "{:?}|{}|{}|{}|{}|{:?}",
1437        result.result,
1438        result.query,
1439        result.statement,
1440        result.engine,
1441        result.affected_rows,
1442        result.statement_type
1443    )
1444}
1445
1446fn mode_to_byte(mode: crate::storage::query::modes::QueryMode) -> u8 {
1447    match mode {
1448        crate::storage::query::modes::QueryMode::Sql => 0,
1449        crate::storage::query::modes::QueryMode::Gremlin => 1,
1450        crate::storage::query::modes::QueryMode::Cypher => 2,
1451        crate::storage::query::modes::QueryMode::Sparql => 3,
1452        crate::storage::query::modes::QueryMode::Path => 4,
1453        crate::storage::query::modes::QueryMode::Natural => 5,
1454        crate::storage::query::modes::QueryMode::Unknown => 255,
1455    }
1456}
1457
1458fn mode_from_byte(byte: u8) -> Option<crate::storage::query::modes::QueryMode> {
1459    match byte {
1460        0 => Some(crate::storage::query::modes::QueryMode::Sql),
1461        1 => Some(crate::storage::query::modes::QueryMode::Gremlin),
1462        2 => Some(crate::storage::query::modes::QueryMode::Cypher),
1463        3 => Some(crate::storage::query::modes::QueryMode::Sparql),
1464        4 => Some(crate::storage::query::modes::QueryMode::Path),
1465        5 => Some(crate::storage::query::modes::QueryMode::Natural),
1466        255 => Some(crate::storage::query::modes::QueryMode::Unknown),
1467        _ => None,
1468    }
1469}
1470
/// Intern a statement/engine label back to its `&'static str` form so a
/// decoded cache payload can reuse the runtime's static tags. Returns
/// `None` for any label outside the known cache-safe set.
fn result_cache_static_str(value: &str) -> Option<&'static str> {
    const KNOWN: &[&str] = &[
        "select",
        "materialized-graph",
        "runtime-red-schema",
        "runtime-fdw",
        "runtime-table-rls",
        "runtime-table",
        "runtime-join-rls",
        "runtime-join",
        "runtime-vector",
        "runtime-hybrid",
        "runtime-secret",
        "runtime-config",
        "runtime-tenant",
        "runtime-explain",
        "runtime-tree",
        "runtime-kv",
        "runtime-queue",
    ];
    KNOWN.iter().copied().find(|&known| known == value)
}
1493
/// Append `value` as a little-endian u32; `None` (with nothing written)
/// when it does not fit in 32 bits.
fn write_u32(out: &mut Vec<u8>, value: usize) -> Option<()> {
    let encoded = u32::try_from(value).ok()?.to_le_bytes();
    out.extend_from_slice(&encoded);
    Some(())
}
1499
/// Append a length-prefixed (little-endian u32) UTF-8 string; `None`
/// (with nothing written) when the length does not fit in 32 bits.
fn write_string(out: &mut Vec<u8>, value: &str) -> Option<()> {
    let len = u32::try_from(value.len()).ok()?;
    out.extend_from_slice(&len.to_le_bytes());
    out.extend_from_slice(value.as_bytes());
    Some(())
}
1505
/// Append a length-prefixed (little-endian u32) byte slice; `None`
/// (with nothing written) when the length does not fit in 32 bits.
fn write_bytes(out: &mut Vec<u8>, value: &[u8]) -> Option<()> {
    let len = u32::try_from(value.len()).ok()?;
    out.extend_from_slice(&len.to_le_bytes());
    out.extend_from_slice(value);
    Some(())
}
1511
/// Pop one byte off the front of `input`, advancing the cursor; `None`
/// on an empty slice.
fn read_u8(input: &mut &[u8]) -> Option<u8> {
    let value = *input.first()?;
    *input = &input[1..];
    Some(value)
}
1517
/// Read a little-endian `u32` length/count field, advancing the
/// slice four bytes. `None` (without advancing) on truncated input.
fn read_u32(input: &mut &[u8]) -> Option<usize> {
    let word: [u8; 4] = input.get(..4)?.try_into().ok()?;
    *input = &input[4..];
    Some(u32::from_le_bytes(word) as usize)
}
1526
/// Read a little-endian `u64`, advancing the slice eight bytes.
/// `None` (without advancing) on truncated input.
fn read_u64(input: &mut &[u8]) -> Option<u64> {
    let word: [u8; 8] = input.get(..8)?.try_into().ok()?;
    *input = &input[8..];
    Some(u64::from_le_bytes(word))
}
1535
1536fn read_string(input: &mut &[u8]) -> Option<String> {
1537    let len = read_u32(input)?;
1538    if input.len() < len {
1539        return None;
1540    }
1541    let value = String::from_utf8(input[..len].to_vec()).ok()?;
1542    *input = &input[len..];
1543    Some(value)
1544}
1545
1546fn read_bytes<'a>(input: &mut &'a [u8]) -> Option<&'a [u8]> {
1547    let len = read_u32(input)?;
1548    if input.len() < len {
1549        return None;
1550    }
1551    let value = &input[..len];
1552    *input = &input[len..];
1553    Some(value)
1554}
1555
/// Serialize a runtime result-cache entry into the compact binary
/// payload format: magic header, then length-prefixed fields in the
/// exact order `decode_result_cache_payload` reads them back.
///
/// Returns `None` when the entry cannot round-trip through this
/// format: a pre-serialized JSON body, a statement/engine/
/// statement_type string outside the fixed vocabulary accepted by
/// `result_cache_static_str`, any record carrying graph/vector side
/// data (nodes, edges, paths, vector results), or any length that
/// overflows the `u32` prefix.
fn encode_result_cache_payload(entry: &RuntimeResultCacheEntry) -> Option<Vec<u8>> {
    let result = &entry.result;
    // Bail out up front on shapes the decoder can't reconstruct.
    if result.result.pre_serialized_json.is_some()
        || result_cache_static_str(result.statement).is_none()
        || result_cache_static_str(result.engine).is_none()
        || result_cache_static_str(result.statement_type).is_none()
        || result.result.records.iter().any(|record| {
            !record.nodes.is_empty()
                || !record.edges.is_empty()
                || !record.paths.is_empty()
                || !record.vector_results.is_empty()
        })
    {
        return None;
    }

    // Header + scalar fields. `affected_rows` is written as raw LE
    // bytes (no length prefix); the mode is a single byte.
    let mut out = Vec::new();
    out.extend_from_slice(RESULT_CACHE_PAYLOAD_MAGIC);
    write_string(&mut out, &result.query)?;
    out.push(mode_to_byte(result.mode));
    write_string(&mut out, result.statement)?;
    write_string(&mut out, result.engine)?;
    out.extend_from_slice(&result.affected_rows.to_le_bytes());
    write_string(&mut out, result.statement_type)?;

    // Column names (count-prefixed), then the four scan-stat
    // counters as raw LE integers.
    write_u32(&mut out, result.result.columns.len())?;
    for column in &result.result.columns {
        write_string(&mut out, column)?;
    }
    out.extend_from_slice(&result.result.stats.nodes_scanned.to_le_bytes());
    out.extend_from_slice(&result.result.stats.edges_scanned.to_le_bytes());
    out.extend_from_slice(&result.result.stats.rows_scanned.to_le_bytes());
    out.extend_from_slice(&result.result.stats.exec_time_us.to_le_bytes());

    // Records: count, then per record a count of (name, value) field
    // pairs; each value goes through the shared value codec and is
    // stored as a length-prefixed byte run.
    write_u32(&mut out, result.result.records.len())?;
    for record in &result.result.records {
        let fields = record.iter_fields().collect::<Vec<_>>();
        write_u32(&mut out, fields.len())?;
        for (name, value) in fields {
            write_string(&mut out, name)?;
            let mut encoded = Vec::new();
            crate::storage::schema::value_codec::encode(value, &mut encoded);
            write_bytes(&mut out, &encoded)?;
        }
    }

    // Trailing scope strings from the entry (count-prefixed).
    write_u32(&mut out, entry.scopes.len())?;
    for scope in &entry.scopes {
        write_string(&mut out, scope)?;
    }
    Some(out)
}
1608
/// Deserialize a payload produced by `encode_result_cache_payload`,
/// returning the reconstructed result plus its scope set.
///
/// Returns `None` on any structural mismatch: missing/bad magic,
/// truncated field, unknown mode byte, a string outside the fixed
/// vocabulary, a value-codec underrun, or trailing bytes after the
/// scope list.
fn decode_result_cache_payload(mut input: &[u8]) -> Option<(RuntimeQueryResult, HashSet<String>)> {
    // Magic header guards against decoding foreign blobs.
    if input.len() < RESULT_CACHE_PAYLOAD_MAGIC.len()
        || &input[..RESULT_CACHE_PAYLOAD_MAGIC.len()] != RESULT_CACHE_PAYLOAD_MAGIC
    {
        return None;
    }
    input = &input[RESULT_CACHE_PAYLOAD_MAGIC.len()..];

    // Scalar fields, mirroring the encoder's write order. The three
    // vocabulary strings are re-interned to `&'static str` via
    // `result_cache_static_str`.
    let query = read_string(&mut input)?;
    let mode = mode_from_byte(read_u8(&mut input)?)?;
    let statement = result_cache_static_str(&read_string(&mut input)?)?;
    let engine = result_cache_static_str(&read_string(&mut input)?)?;
    let affected_rows = read_u64(&mut input)?;
    let statement_type = result_cache_static_str(&read_string(&mut input)?)?;

    // Column names, then the four scan-stat counters.
    let mut columns = Vec::new();
    for _ in 0..read_u32(&mut input)? {
        columns.push(read_string(&mut input)?);
    }
    let stats = crate::storage::query::unified::QueryStats {
        nodes_scanned: read_u64(&mut input)?,
        edges_scanned: read_u64(&mut input)?,
        rows_scanned: read_u64(&mut input)?,
        exec_time_us: read_u64(&mut input)?,
    };

    // Records: each is a list of (field name, value-codec bytes)
    // pairs. The codec must consume its byte run exactly — a partial
    // read means corruption.
    let mut records = Vec::new();
    for _ in 0..read_u32(&mut input)? {
        let mut record = crate::storage::query::unified::UnifiedRecord::new();
        for _ in 0..read_u32(&mut input)? {
            let name = read_string(&mut input)?;
            let bytes = read_bytes(&mut input)?;
            let (value, used) = crate::storage::schema::value_codec::decode(bytes).ok()?;
            if used != bytes.len() {
                return None;
            }
            record.set_owned(name, value);
        }
        records.push(record);
    }

    // Trailing scope strings; any leftover bytes mean corruption.
    let mut scopes = HashSet::new();
    for _ in 0..read_u32(&mut input)? {
        scopes.insert(read_string(&mut input)?);
    }
    if !input.is_empty() {
        return None;
    }

    Some((
        RuntimeQueryResult {
            query,
            mode,
            statement,
            engine,
            result: crate::storage::query::unified::UnifiedResult {
                columns,
                records,
                stats,
                // Never cached — the encoder rejects entries that
                // carry a pre-serialized body.
                pre_serialized_json: None,
            },
            affected_rows,
            statement_type,
        },
        scopes,
    ))
}
1676
/// If `sql` starts with `EXPLAIN` followed by a non-`ALTER`,
/// non-`ASK` token, return the trimmed inner statement; otherwise
/// `None`.
///
/// `EXPLAIN ALTER FOR CREATE TABLE ...` is a separate schema-diff
/// command handled inside the normal SQL parser, and `EXPLAIN ASK`
/// is an executable read path (it runs retrieval and provider
/// selection, then short-circuits before the LLM call) — both are
/// left for the regular parser.
///
/// Matching is ASCII case-insensitive; leading whitespace before
/// either token is ignored. A bare `EXPLAIN` with nothing after it
/// also returns `None`.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    let trimmed = sql.trim_start();
    // First token runs up to the first whitespace (or end of input).
    let head_end = trimmed
        .find(|c: char| c.is_whitespace())
        .unwrap_or(trimmed.len());
    let (head, rest) = trimmed.split_at(head_end);
    if !head.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }
    let rest = rest.trim_start();
    if rest.is_empty() {
        return None;
    }
    // Peek the next token — ALTER / ASK defer to the normal parser.
    let next_end = rest.find(|c: char| c.is_whitespace()).unwrap_or(rest.len());
    let next = &rest[..next_end];
    if next.eq_ignore_ascii_case("ALTER") || next.eq_ignore_ascii_case("ASK") {
        return None;
    }
    Some(rest)
}
1715
1716/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1717/// CTE-aware parse in `execute_query` without paying for a full
1718/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1719/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1720pub(super) fn has_with_prefix(sql: &str) -> bool {
1721    let trimmed = sql.trim_start();
1722    let head_end = trimmed
1723        .find(|c: char| c.is_whitespace() || c == '(')
1724        .unwrap_or(trimmed.len());
1725    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1726}
1727
1728/// If the query is a plain SELECT whose top-level `TableQuery`
1729/// carries an `AS OF` clause, return a typed spec that the runtime
1730/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1731/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1732/// back to the connection's regular MVCC snapshot. A cheap textual
1733/// prefilter skips the parse entirely when the source doesn't
1734/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
1735fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1736    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1737}
1738
/// Same as `peek_top_level_as_of` but also returns the table name
/// targeted by the AS OF clause (when the FROM clause names a
/// concrete table). `None` for the table slot means scalar SELECT
/// or a subquery source — callers treat those as "no enforcement".
pub(super) fn peek_top_level_as_of_with_table(
    sql: &str,
) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
    // Cheap textual prefilter: a byte-window scan for "as of" (ASCII
    // case-insensitive) skips the full parse on statements that can't
    // possibly carry the clause. A false positive (the letters
    // straddling two words) just triggers a parse that finds nothing.
    if !sql
        .as_bytes()
        .windows(5)
        .any(|w| w.eq_ignore_ascii_case(b"as of"))
    {
        return None;
    }
    let parsed = crate::storage::query::parser::parse(sql).ok()?;
    // Only a plain top-level table query qualifies; joins, DML, and
    // everything else fall back to the regular snapshot path.
    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
        return None;
    };
    let clause = table.as_of?;
    // Empty or "any" table names mean there's no concrete target to
    // enforce against.
    let table_name = if table.table.is_empty() || table.table == "any" {
        None
    } else {
        Some(table.table.clone())
    };
    // Map the parser-level AS OF clause onto the runtime's typed spec
    // one variant at a time.
    let spec = match clause {
        crate::storage::query::ast::AsOfClause::Commit(h) => {
            crate::application::vcs::AsOfSpec::Commit(h)
        }
        crate::storage::query::ast::AsOfClause::Branch(b) => {
            crate::application::vcs::AsOfSpec::Branch(b)
        }
        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
            crate::application::vcs::AsOfSpec::TimestampMs(ts)
        }
        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
            crate::application::vcs::AsOfSpec::Snapshot(x)
        }
    };
    Some((spec, table_name))
}
1780
1781pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
1782    // Lowercase the bytes up to the first null/newline into a small
1783    // stack buffer for cheap contains() checks. Most SQL fits in the
1784    // buffer; longer queries fall back to owned lowercase.
1785    const VOLATILE_TOKENS: &[&str] = &[
1786        "pg_advisory_lock",
1787        "pg_try_advisory_lock",
1788        "pg_advisory_unlock",
1789        "random()",
1790        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
1791        // omitted for now — they ARE volatile but today's tests rely
1792        // on caching them. Revisit once a tighter volatility story
1793        // lands.
1794    ];
1795    let lowered = sql.to_ascii_lowercase();
1796    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
1797}
1798
1799pub(super) fn query_is_ask_statement(sql: &str) -> bool {
1800    let trimmed = sql.trim_start();
1801    let head_end = trimmed
1802        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
1803        .unwrap_or(trimmed.len());
1804    trimmed[..head_end].eq_ignore_ascii_case("ASK")
1805}
1806
/// Pick the `(global_mode, collection_mode)` pair for an expression,
/// or `None` for variants that opt out of intent-locking entirely
/// (admin statements like `SHOW CONFIG`, transaction control, tenant
/// toggles).
///
/// The first tuple element is the mode taken on the Global resource,
/// the second the mode taken on each referenced Collection.
///
/// Phase-1 contract:
/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
pub(super) fn intent_lock_modes_for(
    expr: &QueryExpr,
) -> Option<(
    crate::storage::transaction::lock::LockMode,
    crate::storage::transaction::lock::LockMode,
)> {
    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};

    match expr {
        // Reads — IS / IS.
        QueryExpr::Table(_)
        | QueryExpr::Join(_)
        | QueryExpr::Vector(_)
        | QueryExpr::Hybrid(_)
        | QueryExpr::Graph(_)
        | QueryExpr::Path(_)
        | QueryExpr::Ask(_)
        | QueryExpr::SearchCommand(_)
        | QueryExpr::GraphCommand(_)
        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),

        // Writes — IX / IX. Non-tabular mutations (vector insert,
        // graph node insert, queue push, timeseries point insert)
        // don't carry their own dispatch arm here; they ride through
        // the Insert variant or a command variant covered by the
        // read-side arm above. P1.T4 expands only the TableQuery-ish
        // writes; non-tabular kinds inherit when their DML variants
        // land in later phases.
        QueryExpr::Insert(_)
        | QueryExpr::Update(_)
        | QueryExpr::Delete(_)
        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
            Some((IntentExclusive, IntentExclusive))
        }
        // Every queue command other than Move stays read-class.
        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),

        // DDL — IX / X. A DDL against collection `c` blocks all
        // other writers + readers on `c` but leaves other collections
        // running (because Global stays IX, not X).
        QueryExpr::CreateTable(_)
        | QueryExpr::CreateCollection(_)
        | QueryExpr::CreateVector(_)
        | QueryExpr::DropTable(_)
        | QueryExpr::DropGraph(_)
        | QueryExpr::DropVector(_)
        | QueryExpr::DropDocument(_)
        | QueryExpr::DropKv(_)
        | QueryExpr::DropCollection(_)
        | QueryExpr::Truncate(_)
        | QueryExpr::AlterTable(_)
        | QueryExpr::CreateIndex(_)
        | QueryExpr::DropIndex(_)
        | QueryExpr::CreateTimeSeries(_)
        | QueryExpr::DropTimeSeries(_)
        | QueryExpr::CreateQueue(_)
        | QueryExpr::AlterQueue(_)
        | QueryExpr::DropQueue(_)
        | QueryExpr::CreateTree(_)
        | QueryExpr::DropTree(_)
        | QueryExpr::CreatePolicy(_)
        | QueryExpr::DropPolicy(_)
        | QueryExpr::CreateView(_)
        | QueryExpr::DropView(_)
        | QueryExpr::RefreshMaterializedView(_)
        | QueryExpr::CreateSchema(_)
        | QueryExpr::DropSchema(_)
        | QueryExpr::CreateSequence(_)
        | QueryExpr::DropSequence(_)
        | QueryExpr::CreateServer(_)
        | QueryExpr::DropServer(_)
        | QueryExpr::CreateForeignTable(_)
        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),

        // Admin / control — skip intent locks. `SET TENANT`,
        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
        // `VACUUM`, etc. don't touch collection data the same way
        // and the existing transaction layer already serialises the
        // pieces that matter.
        _ => None,
    }
}
1897
1898/// Best-effort collection inventory for an expression. Used to pick
1899/// `Collection(...)` resources for the intent-lock guard. Overshoots
1900/// are fine (take an extra IS, benign); undershoots leak writes past
1901/// DDL X locks, so err on the side of listing more names.
1902pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
1903    let mut out = Vec::new();
1904    walk_collections(expr, &mut out);
1905    out.sort();
1906    out.dedup();
1907    out
1908}
1909
/// Recursive helper for `collections_referenced`: push every
/// collection name `expr` mentions into `out`. Duplicates are
/// allowed — the caller sorts and dedups the accumulated list.
fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
    match expr {
        // Reads / DML name their table (or queue) directly; joins
        // recurse into both sides.
        QueryExpr::Table(t) => out.push(t.table.clone()),
        QueryExpr::Join(j) => {
            walk_collections(&j.left, out);
            walk_collections(&j.right, out);
        }
        QueryExpr::Insert(i) => out.push(i.table.clone()),
        QueryExpr::Update(u) => out.push(u.table.clone()),
        QueryExpr::Delete(d) => out.push(d.table.clone()),
        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),

        // DDL — include the target collection so DDL takes
        // `(Collection, X)` and blocks concurrent readers / writers
        // on the same collection. Other collections stay live
        // because Global is still IX.
        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
        QueryExpr::DropTable(q) => out.push(q.name.clone()),
        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
        QueryExpr::DropVector(q) => out.push(q.name.clone()),
        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
        QueryExpr::DropKv(q) => out.push(q.name.clone()),
        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
        QueryExpr::Truncate(q) => out.push(q.name.clone()),
        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
        // Index DDL locks the indexed table, not an index name.
        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
        // Queue MOVE touches both ends of the transfer.
        QueryExpr::QueueCommand(QueueCommand::Move {
            source,
            destination,
            ..
        }) => {
            out.push(source.clone());
            out.push(destination.clone());
        }
        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
        QueryExpr::CreateView(q) => out.push(q.name.clone()),
        QueryExpr::DropView(q) => out.push(q.name.clone()),
        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),

        // Vector / Hybrid / Graph / Path / commands reference
        // collections through fields whose shape varies; without a
        // uniform accessor we fall back to the global lock only —
        // benign because every runtime path still holds the global
        // mode.
        _ => {}
    }
}
1965
1966impl RedDBRuntime {
1967    pub fn in_memory() -> RedDBResult<Self> {
1968        Self::with_options(RedDBOptions::in_memory())
1969    }
1970
1971    /// Handle to the intent-lock manager for tests + introspection.
1972    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
1973    /// rather than touching the manager directly.
1974    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
1975        self.inner.lock_manager.clone()
1976    }
1977
1978    #[inline(never)]
1979    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
1980        Self::with_pool(options, ConnectionPoolConfig::default())
1981    }
1982
1983    pub fn with_pool(
1984        options: RedDBOptions,
1985        pool_config: ConnectionPoolConfig,
1986    ) -> RedDBResult<Self> {
1987        // PLAN.md Phase 9.1 — capture wall-clock before storage
1988        // open so the cold-start phase markers can be backfilled
1989        // once Lifecycle is constructed below. Storage open
1990        // encapsulates auto-restore + WAL replay; we treat the
1991        // whole window as one combined "restore" + "wal_replay"
1992        // phase split at the same boundary because the storage
1993        // layer doesn't yet emit a finer signal.
1994        let boot_open_start_ms = std::time::SystemTime::now()
1995            .duration_since(std::time::UNIX_EPOCH)
1996            .map(|d| d.as_millis() as u64)
1997            .unwrap_or(0);
1998        let db = Arc::new(
1999            RedDB::open_with_options(&options)
2000                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2001        );
2002        let result_blob_cache = crate::storage::cache::BlobCache::open_with_l2(
2003            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2004                options
2005                    .resolved_path("data.rdb")
2006                    .with_extension("result-cache.l2"),
2007            ),
2008        )
2009        .map_err(|err| {
2010            RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}"))
2011        })?;
2012        let storage_ready_ms = std::time::SystemTime::now()
2013            .duration_since(std::time::UNIX_EPOCH)
2014            .map(|d| d.as_millis() as u64)
2015            .unwrap_or(0);
2016
2017        let runtime = Self {
2018            inner: Arc::new(RuntimeInner {
2019                db,
2020                layout: PhysicalLayout::from_options(&options),
2021                indices: IndexCatalog::register_default_vector_graph(
2022                    options.has_capability(crate::api::Capability::Table),
2023                    options.has_capability(crate::api::Capability::Graph),
2024                ),
2025                pool_config,
2026                pool: Mutex::new(PoolState::default()),
2027                started_at_unix_ms: SystemTime::now()
2028                    .duration_since(UNIX_EPOCH)
2029                    .unwrap_or_default()
2030                    .as_millis(),
2031                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2032                index_store: super::index_store::IndexStore::new(),
2033                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2034                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2035                query_cache: parking_lot::RwLock::new(
2036                    crate::storage::query::planner::cache::PlanCache::new(1000),
2037                ),
2038                result_cache: parking_lot::RwLock::new((
2039                    HashMap::new(),
2040                    std::collections::VecDeque::new(),
2041                )),
2042                result_blob_cache,
2043                result_blob_entries: parking_lot::RwLock::new((
2044                    HashMap::new(),
2045                    std::collections::VecDeque::new(),
2046                )),
2047                ask_answer_cache_entries: parking_lot::RwLock::new((
2048                    HashSet::new(),
2049                    std::collections::VecDeque::new(),
2050                )),
2051                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2052                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2053                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2054                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2055                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2056                ec_worker: crate::ec::worker::EcWorker::new(),
2057                auth_store: parking_lot::RwLock::new(None),
2058                oauth_validator: parking_lot::RwLock::new(None),
2059                views: parking_lot::RwLock::new(HashMap::new()),
2060                materialized_views: parking_lot::RwLock::new(
2061                    crate::storage::cache::result::MaterializedViewCache::new(),
2062                ),
2063                snapshot_manager: Arc::new(
2064                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2065                ),
2066                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2067                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2068                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2069                lock_manager: Arc::new({
2070                    // Sourced from the matrix: Tier B key
2071                    // `concurrency.locking.deadlock_timeout_ms`
2072                    // (default 5000). Env var wins at boot so
2073                    // operators can tune without touching red_config.
2074                    let env = crate::runtime::config_overlay::collect_env_overrides();
2075                    let timeout_ms = env
2076                        .get("concurrency.locking.deadlock_timeout_ms")
2077                        .and_then(|raw| raw.parse::<u64>().ok())
2078                        .unwrap_or_else(|| {
2079                            match crate::runtime::config_matrix::default_for(
2080                                "concurrency.locking.deadlock_timeout_ms",
2081                            ) {
2082                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2083                                _ => 5000,
2084                            }
2085                        });
2086                    let cfg = crate::storage::transaction::lock::LockConfig {
2087                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2088                        ..Default::default()
2089                    };
2090                    crate::storage::transaction::lock::LockManager::new(cfg)
2091                }),
2092                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2093                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2094                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2095                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2096                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2097                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2098                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2099                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2100                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2101                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2102                    &options,
2103                )),
2104                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2105                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2106                audit_log: {
2107                    // Default audit-log path for the in-memory case
2108                    // sits in the system temp dir; persistent runs
2109                    // place it next to data.rdb.
2110                    let data_path = options
2111                        .data_path
2112                        .clone()
2113                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2114                    Arc::new(crate::runtime::audit_log::AuditLogger::for_data_path(
2115                        &data_path,
2116                    ))
2117                },
2118                lease_lifecycle: std::sync::OnceLock::new(),
2119                replica_apply_metrics: crate::replication::logical::ReplicaApplyMetrics::default(),
2120                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2121                schema_vocabulary: parking_lot::RwLock::new(
2122                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2123                ),
2124                slow_query_logger: {
2125                    // Issue #205 — slow-query sink lives in the same
2126                    // directory the audit log uses, so backup/restore
2127                    // ships them together. Threshold + sample-pct
2128                    // default conservatively (1 s, 100% sampling) so
2129                    // emitted lines are rare and complete. Operators
2130                    // tune via env / config matrix in a follow-up.
2131                    //
2132                    // `data_path` points at the primary `.rdb` *file*
2133                    // (mirrors AuditLogger::for_data_path), so we
2134                    // anchor the slow log at its parent directory.
2135                    let log_dir = options
2136                        .data_path
2137                        .as_ref()
2138                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2139                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2140                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2141                        .ok()
2142                        .and_then(|s| s.parse::<u64>().ok())
2143                        .unwrap_or(1000);
2144                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2145                        .ok()
2146                        .and_then(|s| s.parse::<u8>().ok())
2147                        .unwrap_or(100);
2148                    crate::telemetry::slow_query_logger::SlowQueryLogger::new(
2149                        crate::telemetry::slow_query_logger::SlowQueryOpts {
2150                            log_dir,
2151                            threshold_ms,
2152                            sample_pct,
2153                        },
2154                    )
2155                },
2156                kv_stats: crate::runtime::KvStatsCounters::default(),
2157                kv_tag_index: crate::runtime::KvTagIndex::default(),
2158            }),
2159        };
2160
2161        // Issue #205 — install the process-wide OperatorEvent sink so
2162        // emit sites buried in storage / replication / signal handlers
2163        // can record without threading an `&AuditLogger` through every
2164        // call stack. First registration wins; subsequent in-memory
2165        // runtimes (test harnesses) fall through to tracing+eprintln.
2166        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2167            &runtime.inner.audit_log,
2168        ));
2169
2170        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2171        // from the wall-clock captured before storage open. The
2172        // entire `RedDB::open_with_options` call covers both
2173        // auto-restore (when configured) and WAL replay. We
2174        // record both phases against the same boundary today;
2175        // a follow-up will split them once the storage layer
2176        // surfaces a finer-grained event.
2177        runtime
2178            .inner
2179            .lifecycle
2180            .set_restore_started_at_ms(boot_open_start_ms);
2181        runtime
2182            .inner
2183            .lifecycle
2184            .set_restore_ready_at_ms(storage_ready_ms);
2185        runtime
2186            .inner
2187            .lifecycle
2188            .set_wal_replay_started_at_ms(boot_open_start_ms);
2189        runtime
2190            .inner
2191            .lifecycle
2192            .set_wal_replay_ready_at_ms(storage_ready_ms);
2193
2194        let restored_cdc_lsn = runtime
2195            .inner
2196            .db
2197            .replication
2198            .as_ref()
2199            .map(|repl| {
2200                repl.logical_wal_spool
2201                    .as_ref()
2202                    .map(|spool| spool.current_lsn())
2203                    .unwrap_or(0)
2204            })
2205            .unwrap_or(0)
2206            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2207        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2208        runtime.rehydrate_snapshot_xid_floor();
2209        runtime.bootstrap_system_keyed_collections()?;
2210
2211        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2212        // tables declared via `TENANT BY (col)` survive restart. Each
2213        // entry re-registers the auto-policy and flips RLS on again.
2214        runtime.rehydrate_tenant_tables();
2215        if let Some(repl) = &runtime.inner.db.replication {
2216            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2217        }
2218
2219        // Save system info to red_config on boot
2220        {
2221            let sys = SystemInfo::collect();
2222            runtime.inner.db.store().set_config_tree(
2223                "red.system",
2224                &crate::serde_json::json!({
2225                    "pid": sys.pid,
2226                    "cpu_cores": sys.cpu_cores,
2227                    "total_memory_bytes": sys.total_memory_bytes,
2228                    "available_memory_bytes": sys.available_memory_bytes,
2229                    "os": sys.os,
2230                    "arch": sys.arch,
2231                    "hostname": sys.hostname,
2232                    "started_at": SystemTime::now()
2233                        .duration_since(UNIX_EPOCH)
2234                        .unwrap_or_default()
2235                        .as_millis() as u64
2236                }),
2237            );
2238
2239            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2240            let store = runtime.inner.db.store();
2241            if store
2242                .get_collection("red_config")
2243                .map(|m| m.query_all(|_| true).len())
2244                .unwrap_or(0)
2245                <= 10
2246            {
2247                store.set_config_tree("red.ai", &crate::json!({
2248                    "default": crate::json!({
2249                        "provider": "openai",
2250                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2251                    }),
2252                    "max_embedding_inputs": 256,
2253                    "max_prompt_batch": 256,
2254                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2255                }));
2256                store.set_config_tree(
2257                    "red.server",
2258                    &crate::json!({
2259                        "max_scan_limit": 1000,
2260                        "max_body_size": 1048576,
2261                        "read_timeout_ms": 5000,
2262                        "write_timeout_ms": 5000
2263                    }),
2264                );
2265                store.set_config_tree(
2266                    "red.storage",
2267                    &crate::json!({
2268                        "page_size": 4096,
2269                        "page_cache_capacity": 100000,
2270                        "auto_checkpoint_pages": 1000,
2271                        "snapshot_retention": 16,
2272                        "verify_checksums": true,
2273                        "segment": crate::json!({
2274                            "max_entities": 100000,
2275                            "max_bytes": 268435456_u64,
2276                            "compression_level": 6
2277                        }),
2278                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2279                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2280                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2281                    }),
2282                );
2283                store.set_config_tree(
2284                    "red.search",
2285                    &crate::json!({
2286                        "rag": crate::json!({
2287                            "max_chunks_per_source": 10,
2288                            "max_total_chunks": 25,
2289                            "similarity_threshold": 0.8,
2290                            "graph_depth": 2,
2291                            "min_relevance": 0.3
2292                        }),
2293                        "fusion": crate::json!({
2294                            "vector_weight": 0.5,
2295                            "graph_weight": 0.3,
2296                            "table_weight": 0.2,
2297                            "dedup_threshold": 0.85
2298                        })
2299                    }),
2300                );
2301                store.set_config_tree(
2302                    "red.auth",
2303                    &crate::json!({
2304                        "enabled": false,
2305                        "session_ttl_secs": 3600,
2306                        "require_auth": false
2307                    }),
2308                );
2309                store.set_config_tree(
2310                    "red.query",
2311                    &crate::json!({
2312                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2313                        "max_recursion_depth": 1000
2314                    }),
2315                );
2316                store.set_config_tree(
2317                    "red.indexes",
2318                    &crate::json!({
2319                        "auto_select": true,
2320                        "bloom_filter": crate::json!({
2321                            "enabled": true,
2322                            "false_positive_rate": 0.01,
2323                            "prune_on_scan": true
2324                        }),
2325                        "hash": crate::json!({ "enabled": true }),
2326                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2327                        "spatial": crate::json!({ "enabled": true })
2328                    }),
2329                );
2330                store.set_config_tree(
2331                    "red.memtable",
2332                    &crate::json!({
2333                        "enabled": true,
2334                        "max_bytes": 67108864_u64,
2335                        "flush_threshold": 0.75
2336                    }),
2337                );
2338                store.set_config_tree(
2339                    "red.probabilistic",
2340                    &crate::json!({
2341                        "hll_registers": 16384,
2342                        "sketch_default_width": 1000,
2343                        "sketch_default_depth": 5,
2344                        "filter_default_capacity": 100000
2345                    }),
2346                );
2347                store.set_config_tree(
2348                    "red.timeseries",
2349                    &crate::json!({
2350                        "default_chunk_size": 1024,
2351                        "compression": crate::json!({
2352                            "timestamps": "delta_of_delta",
2353                            "values": "gorilla_xor"
2354                        }),
2355                        "default_retention_days": 0
2356                    }),
2357                );
2358                store.set_config_tree(
2359                    "red.queue",
2360                    &crate::json!({
2361                        "default_max_size": 0,
2362                        "default_max_attempts": 3,
2363                        "visibility_timeout_ms": 30000,
2364                        "consumer_idle_timeout_ms": 60000
2365                    }),
2366                );
2367                store.set_config_tree(
2368                    "red.backup",
2369                    &crate::json!({
2370                        "enabled": false,
2371                        "interval_secs": 3600,
2372                        "retention_count": 24,
2373                        "upload": false,
2374                        "backend": "local"
2375                    }),
2376                );
2377                store.set_config_tree(
2378                    "red.wal",
2379                    &crate::json!({
2380                        "archive": crate::json!({
2381                            "enabled": false,
2382                            "retention_hours": 168,
2383                            "prefix": "wal/"
2384                        })
2385                    }),
2386                );
2387                store.set_config_tree(
2388                    "red.cdc",
2389                    &crate::json!({
2390                        "enabled": true,
2391                        "buffer_size": 100000
2392                    }),
2393                );
2394                store.set_config_tree(
2395                    "red.config.secret",
2396                    &crate::json!({
2397                        "auto_encrypt": true,
2398                        "auto_decrypt": true
2399                    }),
2400                );
2401            }
2402
2403            // Perf-parity config matrix: heal the Tier A (critical)
2404            // keys unconditionally on every boot. Idempotent — only
2405            // writes the default when the key is missing. Keeps
2406            // `SHOW CONFIG` showing every guarantee the operator has
2407            // (durability.mode, concurrency.locking.enabled, …) even
2408            // on long-running datadirs that predate the matrix.
2409            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2410
2411            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2412            // `storage.btree.lehman_yao` value from the matrix (env
2413            // > file > red_config > default) and publish it to the
2414            // storage layer's atomic so the B-tree read / split
2415            // paths can branch without re-reading the config on
2416            // every hot-path call.
2417            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2418            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2419            if lehman_yao {
2420                tracing::info!(
2421                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2422                );
2423            }
2424
2425            // Config file overlay — mounted `/etc/reddb/config.json`
2426            // (override path via REDDB_CONFIG_FILE). Writes keys with
2427            // write-if-absent semantics so a later user `SET CONFIG`
2428            // always wins. Missing file = silent no-op.
2429            let overlay_path = crate::runtime::config_overlay::config_file_path();
2430            let _ =
2431                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2432        }
2433
2434        // VCS ("Git for Data") — create the `red_*` metadata
2435        // collections on first boot. Idempotent: `get_or_create_collection`
2436        // is a no-op if the collection already exists.
2437        {
2438            let store = runtime.inner.db.store();
2439            for name in crate::application::vcs_collections::ALL {
2440                let _ = store.get_or_create_collection(*name);
2441            }
2442            // Seed VCS config namespace with sensible defaults on first
2443            // boot, matching the pattern used by red.ai / red.storage.
2444            store.set_config_tree(
2445                crate::application::vcs_collections::CONFIG_NAMESPACE,
2446                &crate::json!({
2447                    "default_branch": "main",
2448                    "author": crate::json!({
2449                        "name": "reddb",
2450                        "email": "reddb@localhost"
2451                    }),
2452                    "protected_branches": crate::json!(["main"]),
2453                    "closure": crate::json!({
2454                        "enabled": true,
2455                        "lazy": true
2456                    }),
2457                    "merge": crate::json!({
2458                        "default_strategy": "auto",
2459                        "fast_forward": true
2460                    })
2461                }),
2462            );
2463        }
2464
2465        // Migrations — create the `red_migrations` / `red_migration_deps`
2466        // system collections on first boot. Idempotent.
2467        {
2468            let store = runtime.inner.db.store();
2469            for name in crate::application::migration_collections::ALL {
2470                let _ = store.get_or_create_collection(*name);
2471            }
2472        }
2473
2474        // Start background maintenance thread (context index refresh +
2475        // session purge). Held by a WEAK reference to `RuntimeInner`
2476        // so dropping the last `RedDBRuntime` handle actually releases
2477        // the underlying Arc<Pager> (and its file lock). Polling at
2478        // 200ms means shutdown latency is bounded; the real 60-second
2479        // work cadence is tracked independently via a `last_work`
2480        // timestamp.
2481        //
2482        // The previous version captured `rt = runtime.clone()` by
2483        // strong reference and ran an unterminated `loop`, which held
2484        // Arc<RuntimeInner> forever — reopening a persistent database
2485        // in the same process failed with "Database is locked" because
2486        // the pager could never drop. See the regression test
2487        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2488        {
2489            let weak = Arc::downgrade(&runtime.inner);
2490            std::thread::Builder::new()
2491                .name("reddb-maintenance".into())
2492                .spawn(move || {
2493                    let tick = std::time::Duration::from_millis(200);
2494                    let work_interval = std::time::Duration::from_secs(60);
2495                    let mut last_work = std::time::Instant::now();
2496                    loop {
2497                        std::thread::sleep(tick);
2498                        let Some(inner) = weak.upgrade() else {
2499                            // All strong references dropped — the
2500                            // runtime is gone, exit cleanly.
2501                            break;
2502                        };
2503                        if last_work.elapsed() >= work_interval {
2504                            let _stats = inner.db.store().context_index().stats();
2505                            last_work = std::time::Instant::now();
2506                        }
2507                    }
2508                })
2509                .ok();
2510        }
2511
2512        // Start backup scheduler if enabled via red_config
2513        {
2514            let store = runtime.inner.db.store();
2515            let mut backup_enabled = false;
2516            let mut backup_interval = 3600u64;
2517
2518            if let Some(manager) = store.get_collection("red_config") {
2519                manager.for_each_entity(|entity| {
2520                    if let Some(row) = entity.data.as_row() {
2521                        let key = row.get_field("key").and_then(|v| match v {
2522                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2523                            _ => None,
2524                        });
2525                        let val = row.get_field("value");
2526                        if key == Some("red.config.backup.enabled") {
2527                            backup_enabled = match val {
2528                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2529                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2530                                _ => false,
2531                            };
2532                        } else if key == Some("red.config.backup.interval_secs") {
2533                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2534                                backup_interval = *n as u64;
2535                            }
2536                        }
2537                    }
2538                    true
2539                });
2540            }
2541
2542            if backup_enabled {
2543                runtime.inner.backup_scheduler.set_interval(backup_interval);
2544                let rt = runtime.clone();
2545                runtime
2546                    .inner
2547                    .backup_scheduler
2548                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
2549            }
2550        }
2551
2552        // Load EC registry from red_config and start worker
2553        {
2554            runtime
2555                .inner
2556                .ec_registry
2557                .load_from_config_store(runtime.inner.db.store().as_ref());
2558            if !runtime.inner.ec_registry.async_configs().is_empty() {
2559                runtime.inner.ec_worker.start(
2560                    Arc::clone(&runtime.inner.ec_registry),
2561                    Arc::clone(&runtime.inner.db.store()),
2562                );
2563            }
2564        }
2565
2566        if let crate::replication::ReplicationRole::Replica { primary_addr } =
2567            runtime.inner.db.options().replication.role.clone()
2568        {
2569            let rt = runtime.clone();
2570            std::thread::Builder::new()
2571                .name("reddb-replica".into())
2572                .spawn(move || rt.run_replica_loop(primary_addr))
2573                .ok();
2574        }
2575
2576        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
2577        // boot stage above has completed (WAL replay, restore-from-
2578        // remote, replica-loop spawn). Health probes flip from 503 to
2579        // 200 here; shutdown begins from this state.
2580        runtime.inner.lifecycle.mark_ready();
2581
2582        Ok(runtime)
2583    }
2584
2585    fn rehydrate_snapshot_xid_floor(&self) {
2586        let store = self.inner.db.store();
2587        for collection in store.list_collections() {
2588            let Some(manager) = store.get_collection(&collection) else {
2589                continue;
2590            };
2591            for entity in manager.query_all(|_| true) {
2592                self.inner
2593                    .snapshot_manager
2594                    .observe_committed_xid(entity.xmin);
2595                self.inner
2596                    .snapshot_manager
2597                    .observe_committed_xid(entity.xmax);
2598            }
2599        }
2600    }
2601
2602    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
2603        let mut changed = false;
2604        for (name, model) in [
2605            ("red.config", crate::catalog::CollectionModel::Config),
2606            ("red.vault", crate::catalog::CollectionModel::Vault),
2607        ] {
2608            if self.inner.db.store().get_collection(name).is_none() {
2609                self.inner.db.store().get_or_create_collection(name);
2610                changed = true;
2611            }
2612            if self.inner.db.collection_contract(name).is_none() {
2613                self.inner
2614                    .db
2615                    .save_collection_contract(system_keyed_collection_contract(name, model))
2616                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
2617                changed = true;
2618            }
2619        }
2620        if changed {
2621            self.inner
2622                .db
2623                .persist_metadata()
2624                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2625        }
2626        Ok(())
2627    }
2628
    /// Shared handle to the underlying database. Cheap to call — this
    /// is an `Arc` refcount bump, not a copy of any database state.
    pub fn db(&self) -> Arc<RedDB> {
        Arc::clone(&self.inner.db)
    }
2632
    /// Direct access to the runtime's secondary-index store.
    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
    /// wire bulk) that need to push new rows through the per-index
    /// maintenance hook after `store.bulk_insert` returns.
    ///
    /// Returns a borrow tied to `self`, so the reference cannot
    /// outlive the runtime handle — no `Arc` clone is required.
    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
        &self.inner.index_store
    }
2640
    /// Apply a DDL event to the schema-vocabulary reverse index
    /// (issue #120). Called by DDL execution paths after the catalog
    /// mutation has succeeded so the index never holds entries for
    /// half-applied DDL.
    ///
    /// Takes the vocabulary's write lock for the duration of the
    /// `on_ddl` call; the lock is released when this method returns.
    pub(crate) fn schema_vocabulary_apply(
        &self,
        event: crate::runtime::schema_vocabulary::DdlEvent,
    ) {
        self.inner.schema_vocabulary.write().on_ddl(event);
    }
2651
    /// Lookup `token` in the schema-vocabulary reverse index. Returns
    /// an owned `Vec<VocabHit>` because the underlying read lock
    /// cannot be borrowed across the call boundary; the slice from
    /// `SchemaVocabulary::lookup` is cloned per hit.
    ///
    /// An unknown token yields an empty vec rather than an error.
    pub fn schema_vocabulary_lookup(
        &self,
        token: &str,
    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
        self.inner.schema_vocabulary.read().lookup(token).to_vec()
    }
2662
    /// Inject an AuthStore into the runtime. Called by server boot
    /// after the vault has been bootstrapped, so that `Value::Secret`
    /// auto-encrypt/decrypt can reach the vault AES key.
    ///
    /// Replaces any previously-installed store; the old `Arc` (if
    /// any) is dropped when the write lock releases.
    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
        *self.inner.auth_store.write() = Some(store);
    }
2669
2670    /// Read a vault KV secret from the configured AuthStore, if present.
2671    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
2672        self.inner
2673            .auth_store
2674            .read()
2675            .as_ref()
2676            .and_then(|store| store.vault_kv_get(key))
2677    }
2678
2679    /// Write a vault KV secret and fail if the encrypted vault write is
2680    /// unavailable or cannot be made durable.
2681    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
2682        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
2683            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
2684        })?;
2685        store
2686            .vault_kv_try_set(key, value)
2687            .map_err(|err| RedDBError::Query(err.to_string()))
2688    }
2689
    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
    /// wire transports try OAuth JWT validation before falling back to
    /// the local AuthStore lookup. Pass `None` to disable.
    ///
    /// Overwrites any previously-configured validator.
    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
        *self.inner.oauth_validator.write() = validator;
    }
2696
    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
    /// Hot path: called per HTTP request when an Authorization header
    /// is present, so we hand back a cheap Arc clone.
    ///
    /// `None` means no validator has been injected (OAuth disabled).
    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
        self.inner.oauth_validator.read().clone()
    }
2703
2704    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
2705    /// store is wired and a key has been generated. Used by the
2706    /// `Value::Secret` encrypt/decrypt pipeline.
2707    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
2708        let guard = self.inner.auth_store.read();
2709        guard.as_ref().and_then(|s| s.vault_secret_key())
2710    }
2711
2712    /// Resolve a boolean flag from `red_config`. Defaults to `default`
2713    /// when the key is missing or not coercible. If the same key has
2714    /// been written multiple times (SET CONFIG appends new rows), the
2715    /// most recent entity wins. Env-var overrides
2716    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
2717    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
2718        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2719            if let Some(crate::storage::schema::Value::Boolean(b)) =
2720                crate::runtime::config_overlay::coerce_env_value(key, raw)
2721            {
2722                return b;
2723            }
2724        }
2725        let store = self.inner.db.store();
2726        let Some(manager) = store.get_collection("red_config") else {
2727            return default;
2728        };
2729        let mut result = default;
2730        let mut latest_id: u64 = 0;
2731        manager.for_each_entity(|entity| {
2732            if let Some(row) = entity.data.as_row() {
2733                let entry_key = row.get_field("key").and_then(|v| match v {
2734                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2735                    _ => None,
2736                });
2737                if entry_key == Some(key) {
2738                    let id = entity.id.raw();
2739                    if id >= latest_id {
2740                        latest_id = id;
2741                        result = match row.get_field("value") {
2742                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
2743                            Some(crate::storage::schema::Value::Text(s)) => {
2744                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
2745                            }
2746                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
2747                            _ => default,
2748                        };
2749                    }
2750                }
2751            }
2752            true
2753        });
2754        result
2755    }
2756
2757    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
2758        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2759            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
2760                crate::runtime::config_overlay::coerce_env_value(key, raw)
2761            {
2762                return n;
2763            }
2764        }
2765        let store = self.inner.db.store();
2766        let Some(manager) = store.get_collection("red_config") else {
2767            return default;
2768        };
2769        let mut result = default;
2770        let mut latest_id: u64 = 0;
2771        manager.for_each_entity(|entity| {
2772            if let Some(row) = entity.data.as_row() {
2773                let entry_key = row.get_field("key").and_then(|v| match v {
2774                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2775                    _ => None,
2776                });
2777                if entry_key == Some(key) {
2778                    let id = entity.id.raw();
2779                    if id >= latest_id {
2780                        latest_id = id;
2781                        result = match row.get_field("value") {
2782                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
2783                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
2784                            Some(crate::storage::schema::Value::Text(s)) => {
2785                                s.parse::<u64>().unwrap_or(default)
2786                            }
2787                            _ => default,
2788                        };
2789                    }
2790                }
2791            }
2792            true
2793        });
2794        result
2795    }
2796
    /// Resolve an `f64` value from `red_config`. Precedence: env-var
    /// override (`REDDB_<UP_DOTTED>`, parsed directly as f64) > the
    /// most recently written `red_config` row for `key` > `default`.
    /// `Float`, `Integer`, `UnsignedInteger`, and parseable `Text`
    /// values all coerce; anything else yields `default`.
    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
        // Highest precedence: environment override, if it parses.
        if let Some(raw) = self.inner.env_config_overrides.get(key) {
            if let Ok(n) = raw.parse::<f64>() {
                return n;
            }
        }
        let store = self.inner.db.store();
        let Some(manager) = store.get_collection("red_config") else {
            return default;
        };
        // SET CONFIG appends rows rather than updating in place, so the
        // matching row with the highest entity id is the live value.
        let mut result = default;
        let mut latest_id: u64 = 0;
        manager.for_each_entity(|entity| {
            if let Some(row) = entity.data.as_row() {
                let entry_key = row.get_field("key").and_then(|v| match v {
                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
                    _ => None,
                });
                if entry_key == Some(key) {
                    let id = entity.id.raw();
                    if id >= latest_id {
                        latest_id = id;
                        result = match row.get_field("value") {
                            Some(crate::storage::schema::Value::Float(n)) => *n,
                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
                            Some(crate::storage::schema::Value::Text(s)) => {
                                s.parse::<f64>().unwrap_or(default)
                            }
                            _ => default,
                        };
                    }
                }
            }
            true
        });
        result
    }
2835
2836    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
2837        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2838            return raw.clone();
2839        }
2840        let store = self.inner.db.store();
2841        let Some(manager) = store.get_collection("red_config") else {
2842            return default.to_string();
2843        };
2844        let mut result = default.to_string();
2845        let mut latest_id: u64 = 0;
2846        manager.for_each_entity(|entity| {
2847            if let Some(row) = entity.data.as_row() {
2848                let entry_key = row.get_field("key").and_then(|v| match v {
2849                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2850                    _ => None,
2851                });
2852                if entry_key == Some(key) {
2853                    let id = entity.id.raw();
2854                    if id >= latest_id {
2855                        latest_id = id;
2856                        if let Some(crate::storage::schema::Value::Text(value)) =
2857                            row.get_field("value")
2858                        {
2859                            result = value.to_string();
2860                        }
2861                    }
2862                }
2863            }
2864            true
2865        });
2866        result
2867    }
2868
    /// Fetch the current metadata for `entity_id` in `collection` and
    /// convert it to JSON via `metadata_to_json`. Returns `None` when
    /// the store has no metadata for that entity.
    fn latest_metadata_for(
        &self,
        collection: &str,
        entity_id: u64,
    ) -> Option<crate::serde_json::Value> {
        self.inner
            .db
            .store()
            .get_metadata(collection, EntityId::new(entity_id))
            .map(|metadata| metadata_to_json(&metadata))
    }
2880
    /// Persist the replica's last-applied LSN under the
    /// `red.replication` config namespace so it survives restarts.
    fn persist_replica_lsn(&self, lsn: u64) {
        self.inner.db.store().set_config_tree(
            "red.replication",
            &crate::json!({
                "last_applied_lsn": lsn
            }),
        );
    }
2889
    /// Persist a replication-health snapshot under `red.replication`:
    /// current state string, last error text, the most recent primary
    /// LSN and oldest-available LSN observed (both written as 0 when
    /// unknown), plus a wall-clock `updated_at_unix_ms` timestamp.
    fn persist_replication_health(
        &self,
        state: &str,
        last_error: &str,
        primary_lsn: Option<u64>,
        oldest_available_lsn: Option<u64>,
    ) {
        self.inner.db.store().set_config_tree(
            "red.replication",
            &crate::json!({
                "state": state,
                "last_error": last_error,
                "last_seen_primary_lsn": primary_lsn.unwrap_or(0),
                "last_seen_oldest_lsn": oldest_available_lsn.unwrap_or(0),
                // Clock-before-epoch is treated as 0 rather than panicking.
                "updated_at_unix_ms": SystemTime::now()
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or_default()
                    .as_millis() as u64
            }),
        );
    }
2911
2912    /// Whether `SECRET('...')` literals should be encrypted with the
2913    /// vault AES key on INSERT. Default `true`.
2914    pub(crate) fn secret_auto_encrypt(&self) -> bool {
2915        self.config_bool("red.config.secret.auto_encrypt", true)
2916    }
2917
2918    /// Whether `Value::Secret` columns should be decrypted back to
2919    /// plaintext on SELECT when the vault is unsealed. Default `true`.
2920    /// Turning this off keeps secrets masked as `***` even while the
2921    /// vault is open — useful for audit trails or read-only exports.
2922    pub(crate) fn secret_auto_decrypt(&self) -> bool {
2923        self.config_bool("red.config.secret.auto_decrypt", true)
2924    }
2925
2926    /// Walk every record in `result` and swap `Value::Secret(bytes)`
2927    /// for the decrypted plaintext when the runtime has the vault
2928    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
2929    /// key is missing, the vault is sealed, or auto_decrypt is off,
2930    /// secrets are left as `Value::Secret` which every formatter
2931    /// (Display, JSON) already masks as `***`.
2932    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
2933        if !self.secret_auto_decrypt() {
2934            return;
2935        }
2936        let Some(key) = self.secret_aes_key() else {
2937            return;
2938        };
2939        for record in result.result.records.iter_mut() {
2940            for value in record.values_mut() {
2941                if let Value::Secret(ref bytes) = value {
2942                    if let Some(plain) =
2943                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
2944                    {
2945                        if let Ok(text) = String::from_utf8(plain) {
2946                            *value = Value::text(text);
2947                        }
2948                    }
2949                }
2950            }
2951        }
2952    }
2953
2954    /// Emit a CDC change event and replicate to WAL buffer.
2955    /// Create a `MutationEngine` bound to this runtime.
2956    ///
2957    /// The engine is cheap to construct (no allocation) and should be
2958    /// dropped after `apply` returns. Use this from application-layer
2959    /// `create_row` / `create_rows_batch` instead of calling
2960    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
2961    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
2962        crate::runtime::mutation::MutationEngine::new(self)
2963    }
2964
2965    /// Public-mutation gate snapshot (PLAN.md W1).
2966    ///
2967    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
2968    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
2969    /// maintenance, serverless lifecycle) call `check_write` before
2970    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
2971    /// instance running as a replica or with `options.read_only =
2972    /// true`. The replica internal logical-WAL apply path reaches into
2973    /// the store directly and never calls this method, so legitimate
2974    /// replica catch-up still works.
2975    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
2976        self.inner.write_gate.check(kind)
2977    }
2978
    /// Read-only handle to the gate, useful for transports that want
    /// to surface the policy in health/status output without taking on
    /// a dependency on the concrete enum.
    ///
    /// Returns a borrow tied to `self`; callers needing an owned,
    /// thread-movable handle use [`Self::write_gate_arc`] instead.
    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
        &self.inner.write_gate
    }
2985
    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
    /// admin/shutdown, and signal handlers consult this single
    /// state machine.
    ///
    /// This is the same state machine `graceful_shutdown` drives
    /// through `begin_shutdown` / `finish_shutdown`.
    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
        &self.inner.lifecycle
    }
2992
    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
    ///
    /// Enforcement lives in `check_batch_size` / `check_db_size`;
    /// this accessor only exposes the configured values.
    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
        &self.inner.resource_limits
    }
2997
    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
    ///
    /// Borrowed accessor; see [`Self::audit_log_arc`] for an owned
    /// handle that can outlive the runtime's stack frame.
    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
        &self.inner.audit_log
    }
3002
3003    /// Shared `Arc` to the audit logger — used by collaborators (the
3004    /// lease lifecycle, future request-context plumbing) that need to
3005    /// keep the logger alive past the runtime's stack frame.
3006    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
3007        Arc::clone(&self.inner.audit_log)
3008    }
3009
3010    /// Shared `Arc` to the write gate. Same rationale as
3011    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
3012    /// thread) need a clone-cheap handle they can move into a
3013    /// background thread.
3014    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
3015        Arc::clone(&self.inner.write_gate)
3016    }
3017
    /// Serverless writer-lease state machine. `None` when the operator
    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
    ///
    /// Backed by a set-at-most-once cell populated via
    /// [`Self::set_lease_lifecycle`].
    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
        self.inner.lease_lifecycle.get()
    }
3023
3024    /// Install the lease lifecycle. Idempotent; subsequent calls
3025    /// return the previously stored value untouched.
3026    pub fn set_lease_lifecycle(
3027        &self,
3028        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
3029    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3030        self.inner.lease_lifecycle.set(lifecycle)
3031    }
3032
3033    /// Reject the call when the requested batch size exceeds
3034    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
3035    /// shaped so the HTTP layer can map it to 413 Payload Too
3036    /// Large (PLAN.md Phase 4.1).
3037    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
3038        if self.inner.resource_limits.batch_size_exceeded(requested) {
3039            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
3040            return Err(RedDBError::QuotaExceeded(format!(
3041                "max_batch_size:{requested}:{max}"
3042            )));
3043        }
3044        Ok(())
3045    }
3046
3047    /// Reject the call when the local DB file exceeds
3048    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
3049    /// the cost is a single `stat()` syscall, negligible against the
3050    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
3051    /// for HTTP 507 Insufficient Storage.
3052    pub fn check_db_size(&self) -> RedDBResult<()> {
3053        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
3054            return Ok(());
3055        };
3056        if limit == 0 {
3057            return Ok(());
3058        }
3059        let Some(path) = self.inner.db.path() else {
3060            return Ok(());
3061        };
3062        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
3063        if current > limit {
3064            return Err(RedDBError::QuotaExceeded(format!(
3065                "max_db_size_bytes:{current}:{limit}"
3066            )));
3067        }
3068        Ok(())
3069    }
3070
    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
    ///
    /// Steps, in order, all idempotent across re-entrant calls:
    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
    ///      observe `Stopped` after first finishes).
    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
    ///      every acked write is durable on disk.
    ///   3. If `backup_on_shutdown == true` and a remote backend is
    ///      configured, run a synchronous `trigger_backup()` so the
    ///      remote head reflects the final state.
    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
    ///      return the cached report without re-running anything.
    ///
    /// On any error, the runtime is still marked `Stopped` so the
    /// process can exit; the caller logs the error context but does
    /// not retry the same shutdown — the operator can inspect the
    /// report fields to see which step failed.
    pub fn graceful_shutdown(
        &self,
        backup_on_shutdown: bool,
    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
        if !self.inner.lifecycle.begin_shutdown() {
            // Someone else already shut down (or is in flight). Return
            // the cached report so the HTTP caller and SIGTERM handler
            // get the same idempotent answer.
            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
        }

        // Stamp the start before any step so `duration_ms` covers the
        // flush / upload / backup work below. Clock errors degrade to 0.
        let started_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_millis() as u64)
            .unwrap_or(0);
        let mut report = crate::runtime::lifecycle::ShutdownReport {
            started_at_ms: started_ms,
            ..Default::default()
        };

        // Flush WAL + run any pending checkpoint. Local fsync is
        // unconditional — even a lease-lost replica needs its WAL on
        // disk before exit so a future restore has the latest tail.
        // The remote upload is gated separately so a lost-lease writer
        // doesn't clobber the new holder's state on its way out.
        let flush_res = self.inner.db.flush_local_only();
        report.flushed_wal = flush_res.is_ok();
        report.final_checkpoint = flush_res.is_ok();
        if let Err(err) = &flush_res {
            tracing::error!(
                target: "reddb::lifecycle",
                error = %err,
                "graceful_shutdown: local flush failed"
            );
        } else if let Err(lease_err) =
            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
        {
            tracing::warn!(
                target: "reddb::serverless::lease",
                error = %lease_err,
                "graceful_shutdown: remote upload skipped — lease not held"
            );
        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
            tracing::error!(
                target: "reddb::lifecycle",
                error = %err,
                "graceful_shutdown: remote upload failed"
            );
        }

        // Optional final backup. Skipped silently when no remote
        // backend is configured — `trigger_backup()` returns Err
        // anyway in that case, but logging it as a shutdown failure
        // would be misleading on a standalone (no-backend) runtime.
        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
            // The trigger_backup gate now reads `WriteKind::Backup`,
            // which a replica/read_only instance refuses. That's
            // intentional — replicas don't drive backups; only the
            // primary does. We still want shutdown to flush its WAL
            // even if the backup branch is gated off.
            match self.trigger_backup() {
                Ok(result) => {
                    report.backup_uploaded = result.uploaded;
                }
                Err(err) => {
                    // Backup failure is logged, never returned —
                    // shutdown must still complete and mark Stopped.
                    tracing::warn!(
                        target: "reddb::lifecycle",
                        error = %err,
                        "graceful_shutdown: final backup skipped"
                    );
                }
            }
        }

        // On clock error fall back to `started_ms` so `duration_ms`
        // saturates at zero instead of underflowing.
        let completed_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_millis() as u64)
            .unwrap_or(started_ms);
        report.completed_at_ms = completed_ms;
        report.duration_ms = completed_ms.saturating_sub(started_ms);

        // Publish the report and move to `Stopped`; the re-entrant
        // early return above serves this cached copy.
        self.inner.lifecycle.finish_shutdown(report.clone());
        Ok(report)
    }
3172
3173    /// Emit a CDC record without invalidating the result cache.
3174    ///
3175    /// Used by `MutationEngine::append_batch` which calls
3176    /// `invalidate_result_cache` once for the whole batch before this
3177    /// loop, avoiding N write-lock acquisitions.
3178    pub(crate) fn cdc_emit_no_cache_invalidate(
3179        &self,
3180        operation: crate::replication::cdc::ChangeOperation,
3181        collection: &str,
3182        entity_id: u64,
3183        entity_kind: &str,
3184    ) -> u64 {
3185        let lsn = self
3186            .inner
3187            .cdc
3188            .emit(operation, collection, entity_id, entity_kind);
3189
3190        // Append to logical WAL replication buffer (if primary mode)
3191        if let Some(ref primary) = self.inner.db.replication {
3192            let store = self.inner.db.store();
3193            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3194                None
3195            } else {
3196                store.get(collection, EntityId::new(entity_id))
3197            };
3198            let record = ChangeRecord {
3199                lsn,
3200                timestamp: SystemTime::now()
3201                    .duration_since(UNIX_EPOCH)
3202                    .unwrap_or_default()
3203                    .as_millis() as u64,
3204                operation,
3205                collection: collection.to_string(),
3206                entity_id,
3207                entity_kind: entity_kind.to_string(),
3208                entity_bytes: entity
3209                    .as_ref()
3210                    .map(|e| UnifiedStore::serialize_entity(e, store.format_version())),
3211                metadata: self.latest_metadata_for(collection, entity_id),
3212            };
3213            let encoded = record.encode();
3214            primary.wal_buffer.append(record.lsn, encoded.clone());
3215            if let Some(spool) = &primary.logical_wal_spool {
3216                let _ = spool.append(record.lsn, &encoded);
3217            }
3218        }
3219        lsn
3220    }
3221
3222    pub(crate) fn cdc_emit_insert_batch_no_cache_invalidate(
3223        &self,
3224        collection: &str,
3225        ids: &[EntityId],
3226        entity_kind: &str,
3227    ) -> Vec<u64> {
3228        if ids.is_empty() {
3229            return Vec::new();
3230        }
3231
3232        // Without logical replication, CDC only needs the in-memory event
3233        // ring. Reserve all LSNs and push the batch under one mutex instead
3234        // of taking the ring lock once per inserted row.
3235        if self.inner.db.replication.is_none() {
3236            return self.inner.cdc.emit_batch_same_collection(
3237                crate::replication::cdc::ChangeOperation::Insert,
3238                collection,
3239                entity_kind,
3240                ids.iter().map(|id| id.raw()),
3241            );
3242        }
3243
3244        // Replication needs one logical-WAL record per entity with the
3245        // serialized entity bytes, so keep the existing per-row path.
3246        ids.iter()
3247            .map(|id| {
3248                self.cdc_emit_no_cache_invalidate(
3249                    crate::replication::cdc::ChangeOperation::Insert,
3250                    collection,
3251                    id.raw(),
3252                    entity_kind,
3253                )
3254            })
3255            .collect()
3256    }
3257
3258    pub fn cdc_emit(
3259        &self,
3260        operation: crate::replication::cdc::ChangeOperation,
3261        collection: &str,
3262        entity_id: u64,
3263        entity_kind: &str,
3264    ) -> u64 {
3265        let lsn = self
3266            .inner
3267            .cdc
3268            .emit(operation, collection, entity_id, entity_kind);
3269        // Perf: prior to this we called `invalidate_result_cache()`
3270        // which wipes EVERY cached query, across every table, under
3271        // a write lock — turning each INSERT into a serialisation
3272        // point for all readers. Swap to the per-table variant so
3273        // unrelated query caches survive.
3274        self.invalidate_result_cache_for_table(collection);
3275
3276        // Append to logical WAL replication buffer (if primary mode)
3277        if let Some(ref primary) = self.inner.db.replication {
3278            let store = self.inner.db.store();
3279            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3280                None
3281            } else {
3282                store.get(collection, EntityId::new(entity_id))
3283            };
3284            let record = ChangeRecord {
3285                lsn,
3286                timestamp: SystemTime::now()
3287                    .duration_since(UNIX_EPOCH)
3288                    .unwrap_or_default()
3289                    .as_millis() as u64,
3290                operation,
3291                collection: collection.to_string(),
3292                entity_id,
3293                entity_kind: entity_kind.to_string(),
3294                entity_bytes: entity
3295                    .as_ref()
3296                    .map(|entity| UnifiedStore::serialize_entity(entity, store.format_version())),
3297                metadata: self.latest_metadata_for(collection, entity_id),
3298            };
3299            let encoded = record.encode();
3300            primary.wal_buffer.append(record.lsn, encoded.clone());
3301            if let Some(spool) = &primary.logical_wal_spool {
3302                let _ = spool.append(record.lsn, &encoded);
3303            }
3304        }
3305        lsn
3306    }
3307
3308    pub(crate) fn cdc_emit_kv(
3309        &self,
3310        operation: crate::replication::cdc::ChangeOperation,
3311        collection: &str,
3312        key: &str,
3313        entity_id: u64,
3314        before: Option<crate::json::Value>,
3315        after: Option<crate::json::Value>,
3316    ) -> u64 {
3317        let lsn = self
3318            .inner
3319            .cdc
3320            .emit_kv(operation, collection, key, entity_id, before, after);
3321        self.inner.kv_stats.incr_watch_events_emitted();
3322        self.invalidate_result_cache_for_table(collection);
3323        lsn
3324    }
3325
3326    pub(crate) fn record_kv_watch_event(
3327        &self,
3328        operation: crate::replication::cdc::ChangeOperation,
3329        collection: &str,
3330        key: &str,
3331        entity_id: u64,
3332        before: Option<crate::json::Value>,
3333        after: Option<crate::json::Value>,
3334    ) {
3335        if self.current_xid().is_some() {
3336            let conn_id = current_connection_id();
3337            let event = crate::replication::cdc::KvWatchEvent {
3338                collection: collection.to_string(),
3339                key: key.to_string(),
3340                op: operation,
3341                before,
3342                after,
3343                lsn: 0,
3344                committed_at: 0,
3345                dropped_event_count: 0,
3346            };
3347            self.inner
3348                .pending_kv_watch_events
3349                .write()
3350                .entry(conn_id)
3351                .or_default()
3352                .push(event);
3353            return;
3354        }
3355
3356        self.cdc_emit_kv(operation, collection, key, entity_id, before, after);
3357    }
3358
3359    pub(crate) fn cdc_emit_prebuilt(
3360        &self,
3361        operation: crate::replication::cdc::ChangeOperation,
3362        collection: &str,
3363        entity: &UnifiedEntity,
3364        entity_kind: &str,
3365        metadata: Option<&crate::storage::Metadata>,
3366        invalidate_cache: bool,
3367    ) -> u64 {
3368        self.cdc_emit_prebuilt_with_columns(
3369            operation,
3370            collection,
3371            entity,
3372            entity_kind,
3373            metadata,
3374            invalidate_cache,
3375            None,
3376        )
3377    }
3378
3379    /// `cdc_emit_prebuilt` plus the list of column names whose values
3380    /// changed on this update. Callers that have already computed a
3381    /// `RowDamageVector` pass it here so downstream CDC consumers can
3382    /// filter events by touched column without re-diffing.
3383    /// `changed_columns` is only meaningful for `Update` operations —
3384    /// insert and delete events ignore it.
3385    pub(crate) fn cdc_emit_prebuilt_with_columns(
3386        &self,
3387        operation: crate::replication::cdc::ChangeOperation,
3388        collection: &str,
3389        entity: &UnifiedEntity,
3390        entity_kind: &str,
3391        metadata: Option<&crate::storage::Metadata>,
3392        invalidate_cache: bool,
3393        changed_columns: Option<Vec<String>>,
3394    ) -> u64 {
3395        if invalidate_cache {
3396            self.invalidate_result_cache();
3397        }
3398
3399        let lsn = self.inner.cdc.emit_with_columns(
3400            operation,
3401            collection,
3402            entity.id.raw(),
3403            entity_kind,
3404            changed_columns,
3405        );
3406
3407        if let Some(ref primary) = self.inner.db.replication {
3408            let store = self.inner.db.store();
3409            let record = ChangeRecord {
3410                lsn,
3411                timestamp: SystemTime::now()
3412                    .duration_since(UNIX_EPOCH)
3413                    .unwrap_or_default()
3414                    .as_millis() as u64,
3415                operation,
3416                collection: collection.to_string(),
3417                entity_id: entity.id.raw(),
3418                entity_kind: entity_kind.to_string(),
3419                entity_bytes: Some(UnifiedStore::serialize_entity(
3420                    entity,
3421                    store.format_version(),
3422                )),
3423                metadata: metadata
3424                    .map(metadata_to_json)
3425                    .or_else(|| self.latest_metadata_for(collection, entity.id.raw())),
3426            };
3427            let encoded = record.encode();
3428            primary.wal_buffer.append(record.lsn, encoded.clone());
3429            if let Some(spool) = &primary.logical_wal_spool {
3430                let _ = spool.append(record.lsn, &encoded);
3431            }
3432        }
3433
3434        lsn
3435    }
3436
3437    pub(crate) fn cdc_emit_prebuilt_batch<'a, I>(
3438        &self,
3439        operation: crate::replication::cdc::ChangeOperation,
3440        entity_kind: &str,
3441        items: I,
3442        invalidate_cache: bool,
3443    ) where
3444        I: IntoIterator<
3445            Item = (
3446                &'a str,
3447                &'a UnifiedEntity,
3448                Option<&'a crate::storage::Metadata>,
3449            ),
3450        >,
3451    {
3452        let items: Vec<(&str, &UnifiedEntity, Option<&crate::storage::Metadata>)> =
3453            items.into_iter().collect();
3454        if items.is_empty() {
3455            return;
3456        }
3457
3458        if invalidate_cache {
3459            self.invalidate_result_cache();
3460        }
3461
3462        for (collection, entity, metadata) in items {
3463            self.cdc_emit_prebuilt(operation, collection, entity, entity_kind, metadata, false);
3464        }
3465    }
3466
3467    fn run_replica_loop(&self, primary_addr: String) {
3468        let endpoint = if primary_addr.starts_with("http") {
3469            primary_addr
3470        } else {
3471            format!("http://{primary_addr}")
3472        };
3473        let poll_ms = self.inner.db.options().replication.poll_interval_ms;
3474        let max_count = self.inner.db.options().replication.max_batch_size;
3475        let mut since_lsn = self.config_u64("red.replication.last_applied_lsn", 0);
3476
3477        let runtime = match tokio::runtime::Builder::new_current_thread()
3478            .enable_all()
3479            .build()
3480        {
3481            Ok(runtime) => runtime,
3482            Err(_) => return,
3483        };
3484
3485        runtime.block_on(async move {
3486            use crate::grpc::proto::red_db_client::RedDbClient;
3487            use crate::grpc::proto::JsonPayloadRequest;
3488
3489            let mut client = loop {
3490                match RedDbClient::connect(endpoint.clone()).await {
3491                    Ok(client) => {
3492                        self.persist_replication_health("connecting", "", None, None);
3493                        break client;
3494                    }
3495                    Err(_) => {
3496                        self.persist_replication_health(
3497                            "connecting",
3498                            "waiting for primary connection",
3499                            None,
3500                            None,
3501                        );
3502                        std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)))
3503                    }
3504                }
3505            };
3506
3507            // PLAN.md Phase 11.5 — stateful applier guards LSN
3508            // monotonicity across pulls. Seed with the persisted
3509            // `last_applied_lsn` so reboots don't lose the chain
3510            // pointer.
3511            let applier = crate::replication::logical::LogicalChangeApplier::new(since_lsn);
3512
3513            loop {
3514                let payload = crate::json!({
3515                    "since_lsn": since_lsn,
3516                    "max_count": max_count
3517                });
3518                let request = tonic::Request::new(JsonPayloadRequest {
3519                    payload_json: crate::json::to_string(&payload)
3520                        .unwrap_or_else(|_| "{}".to_string()),
3521                });
3522
3523                if let Ok(response) = client.pull_wal_records(request).await {
3524                    if let Ok(value) =
3525                        crate::json::from_str::<crate::json::Value>(&response.into_inner().payload)
3526                    {
3527                        let current_lsn =
3528                            value.get("current_lsn").and_then(crate::json::Value::as_u64);
3529                        let oldest_available_lsn = value
3530                            .get("oldest_available_lsn")
3531                            .and_then(crate::json::Value::as_u64);
3532                        if since_lsn > 0
3533                            && oldest_available_lsn
3534                                .map(|oldest| oldest > since_lsn.saturating_add(1))
3535                                .unwrap_or(false)
3536                        {
3537                            self.persist_replication_health(
3538                                "stalled_gap",
3539                                "replica is behind the oldest logical WAL available on primary; re-bootstrap required",
3540                                current_lsn,
3541                                oldest_available_lsn,
3542                            );
3543                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
3544                            continue;
3545                        }
3546                        if let Some(records) =
3547                            value.get("records").and_then(crate::json::Value::as_array)
3548                        {
3549                            for record in records {
3550                                let Some(data_hex) =
3551                                    record.get("data").and_then(crate::json::Value::as_str)
3552                                else {
3553                                    continue;
3554                                };
3555                                let Ok(data) = hex::decode(data_hex) else {
3556                                    self.inner.replica_apply_metrics.record(
3557                                        crate::replication::logical::ApplyErrorKind::Decode,
3558                                    );
3559                                    self.persist_replication_health(
3560                                        "apply_error",
3561                                        "failed to decode WAL record hex payload",
3562                                        current_lsn,
3563                                        oldest_available_lsn,
3564                                    );
3565                                    continue;
3566                                };
3567                                let Ok(change) = ChangeRecord::decode(&data) else {
3568                                    self.inner.replica_apply_metrics.record(
3569                                        crate::replication::logical::ApplyErrorKind::Decode,
3570                                    );
3571                                    self.persist_replication_health(
3572                                        "apply_error",
3573                                        "failed to decode logical WAL record",
3574                                        current_lsn,
3575                                        oldest_available_lsn,
3576                                    );
3577                                    continue;
3578                                };
3579                                match applier.apply(
3580                                    self.inner.db.as_ref(),
3581                                    &change,
3582                                    ApplyMode::Replica,
3583                                ) {
3584                                    Ok(crate::replication::logical::ApplyOutcome::Applied) => {
3585                                        self.invalidate_result_cache_for_table(&change.collection);
3586                                        since_lsn = since_lsn.max(change.lsn);
3587                                        self.persist_replica_lsn(since_lsn);
3588                                    }
3589                                    Ok(_) => {
3590                                        // Idempotent / Skipped: no advance, no error.
3591                                    }
3592                                    Err(err) => {
3593                                        self.inner.replica_apply_metrics.record(err.kind());
3594                                        // Issue #205 — emit operator-grade event
3595                                        // for the two replication-fatal kinds. `Gap`
3596                                        // / `Apply` / `Decode` already persist via
3597                                        // `persist_replication_health`; the
3598                                        // OperatorEvent variants only cover the
3599                                        // two "stream is broken" / "follower
3600                                        // diverged" conditions an operator must act
3601                                        // on out-of-band.
3602                                        match &err {
3603                                            crate::replication::logical::LogicalApplyError::Divergence { lsn, expected: _, got: _ } => {
3604                                                crate::telemetry::operator_event::OperatorEvent::Divergence {
3605                                                    peer: "primary".to_string(),
3606                                                    leader_lsn: *lsn,
3607                                                    follower_lsn: since_lsn,
3608                                                }
3609                                                .emit_global();
3610                                            }
3611                                            crate::replication::logical::LogicalApplyError::Gap { last, next } => {
3612                                                crate::telemetry::operator_event::OperatorEvent::ReplicationBroken {
3613                                                    peer: "primary".to_string(),
3614                                                    reason: format!("stalled gap last={last} next={next}"),
3615                                                }
3616                                                .emit_global();
3617                                            }
3618                                            _ => {}
3619                                        }
3620                                        let kind = match &err {
3621                                            crate::replication::logical::LogicalApplyError::Gap { .. } => "stalled_gap",
3622                                            crate::replication::logical::LogicalApplyError::Divergence { .. } => "divergence",
3623                                            _ => "apply_error",
3624                                        };
3625                                        self.persist_replication_health(
3626                                            kind,
3627                                            &format!("replica apply rejected: {err}"),
3628                                            current_lsn,
3629                                            oldest_available_lsn,
3630                                        );
3631                                        // Stop applying this batch. The
3632                                        // outer loop will retry on next
3633                                        // pull, which on a real Gap will
3634                                        // not magically heal — operator
3635                                        // must rebootstrap. For
3636                                        // Divergence, we explicitly do
3637                                        // not advance; this keeps the
3638                                        // replica visibly unhealthy
3639                                        // instead of silently swallowing
3640                                        // corruption.
3641                                        break;
3642                                    }
3643                                }
3644                            }
3645                        }
3646                        self.persist_replication_health(
3647                            "healthy",
3648                            "",
3649                            current_lsn,
3650                            oldest_available_lsn,
3651                        );
3652                    } else {
3653                        self.persist_replication_health(
3654                            "apply_error",
3655                            "failed to parse pull_wal_records response",
3656                            None,
3657                            None,
3658                        );
3659                    }
3660                } else {
3661                    self.persist_replication_health(
3662                        "connecting",
3663                        "primary pull_wal_records request failed",
3664                        None,
3665                        None,
3666                    );
3667                }
3668
3669                std::thread::sleep(std::time::Duration::from_millis(poll_ms));
3670            }
3671        });
3672    }
3673
3674    /// Poll CDC events since a given LSN.
3675    pub fn cdc_poll(
3676        &self,
3677        since_lsn: u64,
3678        max_count: usize,
3679    ) -> Vec<crate::replication::cdc::ChangeEvent> {
3680        self.inner.cdc.poll(since_lsn, max_count)
3681    }
3682
3683    /// PLAN.md Phase 11.4 — current CDC LSN. Public mutation
3684    /// surfaces (HTTP query, gRPC entity ops) call this immediately
3685    /// after a successful write to feed `enforce_commit_policy`.
3686    pub fn cdc_current_lsn(&self) -> u64 {
3687        self.inner.cdc.current_lsn()
3688    }
3689
3690    pub fn kv_watch_events_since(
3691        &self,
3692        collection: &str,
3693        key: &str,
3694        since_lsn: u64,
3695        max_count: usize,
3696    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3697        self.inner
3698            .cdc
3699            .poll(since_lsn, max_count)
3700            .into_iter()
3701            .filter_map(|event| event.kv)
3702            .filter(|event| event.collection == collection && event.key == key)
3703            .collect()
3704    }
3705
3706    pub fn kv_watch_events_since_prefix(
3707        &self,
3708        collection: &str,
3709        prefix: &str,
3710        since_lsn: u64,
3711        max_count: usize,
3712    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3713        self.inner
3714            .cdc
3715            .poll(since_lsn, max_count)
3716            .into_iter()
3717            .filter_map(|event| event.kv)
3718            .filter(|event| event.collection == collection && event.key.starts_with(prefix))
3719            .collect()
3720    }
3721
3722    pub(crate) fn kv_watch_subscribe<'a>(
3723        &'a self,
3724        collection: impl Into<String>,
3725        key: impl Into<String>,
3726        from_lsn: Option<u64>,
3727    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
3728        crate::runtime::kv_watch::KvWatchStream::subscribe(
3729            &self.inner.cdc,
3730            &self.inner.kv_stats,
3731            collection,
3732            key,
3733            from_lsn,
3734            self.kv_watch_idle_timeout_ms(),
3735        )
3736    }
3737
3738    pub(crate) fn kv_watch_subscribe_prefix<'a>(
3739        &'a self,
3740        collection: impl Into<String>,
3741        prefix: impl Into<String>,
3742        from_lsn: Option<u64>,
3743    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
3744        crate::runtime::kv_watch::KvWatchStream::subscribe_prefix(
3745            &self.inner.cdc,
3746            &self.inner.kv_stats,
3747            collection,
3748            prefix,
3749            from_lsn,
3750            self.kv_watch_idle_timeout_ms(),
3751        )
3752    }
3753
3754    pub(crate) fn kv_watch_idle_timeout_ms(&self) -> u64 {
3755        self.config_u64("red.config.kv.watch.idle_timeout_ms", 60_000)
3756    }
3757
3758    /// Get backup scheduler status.
3759    pub fn backup_status(&self) -> crate::replication::scheduler::BackupStatus {
3760        self.inner.backup_scheduler.status()
3761    }
3762
3763    /// Borrow the runtime's result Blob Cache.
3764    ///
3765    /// Wired for the `/admin/blob_cache/sweep` and
3766    /// `/admin/blob_cache/flush_namespace` HTTP handlers (issue #148
3767    /// follow-up): both delegate to
3768    /// `crate::storage::cache::sweeper::BlobCacheSweeper`, which takes a
3769    /// `&BlobCache`. Also used by `trigger_backup` when
3770    /// `red.config.backup.include_blob_cache=true` to locate the L2
3771    /// directory for archival.
3772    pub fn result_blob_cache(&self) -> &crate::storage::cache::BlobCache {
3773        &self.inner.result_blob_cache
3774    }
3775
3776    /// PLAN.md Phase 11.4 — owned snapshot of every registered
3777    /// replica's state on this primary. Returns empty vec on
3778    /// non-primary instances or when no replicas are registered yet.
3779    pub fn primary_replica_snapshots(&self) -> Vec<crate::replication::primary::ReplicaState> {
3780        self.inner
3781            .db
3782            .replication
3783            .as_ref()
3784            .map(|repl| repl.replica_snapshots())
3785            .unwrap_or_default()
3786    }
3787
3788    /// PLAN.md Phase 11.4 — active commit policy. Reads
3789    /// `RED_PRIMARY_COMMIT_POLICY` once at runtime construction;
3790    /// future env reloads will need a reload endpoint. Default is
3791    /// `Local` — current behavior, no replica blocking.
3792    pub fn commit_policy(&self) -> crate::replication::CommitPolicy {
3793        crate::replication::CommitPolicy::from_env()
3794    }
3795
3796    /// PLAN.md Phase 11.5 — accessor for replica-side apply error
3797    /// counters (gap / divergence / apply / decode). Returned
3798    /// snapshot is consistent across the four counters; the labels
3799    /// match `reddb_replica_apply_errors_total{kind}`.
3800    pub fn replica_apply_error_counts(
3801        &self,
3802    ) -> [(crate::replication::logical::ApplyErrorKind, u64); 4] {
3803        self.inner.replica_apply_metrics.snapshot()
3804    }
3805
3806    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
3807    /// returned; `is_configured()` lets callers short-circuit.
3808    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
3809        &self.inner.quota_bucket
3810    }
3811
3812    /// PLAN.md Phase 11.4 — observability snapshot of every
3813    /// replica's durable LSN as known to the commit waiter. Empty
3814    /// vec on non-primary instances or when no replica has acked.
3815    pub fn commit_waiter_snapshot(&self) -> Vec<(String, u64)> {
3816        self.inner
3817            .db
3818            .replication
3819            .as_ref()
3820            .map(|repl| repl.commit_waiter.snapshot())
3821            .unwrap_or_default()
3822    }
3823
3824    /// PLAN.md Phase 11.4 — `(reached, timed_out, not_required, last_micros)`
3825    /// counters for /metrics. Always-zero on non-primary instances.
3826    pub fn commit_waiter_metrics_snapshot(&self) -> (u64, u64, u64, u64) {
3827        self.inner
3828            .db
3829            .replication
3830            .as_ref()
3831            .map(|repl| repl.commit_waiter.metrics_snapshot())
3832            .unwrap_or((0, 0, 0, 0))
3833    }
3834
3835    /// PLAN.md Phase 11.4 — block until at least `count` replicas
3836    /// have durably applied through `target_lsn`, or `timeout`
3837    /// elapses. Returns the `AwaitOutcome` so the caller can decide
3838    /// whether to surface a timeout error to the client or continue
3839    /// (the policy mapping lives in the commit dispatcher).
3840    ///
3841    /// Foundation only — the write commit path doesn't yet call
3842    /// this. Wiring it is a per-surface task gated on the operator
3843    /// flipping `RED_PRIMARY_COMMIT_POLICY` away from `local`.
3844    pub fn await_replica_acks(
3845        &self,
3846        target_lsn: u64,
3847        count: u32,
3848        timeout: std::time::Duration,
3849    ) -> crate::replication::AwaitOutcome {
3850        match &self.inner.db.replication {
3851            Some(repl) => repl.commit_waiter.await_acks(target_lsn, count, timeout),
3852            None => {
3853                // No replication configured: policy must be `Local`.
3854                // Treat as immediate `NotRequired` so callers don't
3855                // block on a degenerate setup.
3856                crate::replication::AwaitOutcome::NotRequired
3857            }
3858        }
3859    }
3860
3861    /// PLAN.md Phase 11.4 — enforce the configured commit policy
3862    /// against `post_lsn` (the LSN of the just-completed write).
3863    /// Returns `Ok(AwaitOutcome)` on every successful enforcement
3864    /// (including `Reached` and `TimedOut` when fail-on-timeout is
3865    /// off). Returns `Err(ReadOnly)` only when:
3866    ///   * policy is `AckN(n)` with `n > 0`
3867    ///   * the wait timed out
3868    ///   * `RED_COMMIT_FAIL_ON_TIMEOUT=true` is set
3869    ///
3870    /// The HTTP / gRPC / wire surfaces map the error to 504 / wire
3871    /// backoff. Default behaviour (env unset) logs warn and returns
3872    /// success — matches PLAN.md "default v1 stays local" semantics
3873    /// while still letting the operator opt into hard-blocking.
3874    pub fn enforce_commit_policy(
3875        &self,
3876        post_lsn: u64,
3877    ) -> RedDBResult<crate::replication::AwaitOutcome> {
3878        let n = match self.commit_policy() {
3879            crate::replication::CommitPolicy::AckN(n) if n > 0 => n,
3880            _ => return Ok(crate::replication::AwaitOutcome::NotRequired),
3881        };
3882        let timeout_ms = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
3883            .ok()
3884            .and_then(|v| v.parse::<u64>().ok())
3885            .unwrap_or(5_000);
3886        let outcome =
3887            self.await_replica_acks(post_lsn, n, std::time::Duration::from_millis(timeout_ms));
3888        if let crate::replication::AwaitOutcome::TimedOut { observed, required } = &outcome {
3889            tracing::warn!(
3890                target: "reddb::commit",
3891                post_lsn,
3892                observed = *observed,
3893                required = *required,
3894                timeout_ms,
3895                "ack_n: timed out waiting for replicas"
3896            );
3897            let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
3898                .ok()
3899                .map(|v| {
3900                    let t = v.trim();
3901                    t.eq_ignore_ascii_case("true") || t == "1" || t.eq_ignore_ascii_case("yes")
3902                })
3903                .unwrap_or(false);
3904            if fail {
3905                return Err(RedDBError::ReadOnly(format!(
3906                    "commit policy timed out at lsn {post_lsn}: observed={observed} required={required} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
3907                )));
3908            }
3909        }
3910        Ok(outcome)
3911    }
3912
3913    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
3914    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
3915    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
3916    /// when the operator set the env but it doesn't parse, and
3917    /// `("disabled", None)` when no key is configured. The pager
3918    /// hookup is deferred — this accessor surfaces the operator's
3919    /// intent for /admin/status without yet using the key in writes.
3920    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
3921        match crate::crypto::page_encryption::key_from_env() {
3922            Ok(Some(_)) => ("enabled", None),
3923            Ok(None) => ("disabled", None),
3924            Err(err) => ("error", Some(err)),
3925        }
3926    }
3927
3928    /// PLAN.md Phase 11.5 — current replica apply health label
3929    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
3930    /// `stalled_gap`). Read from the persisted `red.replication.state`
3931    /// config key updated by the replica loop. Returns `None` on
3932    /// non-replica instances or when no apply has run yet.
3933    pub fn replica_apply_health(&self) -> Option<String> {
3934        let state = self.config_string("red.replication.state", "");
3935        if state.is_empty() {
3936            None
3937        } else {
3938            Some(state)
3939        }
3940    }
3941
3942    /// Current local LSN paired with the LSN of the most recently
3943    /// archived WAL segment. The difference is the replication /
3944    /// archive lag operators alert on (PLAN.md Phase 5.1). Returns
3945    /// `(0, 0)` when neither replication nor archiving is configured.
3946    pub fn wal_archive_progress(&self) -> (u64, u64) {
3947        let current_lsn = self
3948            .inner
3949            .db
3950            .replication
3951            .as_ref()
3952            .map(|repl| {
3953                repl.logical_wal_spool
3954                    .as_ref()
3955                    .map(|spool| spool.current_lsn())
3956                    .unwrap_or_else(|| repl.wal_buffer.current_lsn())
3957            })
3958            .unwrap_or_else(|| self.inner.cdc.current_lsn());
3959        let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
3960        (current_lsn, last_archived_lsn)
3961    }
3962
    /// Trigger an immediate backup.
    ///
    /// Pipeline: write-gate + lease checks → local snapshot → (when a
    /// remote backend is configured) upload snapshot, publish its
    /// manifest, archive any new logical WAL records onto the hash
    /// chain, publish the backup head, refresh the unified manifest,
    /// optionally block for replica acks, and optionally archive the
    /// L2 blob cache. Steps after the local snapshot are skipped
    /// entirely when no remote backend/path is present (`uploaded`
    /// stays `false` in the returned `BackupResult`).
    pub fn trigger_backup(&self) -> RedDBResult<crate::replication::scheduler::BackupResult> {
        self.check_write(crate::runtime::write_gate::WriteKind::Backup)?;
        // Defense in depth — check_write above already rejects when
        // the lease is NotHeld, but log + audit the lease angle here
        // explicitly so dashboards distinguish "lease lost" from a
        // generic read-only refusal.
        self.assert_remote_write_allowed("admin/backup")?;
        let started = std::time::Instant::now();
        let snapshot = self.create_snapshot()?;
        // Flipped to true only after the whole remote-upload branch
        // below completes; a local-only backup reports uploaded=false.
        let mut uploaded = false;

        if let (Some(backend), Some(path)) = (&self.inner.db.remote_backend, self.inner.db.path()) {
            // Remote key layout: config overrides fall back to the
            // options-derived defaults.
            let default_snapshot_prefix = self.inner.db.options().default_snapshot_prefix();
            let default_wal_prefix = self.inner.db.options().default_wal_archive_prefix();
            let default_head_key = self.inner.db.options().default_backup_head_key();
            let snapshot_prefix = self.config_string(
                "red.config.backup.snapshot_prefix",
                &default_snapshot_prefix,
            );
            let wal_prefix =
                self.config_string("red.config.wal.archive.prefix", &default_wal_prefix);
            let head_key = self.config_string("red.config.backup.head_key", &default_head_key);
            let timeline_id = self.config_string("red.config.timeline.id", "main");
            let snapshot_key = crate::storage::wal::archive_snapshot(
                backend.as_ref(),
                path,
                snapshot.snapshot_id,
                &snapshot_prefix,
            )
            .map_err(|err| RedDBError::Internal(err.to_string()))?;
            // Same LSN derivation as `wal_archive_progress`: spool,
            // then WAL buffer, then CDC when replication is off.
            let current_lsn = self
                .inner
                .db
                .replication
                .as_ref()
                .map(|repl| {
                    repl.logical_wal_spool
                        .as_ref()
                        .map(|spool| spool.current_lsn())
                        .unwrap_or_else(|| repl.wal_buffer.current_lsn())
                })
                .unwrap_or_else(|| self.inner.cdc.current_lsn());
            let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
            // Hash the local snapshot bytes so the manifest can carry
            // the digest for restore-side verification (PLAN.md
            // Phase 4). Failure to hash is non-fatal — we still
            // publish the manifest, just without a checksum, so a
            // future fix can backfill rather than losing the backup.
            let snapshot_sha256 =
                crate::storage::wal::SnapshotManifest::compute_snapshot_sha256(path)
                    .map_err(|err| {
                        tracing::warn!(
                            target: "reddb::backup",
                            error = %err,
                            snapshot_id = snapshot.snapshot_id,
                            "snapshot hash failed; manifest will lack checksum"
                        );
                    })
                    .ok();
            let manifest = crate::storage::wal::SnapshotManifest {
                timeline_id: timeline_id.clone(),
                snapshot_key: snapshot_key.clone(),
                snapshot_id: snapshot.snapshot_id,
                snapshot_time: snapshot.created_at_unix_ms as u64,
                base_lsn: current_lsn,
                schema_version: crate::api::REDDB_FORMAT_VERSION,
                format_version: crate::api::REDDB_FORMAT_VERSION,
                snapshot_sha256,
            };
            crate::storage::wal::publish_snapshot_manifest(backend.as_ref(), &manifest)
                .map_err(|err| RedDBError::Internal(err.to_string()))?;

            // PLAN.md Phase 11.3 — read the head of the WAL hash chain
            // so the new segment can link back. `None` means we're
            // starting a fresh timeline (after a clean restore or on
            // first archive ever); the segment's `prev_hash` will be
            // `None` and restore-side validation accepts that only for
            // the first segment in `plan.wal_segments`.
            let prev_segment_hash = self.config_string("red.config.timeline.last_segment_hash", "");
            let prev_hash_arg = if prev_segment_hash.is_empty() {
                None
            } else {
                Some(prev_segment_hash)
            };

            let archived_lsn = if let Some(primary) = &self.inner.db.replication {
                // Gap guard: if the oldest locally-retained record is
                // already past what we last archived (allowing the
                // adjacent-LSN case), continuing would archive a WAL
                // stream with a hole — fail the backup instead.
                let oldest = primary
                    .logical_wal_spool
                    .as_ref()
                    .and_then(|spool| spool.oldest_lsn().ok().flatten())
                    .or_else(|| primary.wal_buffer.oldest_lsn())
                    .unwrap_or(last_archived_lsn);
                if last_archived_lsn > 0 && last_archived_lsn < oldest.saturating_sub(1) {
                    return Err(RedDBError::Internal(format!(
                        "logical WAL gap detected: last_archived_lsn={last_archived_lsn}, oldest_available_lsn={oldest}"
                    )));
                }
                // Everything past the archive watermark, preferring the
                // durable spool over the in-memory buffer.
                let records = if let Some(spool) = &primary.logical_wal_spool {
                    spool
                        .read_since(last_archived_lsn, usize::MAX)
                        .map_err(|err| RedDBError::Internal(err.to_string()))?
                } else {
                    primary.wal_buffer.read_since(last_archived_lsn, usize::MAX)
                };
                // `None` meta means there was nothing new to archive;
                // the watermark stays put.
                if let Some(meta) = crate::storage::wal::archive_change_records(
                    backend.as_ref(),
                    &wal_prefix,
                    &records,
                    prev_hash_arg,
                )
                .map_err(|err| RedDBError::Internal(err.to_string()))?
                {
                    // Prune failure is deliberately ignored: the spool
                    // just retains extra records until the next pass.
                    if let Some(spool) = &primary.logical_wal_spool {
                        let _ = spool.prune_through(meta.lsn_end);
                    }
                    // Advance the chain head so the next archive call
                    // links to this segment's hash. If the segment has
                    // no sha256 (legacy / hashing failed) we leave the
                    // head as-is — the next segment then carries the
                    // prior chain head, preserving continuity.
                    if let Some(sha) = &meta.sha256 {
                        self.inner.db.store().set_config_tree(
                            "red.config.timeline",
                            &crate::json!({ "last_segment_hash": sha }),
                        );
                    }
                    meta.lsn_end
                } else {
                    last_archived_lsn
                }
            } else {
                last_archived_lsn
            };

            let head = crate::storage::wal::BackupHead {
                timeline_id,
                snapshot_key,
                snapshot_id: snapshot.snapshot_id,
                snapshot_time: snapshot.created_at_unix_ms as u64,
                current_lsn,
                last_archived_lsn: archived_lsn,
                wal_prefix,
            };
            crate::storage::wal::publish_backup_head(backend.as_ref(), &head_key, &head)
                .map_err(|err| RedDBError::Internal(err.to_string()))?;
            // Persist the new watermark only after the head publish
            // succeeded, so a failed upload retries from the old LSN.
            self.inner.db.store().set_config_tree(
                "red.config.timeline",
                &crate::json!({
                    "last_archived_lsn": archived_lsn,
                    "id": head.timeline_id
                }),
            );

            // PLAN.md Phase 2.4 — refresh the unified `MANIFEST.json`
            // at the prefix root so external tooling sees a single
            // catalog of every snapshot + WAL segment with their
            // checksums. Best-effort: a manifest publish failure
            // doesn't fail the backup (the per-artifact sidecars
            // already give restore-side integrity), but it does log
            // so dashboards can flag stale catalogs.
            if let Err(err) = crate::storage::wal::publish_unified_manifest_for_prefix(
                backend.as_ref(),
                &snapshot_prefix,
            ) {
                tracing::warn!(
                    target: "reddb::backup",
                    error = %err,
                    snapshot_prefix = %snapshot_prefix,
                    "unified MANIFEST.json refresh failed; per-artifact sidecars unaffected"
                );
            }

            // PLAN.md Phase 11.4 — when the operator picked a
            // commit policy that demands replica durability, block
            // until the configured count of replicas has acked the
            // archived LSN (or the timeout fires). For backup the
            // policy decides the *DR posture* — `local` returns
            // immediately, `ack_n` ensures at least N replicas saw
            // the new tail before we report success to the
            // operator. A `TimedOut` is logged but does NOT fail
            // the backup: the local WAL + remote upload are durable
            // regardless; the missing acks are reported via
            // /metrics and /admin/status so the operator can decide.
            match self.commit_policy() {
                crate::replication::CommitPolicy::AckN(n) if n > 0 => {
                    let timeout = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
                        .ok()
                        .and_then(|v| v.parse::<u64>().ok())
                        .unwrap_or(5_000);
                    let outcome = self.await_replica_acks(
                        archived_lsn,
                        n,
                        std::time::Duration::from_millis(timeout),
                    );
                    match outcome {
                        crate::replication::AwaitOutcome::Reached(count) => {
                            tracing::debug!(
                                target: "reddb::backup",
                                archived_lsn,
                                n,
                                count,
                                "ack_n: replicas synced before backup return"
                            );
                        }
                        crate::replication::AwaitOutcome::TimedOut { observed, required } => {
                            tracing::warn!(
                                target: "reddb::backup",
                                archived_lsn,
                                observed,
                                required,
                                timeout_ms = timeout,
                                "ack_n: timed out waiting for replicas; backup uploaded but DR posture degraded"
                            );
                        }
                        crate::replication::AwaitOutcome::NotRequired => {}
                    }
                }
                _ => {} // Local / RemoteWal / Quorum: no blocking yet
            }

            // Issue #148 follow-up — opt-in archive of the L2 Blob Cache
            // directory tree. Default off so a standard backup stays
            // small; flip via `red.config.backup.include_blob_cache=true`
            // when warm-cache restore is required (per
            // docs/operations/blob-cache-backup-restore.md §1).
            //
            // The L2 tree is *derived* state (ADR 0006) — its absence
            // never causes data loss; it only affects post-restore
            // p99 latency until the cache re-warms. We therefore log
            // (not fail) on per-file upload errors so a partial L2
            // upload never aborts a healthy snapshot+WAL backup.
            if self.config_bool("red.config.backup.include_blob_cache", false) {
                let blob_cache_prefix = self.config_string(
                    "red.config.backup.blob_cache_prefix",
                    &format!("{snapshot_prefix}blob_cache/"),
                );
                if let Some(l2_path) = self.inner.result_blob_cache.l2_path() {
                    match crate::storage::cache::archive_blob_cache_l2(
                        backend.as_ref(),
                        l2_path,
                        &blob_cache_prefix,
                    ) {
                        Ok(count) => {
                            tracing::info!(
                                target: "reddb::backup",
                                files_uploaded = count,
                                blob_cache_prefix = %blob_cache_prefix,
                                "include_blob_cache: archived L2 directory"
                            );
                        }
                        Err(err) => {
                            tracing::warn!(
                                target: "reddb::backup",
                                error = %err,
                                blob_cache_prefix = %blob_cache_prefix,
                                "include_blob_cache: L2 archive failed; backup proceeding (cache is derived state)"
                            );
                        }
                    }
                } else {
                    tracing::debug!(
                        target: "reddb::backup",
                        "include_blob_cache=true but no L2 path configured; nothing to archive"
                    );
                }
            }

            uploaded = true;
        }

        Ok(crate::replication::scheduler::BackupResult {
            snapshot_id: snapshot.snapshot_id,
            uploaded,
            duration_ms: started.elapsed().as_millis() as u64,
            timestamp: snapshot.created_at_unix_ms as u64,
        })
    }
4241
4242    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
4243        let mut pool = self
4244            .inner
4245            .pool
4246            .lock()
4247            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
4248        if pool.active >= self.inner.pool_config.max_connections {
4249            return Err(RedDBError::Internal(
4250                "connection pool exhausted".to_string(),
4251            ));
4252        }
4253
4254        let id = if let Some(id) = pool.idle.pop() {
4255            id
4256        } else {
4257            let id = pool.next_id;
4258            pool.next_id += 1;
4259            id
4260        };
4261        pool.active += 1;
4262        pool.total_checkouts += 1;
4263        drop(pool);
4264
4265        Ok(RuntimeConnection {
4266            id,
4267            inner: Arc::clone(&self.inner),
4268        })
4269    }
4270
4271    pub fn checkpoint(&self) -> RedDBResult<()> {
4272        // Local fsync always allowed — losing the lease shouldn't
4273        // prevent us from durably persisting what's already in memory.
4274        // The remote upload is the side-effect that risks clobbering a
4275        // peer's state, so it's behind the lease gate.
4276        self.inner.db.flush_local_only().map_err(|err| {
4277            // Issue #205 — local flush failure is a CheckpointFailed
4278            // operator-grade event. The local-flush path also covers
4279            // the WAL fsync we depend on, so a failure here doubles as
4280            // the WalFsyncFailed signal for the runtime entry point.
4281            let msg = err.to_string();
4282            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
4283                lsn: 0,
4284                error: msg.clone(),
4285            }
4286            .emit_global();
4287            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
4288                path: "<flush_local_only>".to_string(),
4289                error: msg.clone(),
4290            }
4291            .emit_global();
4292            RedDBError::Engine(msg)
4293        })?;
4294        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
4295            tracing::warn!(
4296                target: "reddb::serverless::lease",
4297                error = %err,
4298                "checkpoint: skipping remote upload — lease not held"
4299            );
4300            return Ok(());
4301        }
4302        self.inner
4303            .db
4304            .upload_to_remote_backend()
4305            .map_err(|err| RedDBError::Engine(err.to_string()))
4306    }
4307
4308    /// Guard remote-mutating operations on the writer lease.
4309    /// Returns `Ok(())` when no remote backend is configured (the
4310    /// lease is irrelevant) or the lease state is `NotRequired` /
4311    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
4312    /// `NotHeld`, with an audit-friendly action label so the caller
4313    /// can record the rejection.
4314    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
4315        if self.inner.db.remote_backend.is_none() {
4316            return Ok(());
4317        }
4318        match self.inner.write_gate.lease_state() {
4319            crate::runtime::write_gate::LeaseGateState::NotHeld => {
4320                self.inner.audit_log.record(
4321                    action,
4322                    "system",
4323                    "remote_backend",
4324                    "err: writer lease not held",
4325                    crate::json::Value::Null,
4326                );
4327                Err(RedDBError::ReadOnly(format!(
4328                    "writer lease not held — {action} blocked (serverless fence)"
4329                )))
4330            }
4331            _ => Ok(()),
4332        }
4333    }
4334
4335    pub fn run_maintenance(&self) -> RedDBResult<()> {
4336        self.inner
4337            .db
4338            .run_maintenance()
4339            .map_err(|err| RedDBError::Internal(err.to_string()))
4340    }
4341
4342    pub fn scan_collection(
4343        &self,
4344        collection: &str,
4345        cursor: Option<ScanCursor>,
4346        limit: usize,
4347    ) -> RedDBResult<ScanPage> {
4348        let store = self.inner.db.store();
4349        let manager = store
4350            .get_collection(collection)
4351            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
4352
4353        let mut entities = manager.query_all(|_| true);
4354        entities.sort_by_key(|entity| entity.id.raw());
4355
4356        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
4357        let total = entities.len();
4358        let end = total.min(offset.saturating_add(limit.max(1)));
4359        let items = if offset >= total {
4360            Vec::new()
4361        } else {
4362            entities[offset..end].to_vec()
4363        };
4364        let next = (end < total).then_some(ScanCursor { offset: end });
4365
4366        Ok(ScanPage {
4367            collection: collection.to_string(),
4368            items,
4369            next,
4370            total,
4371        })
4372    }
4373
4374    pub fn catalog(&self) -> CatalogModelSnapshot {
4375        self.inner.db.catalog_model_snapshot()
4376    }
4377
4378    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
4379        self.inner.db.catalog_consistency_report()
4380    }
4381
4382    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
4383        crate::catalog::attention_summary(&self.catalog())
4384    }
4385
4386    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
4387        crate::catalog::collection_attention(&self.catalog())
4388    }
4389
4390    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
4391        crate::catalog::index_attention(&self.catalog())
4392    }
4393
4394    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
4395        crate::catalog::graph_projection_attention(&self.catalog())
4396    }
4397
4398    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
4399        crate::catalog::analytics_job_attention(&self.catalog())
4400    }
4401
4402    pub fn stats(&self) -> RuntimeStats {
4403        let pool = runtime_pool_lock(self);
4404        RuntimeStats {
4405            active_connections: pool.active,
4406            idle_connections: pool.idle.len(),
4407            total_checkouts: pool.total_checkouts,
4408            paged_mode: self.inner.db.is_paged(),
4409            started_at_unix_ms: self.inner.started_at_unix_ms,
4410            store: self.inner.db.stats(),
4411            system: SystemInfo::collect(),
4412            result_blob_cache: self.inner.result_blob_cache.stats(),
4413            kv: self.inner.kv_stats.snapshot(),
4414        }
4415    }
4416
4417    /// Execute a query under a typed scope override without embedding
4418    /// the tenant / user / role values into the SQL string. Use this
4419    /// from transport middleware (HTTP / gRPC / worker loops) where the
4420    /// scope is resolved from auth claims and the SQL is a parameterised
4421    /// template — avoids the string-concat injection risk of building
4422    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4423    /// prepared statements that didn't know about tenancy.
4424    ///
4425    /// Precedence matches the `WITHIN` clause: the passed `scope`
4426    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4427    /// The override is pushed on the thread-local scope stack for the
4428    /// duration of the call and popped on return — pool-shared
4429    /// connections cannot leak it across requests.
4430    pub fn execute_query_with_scope(
4431        &self,
4432        query: &str,
4433        scope: crate::runtime::within_clause::ScopeOverride,
4434    ) -> RedDBResult<RuntimeQueryResult> {
4435        if scope.is_empty() {
4436            return self.execute_query(query);
4437        }
4438        let _scope_guard = ScopeOverrideGuard::install(scope);
4439        self.execute_query(query)
4440    }
4441
4442    /// Issue #205 — single lifecycle exit for slow-query logging.
4443    ///
4444    /// `execute_query_inner` does the real work; this wrapper times it
4445    /// and, if elapsed exceeds the configured threshold, hands the
4446    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
4447    /// SlowQueryLogger. The threshold + sample_pct were captured at
4448    /// SlowQueryLogger construction (runtime startup), so the per-call
4449    /// cost on below-threshold paths is one relaxed atomic load.
4450    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4451        let started = std::time::Instant::now();
4452        let result = self.execute_query_inner(query);
4453        let elapsed_ms = started.elapsed().as_millis() as u64;
4454
4455        // Build EffectiveScope from the same thread-locals frame-build
4456        // consults — keeps the slow-log row consistent with the audit /
4457        // RLS view of "this statement". `ai_scope()` is the canonical
4458        // builder.
4459        let scope = self.ai_scope();
4460        let kind = match result
4461            .as_ref()
4462            .map(|r| r.statement_type)
4463            .unwrap_or("select")
4464        {
4465            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4466            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4467            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4468            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4469            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4470        };
4471        // SQL redaction: pass the raw query through. The slow-query
4472        // logger writes structured JSON so embedded literals stay
4473        // escape-safe at the JSON boundary (proven by
4474        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4475        // PII redaction (e.g. literal masking) is a follow-up.
4476        self.inner
4477            .slow_query_logger
4478            .record(kind, elapsed_ms, query.to_string(), &scope);
4479
4480        result
4481    }
4482
4483    #[inline(never)]
4484    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4485        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4486        //
4487        // Moved above every boot-cost the normal path pays (WITHIN
4488        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
4489        // guard, tracing span, tx_contexts read) because the bench's
4490        // `select_point` scenario was observed at 28× vs PostgreSQL —
4491        // the dominant cost wasn't the entity fetch but the ceremony
4492        // before it. Only fires when there's no ambient transaction
4493        // context or WITHIN override, so the snapshot install we skip
4494        // truly is a no-op for this query.
4495        if !has_scope_override_active()
4496            && !query.trim_start().starts_with("WITHIN")
4497            && !query.trim_start().starts_with("within")
4498            && !self
4499                .inner
4500                .tx_contexts
4501                .read()
4502                .contains_key(&current_connection_id())
4503        {
4504            if let Some(result) = self.try_fast_entity_lookup(query) {
4505                return result;
4506            }
4507        }
4508
4509        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4510        // strip the prefix, push a stack-scoped override, recurse on
4511        // the inner statement, pop on return. Stack lives in a
4512        // thread-local but is balanced by the RAII guard, so a
4513        // pool-shared connection cannot leak the override across
4514        // requests and an early `?` return still pops cleanly.
4515        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4516            Ok(Some((scope, inner))) => {
4517                let _scope_guard = ScopeOverrideGuard::install(scope);
4518                // Re-enter the inner path, NOT `execute_query`, so the
4519                // slow-query lifecycle hook records exactly one row per
4520                // top-level statement (the WITHIN-stripped form would
4521                // double-record).
4522                return self.execute_query_inner(inner);
4523            }
4524            Ok(None) => {}
4525            Err(msg) => return Err(RedDBError::Query(msg)),
4526        }
4527
4528        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4529        // inner statement (WITHOUT executing it) and returns the
4530        // CanonicalLogicalNode tree as rows so the caller can see the
4531        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4532        // is a distinct schema-diff command and continues down the
4533        // regular SQL path.
4534        if let Some(inner) = strip_explain_prefix(query) {
4535            return self.explain_as_rows(query, inner);
4536        }
4537
4538        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4539        // override and return. Outside a transaction the statement is
4540        // an error (matches PG semantics: SET LOCAL only takes effect
4541        // within an active transaction).
4542        if let Some(value) = parse_set_local_tenant(query)? {
4543            let conn_id = current_connection_id();
4544            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4545                return Err(RedDBError::Query(
4546                    "SET LOCAL TENANT requires an active transaction".to_string(),
4547                ));
4548            }
4549            self.inner
4550                .tx_local_tenants
4551                .write()
4552                .insert(conn_id, value.clone());
4553            return Ok(RuntimeQueryResult::ok_message(
4554                query.to_string(),
4555                &match &value {
4556                    Some(id) => format!("local tenant set: {id}"),
4557                    None => "local tenant cleared".to_string(),
4558                },
4559                "set_local_tenant",
4560            ));
4561        }
4562
4563        if super::red_schema::is_system_schema_write(query) {
4564            return Err(RedDBError::Query(
4565                super::red_schema::READ_ONLY_ERROR.to_string(),
4566            ));
4567        }
4568
4569        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4570        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4571
4572        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4573        let _frame_guards = frame.install(self);
4574
4575        // Phase 6 logging: enter a span stamped with conn_id / tenant
4576        // / query_len. Every downstream tracing::info!/warn!/error!
4577        // inherits these fields — no need to thread them manually
4578        // through storage/scan layers. Entered AFTER the WITHIN /
4579        // SET LOCAL TENANT resolution above so the span reflects the
4580        // effective scope for this statement.
4581        let _log_span = crate::telemetry::span::query_span(query).entered();
4582
4583        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4584        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4585            return self.execute_query_expr(rewritten);
4586        }
4587
4588        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4589        if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4590            return result;
4591        }
4592
4593        // ── Result cache: return cached result if still fresh (30s TTL) ──
4594        if let Some(result) = frame.read_result_cache(self) {
4595            return Ok(result);
4596        }
4597
4598        let prepared = frame.prepare_statement(self, execution_query)?;
4599        let mode = prepared.mode;
4600        let expr = prepared.expr;
4601
4602        let statement = query_expr_name(&expr);
4603        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4604
4605        let _lock_guard = frame.prepare_dispatch(self, &expr)?;
4606        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4607
4608        let query_result = match expr {
4609            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4610                // Apply MVCC visibility + RLS gate while materialising the
4611                // graph: every node entity is screened against the source
4612                // collection's policy chain (basic and `Nodes`-targeted)
4613                // and dropped when the caller's tenant / role doesn't
4614                // admit it. Edges are pruned automatically because the
4615                // graph builder skips edges whose endpoints aren't in
4616                // `allowed_nodes`.
4617                let (graph, node_properties) = self.materialize_graph_with_rls()?;
4618                let result =
4619                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_node_properties(
4620                        &graph,
4621                        &expr,
4622                        node_properties,
4623                    )
4624                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4625
4626                Ok(RuntimeQueryResult {
4627                    query: query.to_string(),
4628                    mode,
4629                    statement,
4630                    engine: "materialized-graph",
4631                    result,
4632                    affected_rows: 0,
4633                    statement_type: "select",
4634                })
4635            }
4636            QueryExpr::Table(table) => {
4637                let table = self.resolve_table_expr_subqueries(
4638                    table,
4639                    &frame as &dyn super::statement_frame::ReadFrame,
4640                )?;
4641                if super::red_schema::is_virtual_table(&table.table) {
4642                    return Ok(RuntimeQueryResult {
4643                        query: query.to_string(),
4644                        mode,
4645                        statement,
4646                        engine: "runtime-red-schema",
4647                        result: super::red_schema::red_query(
4648                            self,
4649                            &table.table,
4650                            &table,
4651                            &frame as &dyn super::statement_frame::ReadFrame,
4652                        )?,
4653                        affected_rows: 0,
4654                        statement_type: "select",
4655                    });
4656                }
4657
4658                if let Some(result) = self.execute_probabilistic_select(&table)? {
4659                    return Ok(RuntimeQueryResult {
4660                        query: query.to_string(),
4661                        mode,
4662                        statement,
4663                        engine: "runtime-probabilistic",
4664                        result,
4665                        affected_rows: 0,
4666                        statement_type: "select",
4667                    });
4668                }
4669
4670                // Foreign-table intercept (Phase 3.2.2 PG parity).
4671                //
4672                // When the referenced table matches a `CREATE FOREIGN TABLE`
4673                // registration, short-circuit into the FDW scan. Phase 3.2
4674                // wrappers don't yet support pushdown, so filters/projections
4675                // apply post-scan via `apply_foreign_table_filters` — good
4676                // enough for correctness; perf work lands in 3.2.3.
4677                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4678                    let records = self
4679                        .inner
4680                        .foreign_tables
4681                        .scan(&table.table)
4682                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4683                    let result = apply_foreign_table_filters(records, &table);
4684                    return Ok(RuntimeQueryResult {
4685                        query: query.to_string(),
4686                        mode,
4687                        statement,
4688                        engine: "runtime-fdw",
4689                        result,
4690                        affected_rows: 0,
4691                        statement_type: "select",
4692                    });
4693                }
4694
4695                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4696                //
4697                // When RLS is enabled on this table, fetch every policy
4698                // that applies to the current (role, SELECT) pair and
4699                // fold them into the query's WHERE clause: policies
4700                // OR-combine (any of them admitting the row is enough),
4701                // then AND into the caller's existing filter.
4702                //
4703                // Anonymous callers (no thread-local identity) pass
4704                // `role = None`; policies with a specific `TO role`
4705                // clause skip, but `TO PUBLIC` policies still apply.
4706                //
4707                // When `inject_rls_filters` returns `None` the table has
4708                // RLS enabled but no policy admits the caller's role —
4709                // short-circuit with an empty result set instead of
4710                // synthesising a contradiction filter.
4711                let Some(table_with_rls) = self.authorize_relational_table_select(
4712                    table,
4713                    &frame as &dyn super::statement_frame::ReadFrame,
4714                )?
4715                else {
4716                    let empty = crate::storage::query::unified::UnifiedResult::empty();
4717                    return Ok(RuntimeQueryResult {
4718                        query: query.to_string(),
4719                        mode,
4720                        statement,
4721                        engine: "runtime-table-rls",
4722                        result: empty,
4723                        affected_rows: 0,
4724                        statement_type: "select",
4725                    });
4726                };
4727                Ok(RuntimeQueryResult {
4728                    query: query.to_string(),
4729                    mode,
4730                    statement,
4731                    engine: "runtime-table",
4732                    result: execute_runtime_table_query(
4733                        &self.inner.db,
4734                        &table_with_rls,
4735                        Some(&self.inner.index_store),
4736                    )?,
4737                    affected_rows: 0,
4738                    statement_type: "select",
4739                })
4740            }
4741            QueryExpr::Join(join) => {
4742                // Fold per-table RLS filters into each `QueryExpr::Table`
4743                // leaf of the join tree before executing. Without this
4744                // the join executor scans both tables raw and ignores
4745                // policies — a `WITHIN TENANT 'x'` against a join of
4746                // two tenant-scoped tables would leak cross-tenant rows.
4747                // When any leaf has RLS enabled and zero matching policy,
4748                // short-circuit to an empty join result instead of
4749                // emitting a contradiction filter.
4750                let join_with_rls = match self.authorize_relational_join_select(
4751                    join,
4752                    &frame as &dyn super::statement_frame::ReadFrame,
4753                )? {
4754                    Some(j) => j,
4755                    None => {
4756                        return Ok(RuntimeQueryResult {
4757                            query: query.to_string(),
4758                            mode,
4759                            statement,
4760                            engine: "runtime-join-rls",
4761                            result: crate::storage::query::unified::UnifiedResult::empty(),
4762                            affected_rows: 0,
4763                            statement_type: "select",
4764                        });
4765                    }
4766                };
4767                Ok(RuntimeQueryResult {
4768                    query: query.to_string(),
4769                    mode,
4770                    statement,
4771                    engine: "runtime-join",
4772                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
4773                    affected_rows: 0,
4774                    statement_type: "select",
4775                })
4776            }
4777            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
4778                query: query.to_string(),
4779                mode,
4780                statement,
4781                engine: "runtime-vector",
4782                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
4783                affected_rows: 0,
4784                statement_type: "select",
4785            }),
4786            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
4787                query: query.to_string(),
4788                mode,
4789                statement,
4790                engine: "runtime-hybrid",
4791                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
4792                affected_rows: 0,
4793                statement_type: "select",
4794            }),
4795            // DML execution
4796            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
4797                Err(RedDBError::Query(
4798                    super::red_schema::READ_ONLY_ERROR.to_string(),
4799                ))
4800            }
4801            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
4802                Err(RedDBError::Query(
4803                    super::red_schema::READ_ONLY_ERROR.to_string(),
4804                ))
4805            }
4806            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
4807                Err(RedDBError::Query(
4808                    super::red_schema::READ_ONLY_ERROR.to_string(),
4809                ))
4810            }
4811            QueryExpr::Insert(ref insert) => {
4812                self.with_deferred_store_wal_if_transaction(|| self.execute_insert(query, insert))
4813            }
4814            QueryExpr::Update(ref update) => {
4815                self.with_deferred_store_wal_if_transaction(|| self.execute_update(query, update))
4816            }
4817            QueryExpr::Delete(ref delete) => {
4818                self.with_deferred_store_wal_if_transaction(|| self.execute_delete(query, delete))
4819            }
4820            // DDL execution
4821            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
4822            QueryExpr::CreateCollection(ref create) => {
4823                self.execute_create_collection(query, create)
4824            }
4825            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
4826            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
4827            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
4828            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
4829            QueryExpr::DropDocument(ref drop_document) => {
4830                self.execute_drop_document(query, drop_document)
4831            }
4832            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
4833            QueryExpr::DropCollection(ref drop_collection) => {
4834                self.execute_drop_collection(query, drop_collection)
4835            }
4836            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
4837            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
4838            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
4839            // Graph analytics commands
4840            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
4841            // Search commands
4842            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
4843            // ASK: RAG query with LLM synthesis
4844            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
4845            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
4846            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
4847            QueryExpr::ProbabilisticCommand(ref cmd) => {
4848                self.execute_probabilistic_command(query, cmd)
4849            }
4850            // Time-series DDL
4851            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
4852            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
4853            // Queue DDL and commands
4854            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
4855            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
4856            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
4857            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
4858            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
4859            QueryExpr::EventsBackfill(ref backfill) => {
4860                self.execute_events_backfill(query, backfill)
4861            }
4862            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
4863                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
4864            ))),
4865            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
4866            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
4867            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
4868            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
4869            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
4870            // SET CONFIG key = value
4871            QueryExpr::SetConfig { ref key, ref value } => {
4872                if key.starts_with("red.secret.") {
4873                    return Err(RedDBError::Query(
4874                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
4875                    ));
4876                }
4877                let store = self.inner.db.store();
4878                let json_val = match value {
4879                    Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
4880                    Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
4881                    Value::Float(n) => crate::serde_json::Value::Number(*n),
4882                    Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
4883                    _ => crate::serde_json::Value::String(value.to_string()),
4884                };
4885                store.set_config_tree(key, &json_val);
4886                update_current_config_value(key, value.clone());
4887                // Config changes can flip runtime behavior mid-session
4888                // (auto_decrypt, auto_encrypt, etc.) — invalidate the
4889                // result cache so subsequent reads re-execute against
4890                // the new config.
4891                self.invalidate_result_cache();
4892                Ok(RuntimeQueryResult::ok_message(
4893                    query.to_string(),
4894                    &format!("config set: {key}"),
4895                    "set",
4896                ))
4897            }
            // SET SECRET key = value
            QueryExpr::SetSecret { ref key, ref value } => {
                // Mirror of the SET CONFIG guard: config keys must not be
                // smuggled into the vault namespace.
                if key.starts_with("red.config.") {
                    return Err(RedDBError::Query(
                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
                    ));
                }
                // Secrets require an enabled, unsealed vault backing store.
                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
                })?;
                // `SET SECRET key = NULL` doubles as a delete.
                if matches!(value, Value::Null) {
                    auth_store
                        .vault_kv_try_delete(key)
                        .map_err(|err| RedDBError::Query(err.to_string()))?;
                    update_current_secret_value(key, None);
                    self.invalidate_result_cache();
                    return Ok(RuntimeQueryResult::ok_message(
                        query.to_string(),
                        &format!("secret deleted: {key}"),
                        "delete_secret",
                    ));
                }
                // Non-null values are normalized to a string before hitting
                // the vault KV store (shadows the SQL `value` binding).
                let value = secret_sql_value_to_string(value)?;
                auth_store
                    .vault_kv_try_set(key.clone(), value.clone())
                    .map_err(|err| RedDBError::Query(err.to_string()))?;
                // Keep the session-local secret cache in sync.
                update_current_secret_value(key, Some(value));
                // Secrets can influence query evaluation — drop cached results.
                self.invalidate_result_cache();
                Ok(RuntimeQueryResult::ok_message(
                    query.to_string(),
                    &format!("secret set: {key}"),
                    "set_secret",
                ))
            }
4932            // DELETE SECRET key
4933            QueryExpr::DeleteSecret { ref key } => {
4934                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4935                    RedDBError::Query(
4936                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
4937                    )
4938                })?;
4939                let deleted = auth_store
4940                    .vault_kv_try_delete(key)
4941                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4942                if deleted {
4943                    update_current_secret_value(key, None);
4944                }
4945                self.invalidate_result_cache();
4946                Ok(RuntimeQueryResult::ok_message(
4947                    query.to_string(),
4948                    &format!("secret deleted: {key}"),
4949                    if deleted {
4950                        "delete_secret"
4951                    } else {
4952                        "delete_secret_not_found"
4953                    },
4954                ))
4955            }
4956            // SHOW SECRET[S] [prefix]
4957            QueryExpr::ShowSecrets { ref prefix } => {
4958                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4959                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
4960                })?;
4961                if !auth_store.is_vault_backed() {
4962                    return Err(RedDBError::Query(
4963                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
4964                    ));
4965                }
4966                let mut keys = auth_store.vault_kv_keys();
4967                keys.sort();
4968                let mut result = UnifiedResult::with_columns(vec![
4969                    "key".into(),
4970                    "value".into(),
4971                    "status".into(),
4972                ]);
4973                for key in keys {
4974                    if let Some(ref pfx) = prefix {
4975                        if !key.starts_with(pfx) {
4976                            continue;
4977                        }
4978                    }
4979                    let mut record = UnifiedRecord::new();
4980                    record.set("key", Value::text(key));
4981                    record.set("value", Value::text("***"));
4982                    record.set("status", Value::text("active"));
4983                    result.push(record);
4984                }
4985                Ok(RuntimeQueryResult {
4986                    query: query.to_string(),
4987                    mode,
4988                    statement: "show_secrets",
4989                    engine: "runtime-secret",
4990                    result,
4991                    affected_rows: 0,
4992                    statement_type: "select",
4993                })
4994            }
            // SHOW CONFIG [prefix]
            QueryExpr::ShowConfig { ref prefix } => {
                let store = self.inner.db.store();
                let all_collections = store.list_collections();
                // No `red_config` collection yet means nothing was ever SET —
                // return an empty key/value result instead of erroring.
                if !all_collections.contains(&"red_config".to_string()) {
                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
                    return Ok(RuntimeQueryResult {
                        query: query.to_string(),
                        mode,
                        statement: "show_config",
                        engine: "runtime-config",
                        result,
                        affected_rows: 0,
                        statement_type: "select",
                    });
                }
                let manager = store
                    .get_collection("red_config")
                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
                let entities = manager.query_all(|_| true);
                // Latest-wins de-duplication: config writes append rows, so for
                // each key keep only the row with the highest entity id.
                // Map value layout: (entity_id, key value, config value).
                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
                for entity in entities {
                    if let EntityData::Row(ref row) = entity.data {
                        if let Some(ref named) = row.named {
                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
                            let val = named.get("value").cloned().unwrap_or(Value::Null);
                            // Only textual keys participate; rows with a
                            // non-text `key` column are skipped entirely.
                            let key_str = match &key_val {
                                Value::Text(s) => s.as_ref(),
                                _ => continue,
                            };
                            if let Some(ref pfx) = prefix {
                                if !key_str.starts_with(pfx.as_str()) {
                                    continue;
                                }
                            }
                            let entity_id = entity.id.raw();
                            // Insert unless an already-seen row for this key
                            // has a strictly higher id.
                            match latest.get(key_str) {
                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
                                _ => {
                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
                                }
                            }
                        }
                    }
                }
                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
                // BTreeMap iteration yields rows in sorted key order.
                for (_, key_val, val) in latest.into_values() {
                    let mut record = UnifiedRecord::new();
                    record.set("key", key_val);
                    record.set("value", val);
                    result.push(record);
                }
                Ok(RuntimeQueryResult {
                    query: query.to_string(),
                    mode,
                    statement: "show_config",
                    engine: "runtime-config",
                    result,
                    affected_rows: 0,
                    statement_type: "select",
                })
            }
5057            // Session-local multi-tenancy handle (Phase 2.5.3).
5058            //
5059            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
5060            // the thread-local; SHOW TENANT returns it. Paired with the
5061            // CURRENT_TENANT() scalar for use in RLS policies.
5062            QueryExpr::SetTenant(ref value) => {
5063                match value {
5064                    Some(id) => set_current_tenant(id.clone()),
5065                    None => clear_current_tenant(),
5066                }
5067                Ok(RuntimeQueryResult::ok_message(
5068                    query.to_string(),
5069                    &match value {
5070                        Some(id) => format!("tenant set: {id}"),
5071                        None => "tenant cleared".to_string(),
5072                    },
5073                    "set_tenant",
5074                ))
5075            }
5076            QueryExpr::ShowTenant => {
5077                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
5078                let mut record = UnifiedRecord::new();
5079                record.set(
5080                    "tenant",
5081                    current_tenant().map(Value::text).unwrap_or(Value::Null),
5082                );
5083                result.push(record);
5084                Ok(RuntimeQueryResult {
5085                    query: query.to_string(),
5086                    mode,
5087                    statement: "show_tenant",
5088                    engine: "runtime-tenant",
5089                    result,
5090                    affected_rows: 0,
5091                    statement_type: "select",
5092                })
5093            }
5094            // Transaction control (Phase 2.3 PG parity).
5095            //
5096            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
5097            // the current connection's id. COMMIT/ROLLBACK release it through
5098            // the `SnapshotManager` so future snapshots see the correct set of
5099            // active/aborted transactions.
5100            //
5101            // Tuple stamping (xmin/xmax) and read-path visibility filtering
5102            // land in Phase 2.3.2 — this dispatch only manages the snapshot
5103            // registry. Statements running outside a TxnContext still behave
5104            // as autocommit (xid=0 → visible to every snapshot).
            QueryExpr::TransactionControl(ref ctl) => {
                use crate::storage::query::ast::TxnControl;
                use crate::storage::transaction::snapshot::{TxnContext, Xid};
                use crate::storage::transaction::IsolationLevel;

                // Phase 2.3 keys transactions by a thread-local connection id.
                // The stdio/gRPC paths wire a real per-connection id later;
                // for embedded use (one RedDBRuntime per process-ish caller)
                // we fall back to a deterministic placeholder.
                let conn_id = current_connection_id();

                let (kind, msg) = match ctl {
                    TxnControl::Begin => {
                        // Allocate a fresh xid + snapshot and register the
                        // context under this connection. Subsequent statements
                        // on the connection run against this snapshot until
                        // COMMIT/ROLLBACK removes the entry.
                        let mgr = Arc::clone(&self.inner.snapshot_manager);
                        let xid = mgr.begin();
                        let snapshot = mgr.snapshot(xid);
                        let ctx = TxnContext {
                            xid,
                            isolation: IsolationLevel::SnapshotIsolation,
                            snapshot,
                            savepoints: Vec::new(),
                            released_sub_xids: Vec::new(),
                        };
                        self.inner.tx_contexts.write().insert(conn_id, ctx);
                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
                    }
                    TxnControl::Commit => {
                        // SET LOCAL TENANT ends with the transaction.
                        self.inner.tx_local_tenants.write().remove(&conn_id);
                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
                        match ctx {
                            Some(ctx) => {
                                // Gather the txn's own xid plus every sub-xid
                                // (open and released savepoints) so conflict
                                // detection treats our writes as our own.
                                let mut own_xids = std::collections::HashSet::new();
                                own_xids.insert(ctx.xid);
                                for (_, sub) in &ctx.savepoints {
                                    own_xids.insert(*sub);
                                }
                                for sub in &ctx.released_sub_xids {
                                    own_xids.insert(*sub);
                                }
                                // First-committer-wins check: abort the whole
                                // txn (roll back every xid, revive stamped
                                // tuples, discard buffered events/WAL) when a
                                // concurrent committed txn wrote the same rows
                                // since our snapshot.
                                if let Err(err) = self.check_table_row_write_conflicts(
                                    conn_id,
                                    &ctx.snapshot,
                                    &own_xids,
                                ) {
                                    for (_, sub) in &ctx.savepoints {
                                        self.inner.snapshot_manager.rollback(*sub);
                                    }
                                    for sub in &ctx.released_sub_xids {
                                        self.inner.snapshot_manager.rollback(*sub);
                                    }
                                    self.inner.snapshot_manager.rollback(ctx.xid);
                                    self.revive_pending_versioned_updates(conn_id);
                                    self.revive_pending_tombstones(conn_id);
                                    self.discard_pending_kv_watch_events(conn_id);
                                    self.discard_pending_store_wal_actions(conn_id);
                                    return Err(err);
                                }
                                self.restore_pending_write_stamps(conn_id);
                                // Flush buffered WAL actions before marking
                                // the xid committed; on failure the whole txn
                                // rolls back.
                                // NOTE(review): unlike the conflict path above,
                                // this failure path does not call
                                // discard_pending_store_wal_actions — confirm
                                // a failed flush consumes the pending actions.
                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
                                    for (_, sub) in &ctx.savepoints {
                                        self.inner.snapshot_manager.rollback(*sub);
                                    }
                                    for sub in &ctx.released_sub_xids {
                                        self.inner.snapshot_manager.rollback(*sub);
                                    }
                                    self.inner.snapshot_manager.rollback(ctx.xid);
                                    self.revive_pending_versioned_updates(conn_id);
                                    self.revive_pending_tombstones(conn_id);
                                    self.discard_pending_kv_watch_events(conn_id);
                                    return Err(err);
                                }
                                // Phase 2.3.2e: commit every open sub-xid
                                // so they also become visible. Their
                                // work is promoted to the parent txn's
                                // result exactly like a RELEASE would
                                // have done.
                                for (_, sub) in &ctx.savepoints {
                                    self.inner.snapshot_manager.commit(*sub);
                                }
                                for sub in &ctx.released_sub_xids {
                                    self.inner.snapshot_manager.commit(*sub);
                                }
                                self.inner.snapshot_manager.commit(ctx.xid);
                                // Finalize deferred side effects only after
                                // the xid is durably committed.
                                self.finalize_pending_versioned_updates(conn_id);
                                self.finalize_pending_tombstones(conn_id);
                                self.finalize_pending_kv_watch_events(conn_id);
                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
                            }
                            None => (
                                "commit",
                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
                            ),
                        }
                    }
                    TxnControl::Rollback => {
                        // SET LOCAL TENANT ends with the transaction.
                        self.inner.tx_local_tenants.write().remove(&conn_id);
                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
                        match ctx {
                            Some(ctx) => {
                                // Phase 2.3.2e: abort every open sub-xid
                                // too so their writes stay hidden.
                                for (_, sub) in &ctx.savepoints {
                                    self.inner.snapshot_manager.rollback(*sub);
                                }
                                for sub in &ctx.released_sub_xids {
                                    self.inner.snapshot_manager.rollback(*sub);
                                }
                                self.inner.snapshot_manager.rollback(ctx.xid);
                                // Phase 2.3.2b: tuples that the txn had
                                // xmax-stamped become live again — wipe xmax
                                // back to 0 so later snapshots see them.
                                self.revive_pending_versioned_updates(conn_id);
                                self.revive_pending_tombstones(conn_id);
                                self.discard_pending_kv_watch_events(conn_id);
                                self.discard_pending_store_wal_actions(conn_id);
                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
                            }
                            None => (
                                "rollback",
                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
                            ),
                        }
                    }
                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
                    // SAVEPOINT allocates a fresh xid and pushes it
                    // onto the per-txn stack so subsequent writes can
                    // be selectively rolled back. RELEASE pops without
                    // aborting; ROLLBACK TO aborts the sub-xid (and
                    // any nested ones) + revives their tombstones.
                    TxnControl::Savepoint(name) => {
                        let mgr = Arc::clone(&self.inner.snapshot_manager);
                        let mut guard = self.inner.tx_contexts.write();
                        match guard.get_mut(&conn_id) {
                            Some(ctx) => {
                                let sub = mgr.begin();
                                ctx.savepoints.push((name.clone(), sub));
                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
                            }
                            None => (
                                "savepoint",
                                "SAVEPOINT outside transaction — no-op".to_string(),
                            ),
                        }
                    }
                    TxnControl::ReleaseSavepoint(name) => {
                        let mut guard = self.inner.tx_contexts.write();
                        match guard.get_mut(&conn_id) {
                            Some(ctx) => {
                                // Unknown savepoint name is a hard error
                                // (propagates via `?` while the guard drops).
                                let pos = ctx
                                    .savepoints
                                    .iter()
                                    .position(|(n, _)| n == name)
                                    .ok_or_else(|| {
                                        RedDBError::Internal(format!(
                                            "savepoint {name} does not exist"
                                        ))
                                    })?;
                                // RELEASE pops the named savepoint and
                                // any nested ones. Their sub-xids move
                                // to `released_sub_xids` so they commit
                                // (or roll back) alongside the parent
                                // xid — PG semantics: released
                                // savepoints still contribute their
                                // work, but their names are gone.
                                let released = ctx.savepoints.len() - pos;
                                let popped: Vec<Xid> = ctx
                                    .savepoints
                                    .split_off(pos)
                                    .into_iter()
                                    .map(|(_, x)| x)
                                    .collect();
                                ctx.released_sub_xids.extend(popped);
                                (
                                    "release_savepoint",
                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
                                )
                            }
                            None => (
                                "release_savepoint",
                                "RELEASE outside transaction — no-op".to_string(),
                            ),
                        }
                    }
                    TxnControl::RollbackToSavepoint(name) => {
                        let mgr = Arc::clone(&self.inner.snapshot_manager);
                        // Splice out the savepoint + nested ones under
                        // a narrow lock, then run the snapshot-manager
                        // + tombstone side-effects without the tx map
                        // held so nothing re-enters.
                        let drop_result: Option<(Xid, Vec<Xid>)> = {
                            let mut guard = self.inner.tx_contexts.write();
                            if let Some(ctx) = guard.get_mut(&conn_id) {
                                let pos = ctx
                                    .savepoints
                                    .iter()
                                    .position(|(n, _)| n == name)
                                    .ok_or_else(|| {
                                        RedDBError::Internal(format!(
                                            "savepoint {name} does not exist"
                                        ))
                                    })?;
                                let savepoint_xid = ctx.savepoints[pos].1;
                                // `aborted` includes the named savepoint's
                                // own sub-xid plus every nested one.
                                let aborted: Vec<Xid> = ctx
                                    .savepoints
                                    .split_off(pos)
                                    .into_iter()
                                    .map(|(_, x)| x)
                                    .collect();
                                Some((savepoint_xid, aborted))
                            } else {
                                None
                            }
                        };

                        match drop_result {
                            Some((savepoint_xid, aborted)) => {
                                for x in &aborted {
                                    mgr.rollback(*x);
                                }
                                // Undo write-side effects stamped at or after
                                // the savepoint's sub-xid.
                                let reverted_updates =
                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
                                (
                                    "rollback_to_savepoint",
                                    format!(
                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
                                        aborted.len(),
                                    ),
                                )
                            }
                            None => (
                                "rollback_to_savepoint",
                                "ROLLBACK TO outside transaction — no-op".to_string(),
                            ),
                        }
                    }
                };
                Ok(RuntimeQueryResult::ok_message(
                    query.to_string(),
                    &msg,
                    kind,
                ))
            }
5349            // Schema + Sequence DDL (Phase 1.3 PG parity).
5350            //
5351            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5352            // just registers the name in `red_config` under `schema.{name}`.
5353            // Table lookups still happen by collection name; clients using
5354            // `schema.table` qualified names collapse to collection `schema.table`.
5355            //
5356            // Sequences persist a 64-bit counter + metadata (start, increment)
5357            // in `red_config` under `sequence.{name}.*`. Scalar callers
5358            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
5359            // once we have a proper mutating-function dispatch path; for now the
5360            // DDL just establishes the catalog entry so clients don't error.
5361            QueryExpr::CreateSchema(ref q) => {
5362                let store = self.inner.db.store();
5363                let key = format!("schema.{}", q.name);
5364                if store.get_config(&key).is_some() {
5365                    if q.if_not_exists {
5366                        return Ok(RuntimeQueryResult::ok_message(
5367                            query.to_string(),
5368                            &format!("schema {} already exists — skipped", q.name),
5369                            "create_schema",
5370                        ));
5371                    }
5372                    return Err(RedDBError::Internal(format!(
5373                        "schema {} already exists",
5374                        q.name
5375                    )));
5376                }
5377                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5378                Ok(RuntimeQueryResult::ok_message(
5379                    query.to_string(),
5380                    &format!("schema {} created", q.name),
5381                    "create_schema",
5382                ))
5383            }
5384            QueryExpr::DropSchema(ref q) => {
5385                let store = self.inner.db.store();
5386                let key = format!("schema.{}", q.name);
5387                let existed = store.get_config(&key).is_some();
5388                if !existed && !q.if_exists {
5389                    return Err(RedDBError::Internal(format!(
5390                        "schema {} does not exist",
5391                        q.name
5392                    )));
5393                }
5394                // Remove marker from red_config via set to null.
5395                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5396                let suffix = if q.cascade {
5397                    " (CASCADE accepted — tables untouched)"
5398                } else {
5399                    ""
5400                };
5401                Ok(RuntimeQueryResult::ok_message(
5402                    query.to_string(),
5403                    &format!("schema {} dropped{}", q.name, suffix),
5404                    "drop_schema",
5405                ))
5406            }
            QueryExpr::CreateSequence(ref q) => {
                let store = self.inner.db.store();
                // Sequence state lives under three config keys:
                // sequence.{name}.{start,increment,current}.
                let base = format!("sequence.{}", q.name);
                let start_key = format!("{base}.start");
                let incr_key = format!("{base}.increment");
                let curr_key = format!("{base}.current");
                // `.start` doubles as the existence marker for the sequence.
                if store.get_config(&start_key).is_some() {
                    if q.if_not_exists {
                        return Ok(RuntimeQueryResult::ok_message(
                            query.to_string(),
                            &format!("sequence {} already exists — skipped", q.name),
                            "create_sequence",
                        ));
                    }
                    return Err(RedDBError::Internal(format!(
                        "sequence {} already exists",
                        q.name
                    )));
                }
                // Persist start + increment, and set current so the first
                // nextval returns `start`.
                // NOTE(review): `start - increment` can overflow at the i64
                // extremes, and the `as f64` casts below lose precision beyond
                // 2^53 — confirm that's acceptable for the config store's
                // number representation.
                let initial_current = q.start - q.increment;
                store.set_config_tree(
                    &start_key,
                    &crate::serde_json::Value::Number(q.start as f64),
                );
                store.set_config_tree(
                    &incr_key,
                    &crate::serde_json::Value::Number(q.increment as f64),
                );
                store.set_config_tree(
                    &curr_key,
                    &crate::serde_json::Value::Number(initial_current as f64),
                );
                Ok(RuntimeQueryResult::ok_message(
                    query.to_string(),
                    &format!(
                        "sequence {} created (start={}, increment={})",
                        q.name, q.start, q.increment
                    ),
                    "create_sequence",
                ))
            }
5450            QueryExpr::DropSequence(ref q) => {
5451                let store = self.inner.db.store();
5452                let base = format!("sequence.{}", q.name);
5453                let existed = store.get_config(&format!("{base}.start")).is_some();
5454                if !existed && !q.if_exists {
5455                    return Err(RedDBError::Internal(format!(
5456                        "sequence {} does not exist",
5457                        q.name
5458                    )));
5459                }
5460                for k in ["start", "increment", "current"] {
5461                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5462                }
5463                Ok(RuntimeQueryResult::ok_message(
5464                    query.to_string(),
5465                    &format!("sequence {} dropped", q.name),
5466                    "drop_sequence",
5467                ))
5468            }
5469            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5470            //
5471            // The view definition is stored in-memory on RuntimeInner (not
5472            // persisted). SELECTs that reference the view name will substitute
5473            // the stored `QueryExpr` via `resolve_view_reference` during
5474            // planning (same entry point used by table-name resolution).
5475            //
5476            // Materialized views additionally allocate a slot in
5477            // `MaterializedViewCache`; a REFRESH repopulates that slot.
5478            QueryExpr::CreateView(ref q) => {
5479                let mut views = self.inner.views.write();
5480                if views.contains_key(&q.name) && !q.or_replace {
5481                    if q.if_not_exists {
5482                        return Ok(RuntimeQueryResult::ok_message(
5483                            query.to_string(),
5484                            &format!("view {} already exists — skipped", q.name),
5485                            "create_view",
5486                        ));
5487                    }
5488                    return Err(RedDBError::Internal(format!(
5489                        "view {} already exists",
5490                        q.name
5491                    )));
5492                }
5493                views.insert(q.name.clone(), Arc::new(q.clone()));
5494                drop(views);
5495
5496                // Materialized view: register cache slot (data is empty until REFRESH).
5497                if q.materialized {
5498                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5499                    let def = MaterializedViewDef {
5500                        name: q.name.clone(),
5501                        query: format!("<parsed view {}>", q.name),
5502                        dependencies: collect_table_refs(&q.query),
5503                        refresh: RefreshPolicy::Manual,
5504                    };
5505                    self.inner.materialized_views.write().register(def);
5506                }
5507                // Plan cache may have cached a plan that didn't know about this
5508                // view — invalidate so future references pick up the new binding.
5509                // Result cache gets flushed too: OR REPLACE must not serve a
5510                // prior execution of the obsolete body.
5511                self.invalidate_plan_cache();
5512                self.invalidate_result_cache();
5513
5514                Ok(RuntimeQueryResult::ok_message(
5515                    query.to_string(),
5516                    &format!(
5517                        "{}view {} created",
5518                        if q.materialized { "materialized " } else { "" },
5519                        q.name
5520                    ),
5521                    "create_view",
5522                ))
5523            }
5524            QueryExpr::DropView(ref q) => {
5525                let mut views = self.inner.views.write();
5526                let existed = views.remove(&q.name).is_some();
5527                drop(views);
5528                if q.materialized || existed {
5529                    // Try the materialised cache too — silent if absent.
5530                    self.inner.materialized_views.write().remove(&q.name);
5531                }
5532                // Drop any plan / result cache entries that baked the
5533                // view body into their QueryExpr.
5534                self.invalidate_plan_cache();
5535                self.invalidate_result_cache();
5536                if !existed && !q.if_exists {
5537                    return Err(RedDBError::Internal(format!(
5538                        "view {} does not exist",
5539                        q.name
5540                    )));
5541                }
5542                self.invalidate_plan_cache();
5543                Ok(RuntimeQueryResult::ok_message(
5544                    query.to_string(),
5545                    &format!("view {} dropped", q.name),
5546                    "drop_view",
5547                ))
5548            }
5549            QueryExpr::RefreshMaterializedView(ref q) => {
5550                // Look up the view definition, execute its underlying query,
5551                // and stash the serialized result in the materialised cache.
5552                let view = {
5553                    let views = self.inner.views.read();
5554                    views.get(&q.name).cloned()
5555                };
5556                let view = match view {
5557                    Some(v) => v,
5558                    None => {
5559                        return Err(RedDBError::Internal(format!(
5560                            "view {} does not exist",
5561                            q.name
5562                        )))
5563                    }
5564                };
5565                if !view.materialized {
5566                    return Err(RedDBError::Internal(format!(
5567                        "view {} is not materialized — REFRESH requires \
5568                         CREATE MATERIALIZED VIEW",
5569                        q.name
5570                    )));
5571                }
5572                // Execute the underlying query fresh.
5573                let inner_result = self.execute_query_expr((*view.query).clone())?;
5574                // Cache data = JSON-serialised result (opaque blob; read path
5575                // returns it verbatim for now).
5576                let serialized = format!("{:?}", inner_result.result);
5577                self.inner
5578                    .materialized_views
5579                    .write()
5580                    .refresh(&q.name, serialized.into_bytes());
5581                Ok(RuntimeQueryResult::ok_message(
5582                    query.to_string(),
5583                    &format!("materialized view {} refreshed", q.name),
5584                    "refresh_materialized_view",
5585                ))
5586            }
5587            // Row Level Security (Phase 2.5 PG parity).
5588            //
5589            // Policies live in an in-memory registry keyed by (table, name).
5590            // Enforcement (AND-ing the policy's USING clause into every
5591            // query's WHERE for the table) arrives in Phase 2.5.2 via the
5592            // filter compiler; this dispatch only manages the catalog.
5593            QueryExpr::CreatePolicy(ref q) => {
5594                let key = (q.table.clone(), q.name.clone());
5595                self.inner
5596                    .rls_policies
5597                    .write()
5598                    .insert(key, Arc::new(q.clone()));
5599                self.invalidate_plan_cache();
5600                // Issue #120 — surface policy names in the
5601                // schema-vocabulary so AskPipeline (#121) can resolve
5602                // a policy reference back to its table.
5603                self.schema_vocabulary_apply(
5604                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
5605                        collection: q.table.clone(),
5606                        policy: q.name.clone(),
5607                    },
5608                );
5609                Ok(RuntimeQueryResult::ok_message(
5610                    query.to_string(),
5611                    &format!("policy {} on {} created", q.name, q.table),
5612                    "create_policy",
5613                ))
5614            }
5615            QueryExpr::DropPolicy(ref q) => {
5616                let removed = self
5617                    .inner
5618                    .rls_policies
5619                    .write()
5620                    .remove(&(q.table.clone(), q.name.clone()))
5621                    .is_some();
5622                if !removed && !q.if_exists {
5623                    return Err(RedDBError::Internal(format!(
5624                        "policy {} on {} does not exist",
5625                        q.name, q.table
5626                    )));
5627                }
5628                self.invalidate_plan_cache();
5629                // Issue #120 — keep the schema-vocabulary policy
5630                // entry in sync.
5631                self.schema_vocabulary_apply(
5632                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
5633                        collection: q.table.clone(),
5634                        policy: q.name.clone(),
5635                    },
5636                );
5637                Ok(RuntimeQueryResult::ok_message(
5638                    query.to_string(),
5639                    &format!("policy {} on {} dropped", q.name, q.table),
5640                    "drop_policy",
5641                ))
5642            }
5643            // Foreign Data Wrappers (Phase 3.2 PG parity).
5644            //
5645            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
5646            // `ForeignTableRegistry`. The read path consults that registry
5647            // before dispatching a SELECT — when the table name matches a
5648            // registered foreign table, we forward the scan to the wrapper
5649            // and skip the normal collection lookup.
5650            //
5651            // Phase 3.2 is in-memory only; persistence across restarts is a
5652            // 3.2.2 follow-up that mirrors the view registry pattern.
5653            QueryExpr::CreateServer(ref q) => {
5654                use crate::storage::fdw::FdwOptions;
5655                let registry = Arc::clone(&self.inner.foreign_tables);
5656                if registry.server(&q.name).is_some() {
5657                    if q.if_not_exists {
5658                        return Ok(RuntimeQueryResult::ok_message(
5659                            query.to_string(),
5660                            &format!("server {} already exists — skipped", q.name),
5661                            "create_server",
5662                        ));
5663                    }
5664                    return Err(RedDBError::Internal(format!(
5665                        "server {} already exists",
5666                        q.name
5667                    )));
5668                }
5669                let mut opts = FdwOptions::new();
5670                for (k, v) in &q.options {
5671                    opts.values.insert(k.clone(), v.clone());
5672                }
5673                registry
5674                    .create_server(&q.name, &q.wrapper, opts)
5675                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5676                Ok(RuntimeQueryResult::ok_message(
5677                    query.to_string(),
5678                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
5679                    "create_server",
5680                ))
5681            }
5682            QueryExpr::DropServer(ref q) => {
5683                let existed = self.inner.foreign_tables.drop_server(&q.name);
5684                if !existed && !q.if_exists {
5685                    return Err(RedDBError::Internal(format!(
5686                        "server {} does not exist",
5687                        q.name
5688                    )));
5689                }
5690                Ok(RuntimeQueryResult::ok_message(
5691                    query.to_string(),
5692                    &format!(
5693                        "server {} dropped{}",
5694                        q.name,
5695                        if q.cascade { " (cascade)" } else { "" }
5696                    ),
5697                    "drop_server",
5698                ))
5699            }
5700            QueryExpr::CreateForeignTable(ref q) => {
5701                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
5702                let registry = Arc::clone(&self.inner.foreign_tables);
5703                if registry.foreign_table(&q.name).is_some() {
5704                    if q.if_not_exists {
5705                        return Ok(RuntimeQueryResult::ok_message(
5706                            query.to_string(),
5707                            &format!("foreign table {} already exists — skipped", q.name),
5708                            "create_foreign_table",
5709                        ));
5710                    }
5711                    return Err(RedDBError::Internal(format!(
5712                        "foreign table {} already exists",
5713                        q.name
5714                    )));
5715                }
5716                let mut opts = FdwOptions::new();
5717                for (k, v) in &q.options {
5718                    opts.values.insert(k.clone(), v.clone());
5719                }
5720                let columns: Vec<ForeignColumn> = q
5721                    .columns
5722                    .iter()
5723                    .map(|c| ForeignColumn {
5724                        name: c.name.clone(),
5725                        data_type: c.data_type.clone(),
5726                        not_null: c.not_null,
5727                    })
5728                    .collect();
5729                registry
5730                    .create_foreign_table(ForeignTable {
5731                        name: q.name.clone(),
5732                        server_name: q.server.clone(),
5733                        columns,
5734                        options: opts,
5735                    })
5736                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5737                self.invalidate_plan_cache();
5738                Ok(RuntimeQueryResult::ok_message(
5739                    query.to_string(),
5740                    &format!("foreign table {} created (server {})", q.name, q.server),
5741                    "create_foreign_table",
5742                ))
5743            }
5744            QueryExpr::DropForeignTable(ref q) => {
5745                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
5746                if !existed && !q.if_exists {
5747                    return Err(RedDBError::Internal(format!(
5748                        "foreign table {} does not exist",
5749                        q.name
5750                    )));
5751                }
5752                self.invalidate_plan_cache();
5753                Ok(RuntimeQueryResult::ok_message(
5754                    query.to_string(),
5755                    &format!("foreign table {} dropped", q.name),
5756                    "drop_foreign_table",
5757                ))
5758            }
5759            // COPY table FROM 'path' (Phase 1.5 PG parity).
5760            //
5761            // Stream CSV rows through the shared `CsvImporter`. The collection
5762            // is auto-created on first insert (via `insert_auto`-style path);
5763            // VACUUM/ANALYZE afterwards is up to the caller.
5764            QueryExpr::CopyFrom(ref q) => {
5765                use crate::storage::import::{CsvConfig, CsvImporter};
5766                let store = self.inner.db.store();
5767                let cfg = CsvConfig {
5768                    collection: q.table.clone(),
5769                    has_header: q.has_header,
5770                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
5771                    ..CsvConfig::default()
5772                };
5773                let importer = CsvImporter::new(cfg);
5774                let stats = importer
5775                    .import_file(&q.path, store.as_ref())
5776                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
5777                // Tables are written → invalidate cached plans / result cache.
5778                self.note_table_write(&q.table);
5779                Ok(RuntimeQueryResult::ok_message(
5780                    query.to_string(),
5781                    &format!(
5782                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
5783                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
5784                    ),
5785                    "copy_from",
5786                ))
5787            }
5788            // Maintenance commands (Phase 1.2 PG parity).
5789            //
5790            // - VACUUM [FULL] [table]: refreshes planner stats for the target
5791            //   collection(s) and — when FULL — triggers a full pager persist
5792            //   (flushes dirty pages + fsync). Also invalidates the result cache
5793            //   so subsequent reads re-execute against the freshly compacted
5794            //   storage. RedDB's segment/btree GC runs continuously via the
5795            //   background lifecycle; explicit space reclamation for sealed
5796            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
5797            // - ANALYZE [table]: reruns `analyze_collection` +
5798            //   `persist_table_stats` via `refresh_table_planner_stats` so the
5799            //   planner has fresh histograms, distinct estimates, null counts.
5800            //
5801            // Both commands accept an optional target; omitting the target
5802            // iterates every collection in the store.
5803            QueryExpr::MaintenanceCommand(ref cmd) => {
5804                use crate::storage::query::ast::MaintenanceCommand as Mc;
5805                let store = self.inner.db.store();
5806                let (kind, msg) = match cmd {
5807                    Mc::Analyze { target } => {
5808                        let targets: Vec<String> = match target {
5809                            Some(t) => vec![t.clone()],
5810                            None => store.list_collections(),
5811                        };
5812                        for t in &targets {
5813                            self.refresh_table_planner_stats(t);
5814                        }
5815                        (
5816                            "analyze",
5817                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
5818                        )
5819                    }
5820                    Mc::Vacuum { target, full } => {
5821                        let targets: Vec<String> = match target {
5822                            Some(t) => vec![t.clone()],
5823                            None => store.list_collections(),
5824                        };
5825                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
5826                        let mut vacuum_stats =
5827                            crate::storage::unified::store::MvccVacuumStats::default();
5828                        for t in &targets {
5829                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
5830                                RedDBError::Internal(format!(
5831                                    "VACUUM MVCC history failed for {t}: {e}"
5832                                ))
5833                            })?;
5834                            if stats.reclaimed_versions > 0 {
5835                                self.rebuild_runtime_indexes_for_table(t)?;
5836                            }
5837                            vacuum_stats.add(&stats);
5838                        }
5839                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
5840                        // Stats refresh covers every target (same as ANALYZE).
5841                        for t in &targets {
5842                            self.refresh_table_planner_stats(t);
5843                        }
5844                        // FULL forces a pager persist (dirty-page flush + fsync).
5845                        // Regular VACUUM relies on the background writer / segment
5846                        // lifecycle so the command is non-blocking.
5847                        let persisted = if *full {
5848                            match store.persist() {
5849                                Ok(()) => true,
5850                                Err(e) => {
5851                                    return Err(RedDBError::Internal(format!(
5852                                        "VACUUM FULL persist failed: {e:?}"
5853                                    )));
5854                                }
5855                            }
5856                        } else {
5857                            false
5858                        };
5859                        // Result cache depended on pre-vacuum state.
5860                        self.invalidate_result_cache();
5861                        (
5862                            "vacuum",
5863                            format!(
5864                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
5865                                if *full { " FULL" } else { "" },
5866                                targets.len(),
5867                                vacuum_stats.scanned_versions,
5868                                vacuum_stats.retained_versions,
5869                                vacuum_stats.reclaimed_versions,
5870                                vacuum_stats.retained_history_versions,
5871                                vacuum_stats.reclaimed_history_versions,
5872                                vacuum_stats.retained_tombstones,
5873                                vacuum_stats.reclaimed_tombstones,
5874                                if persisted {
5875                                    " (pages flushed to disk)"
5876                                } else {
5877                                    ""
5878                                }
5879                            ),
5880                        )
5881                    }
5882                };
5883                Ok(RuntimeQueryResult::ok_message(
5884                    query.to_string(),
5885                    &msg,
5886                    kind,
5887                ))
5888            }
5889            // GRANT / REVOKE / ALTER USER (RBAC milestone).
5890            //
5891            // These hit the AuthStore directly. The privilege-check
5892            // gate at the top of `execute_query_expr` already decided
5893            // whether the caller may even run the statement; here we
5894            // just translate the AST into AuthStore calls.
5895            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
5896            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
5897            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
5898            QueryExpr::CreateIamPolicy { ref id, ref json } => {
5899                self.execute_create_iam_policy(query, id, json)
5900            }
5901            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
5902            QueryExpr::AttachPolicy {
5903                ref policy_id,
5904                ref principal,
5905            } => self.execute_attach_policy(query, policy_id, principal),
5906            QueryExpr::DetachPolicy {
5907                ref policy_id,
5908                ref principal,
5909            } => self.execute_detach_policy(query, policy_id, principal),
5910            QueryExpr::ShowPolicies { ref filter } => {
5911                self.execute_show_policies(query, filter.as_ref())
5912            }
5913            QueryExpr::ShowEffectivePermissions {
5914                ref user,
5915                ref resource,
5916            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
5917            QueryExpr::SimulatePolicy {
5918                ref user,
5919                ref action,
5920                ref resource,
5921            } => self.execute_simulate_policy(query, user, action, resource),
5922            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
5923            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
5924            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
5925            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
5926        };
5927
5928        // Decrypt Value::Secret columns in-place before caching, so
5929        // cached results match the post-decrypt shape and repeat
5930        // queries skip the per-row AES-GCM pass.
5931        let mut query_result = query_result;
5932        if let Ok(ref mut result) = query_result {
5933            if result.statement_type == "select" {
5934                self.apply_secret_decryption(result);
5935            }
5936        }
5937
5938        // Cache SELECT results for 30s.
5939        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
5940        // Large multi-row results (range scans, filtered scans) are rarely
5941        // repeated with the same literal values so the cache hit rate is near
5942        // zero while the clone cost (100 records × ~16 fields each) is high.
5943        // Aggregations (1 row) and point lookups (1 row) still benefit.
5944        if let Ok(ref result) = query_result {
5945            frame.write_result_cache(self, result, result_cache_scopes);
5946        }
5947
5948        query_result
5949    }
5950
5951    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
5952    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
5953    /// calls pay zero parse + cache overhead.
5954    ///
5955    /// Applies secret decryption on SELECT results, identical to `execute_query`.
5956    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
5957        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
5958        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
5959        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
5960        // whose `tq.table` matches a registered view with the view's
5961        // underlying query. Safe to call even when no views are registered.
5962        let expr = self.rewrite_view_refs(expr);
5963
5964        self.validate_model_operations_before_auth(&expr)?;
5965        // Granular RBAC privilege check. Runs before dispatch so a
5966        // denied caller never reaches storage. Fail-closed: any error
5967        // resolving the action / resource produces PermissionDenied.
5968        if let Err(err) = self.check_query_privilege(&expr) {
5969            return Err(RedDBError::Query(format!("permission denied: {err}")));
5970        }
5971
5972        let statement = query_expr_name(&expr);
5973        let mode = detect_mode(statement);
5974        let query_str = statement;
5975
5976        let result = self.dispatch_expr(expr, query_str, mode)?;
5977        let mut r = result;
5978        if r.statement_type == "select" {
5979            self.apply_secret_decryption(&mut r);
5980        }
5981        Ok(r)
5982    }
5983
    /// Pre-auth validation for statements that target a named collection:
    /// (1) reject destructive statements aimed at the read-only system
    /// schema, and (2) cross-check the model a statement declares (table /
    /// graph / vector / document / kv / vault) against the model recorded
    /// in the catalog, via `polymorphic_resolver::ensure_model_match`.
    ///
    /// Runs before the RBAC privilege gate (see `execute_query_expr`), so a
    /// model mismatch surfaces as a clear error rather than a permission one.
    ///
    /// # Errors
    /// - `RedDBError::Query("system schema is read-only")` when a drop /
    ///   truncate targets a system-schema collection.
    /// - Whatever `ensure_model_match` returns on a declared-vs-actual
    ///   model mismatch.
    /// - Errors bubbled up from `validate_config_command_before_auth` for
    ///   `ConfigCommand` statements.
    pub(super) fn validate_model_operations_before_auth(
        &self,
        expr: &QueryExpr,
    ) -> RedDBResult<()> {
        use crate::catalog::CollectionModel;
        use crate::runtime::ddl::polymorphic_resolver;
        use crate::storage::query::ast::KvCommand;

        // Step 1: destructive statements may not touch the system schema.
        let system_schema_target = match expr {
            QueryExpr::DropTable(q) => Some(q.name.as_str()),
            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
            QueryExpr::DropVector(q) => Some(q.name.as_str()),
            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
            QueryExpr::DropKv(q) => Some(q.name.as_str()),
            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
            QueryExpr::Truncate(q) => Some(q.name.as_str()),
            _ => None,
        };
        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
            return Err(RedDBError::Query("system schema is read-only".to_string()));
        }

        // Step 2: collect (collection name, expected model) for statements
        // that declare a model, so it can be checked against the catalog.
        let expected = match expr {
            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
            // TRUNCATE's model is optional — no declared model, no check.
            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
            QueryExpr::KvCommand(cmd) => {
                let (collection, model) = match cmd {
                    // These variants carry an explicit model on the AST node.
                    KvCommand::Put {
                        collection, model, ..
                    }
                    | KvCommand::Get {
                        collection, model, ..
                    }
                    | KvCommand::Incr {
                        collection, model, ..
                    }
                    | KvCommand::Cas {
                        collection, model, ..
                    }
                    | KvCommand::Delete {
                        collection, model, ..
                    } => (collection.as_str(), *model),
                    // Vault-only operations imply the Vault model.
                    KvCommand::Rotate { collection, .. }
                    | KvCommand::History { collection, .. }
                    | KvCommand::List { collection, .. }
                    | KvCommand::Purge { collection, .. } => {
                        (collection.as_str(), CollectionModel::Vault)
                    }
                    // Tag invalidation exists only for plain KV collections.
                    KvCommand::InvalidateTags { collection, .. } => {
                        (collection.as_str(), CollectionModel::Kv)
                    }
                    KvCommand::Watch {
                        collection, model, ..
                    } => (collection.as_str(), *model),
                    // UNSEAL is a vault operation as well.
                    KvCommand::Unseal { collection, .. } => {
                        (collection.as_str(), CollectionModel::Vault)
                    }
                };
                Some((collection, model))
            }
            QueryExpr::ConfigCommand(cmd) => {
                // Config commands have their own pre-auth validation and
                // never target a model-bearing collection.
                self.validate_config_command_before_auth(cmd)?;
                None
            }
            _ => None,
        };

        // Statements with no declared model need no catalog check.
        let Some((name, expected_model)) = expected else {
            return Ok(());
        };
        // A collection missing from the catalog snapshot is not an error
        // here — downstream dispatch (and any `if_exists` handling) decides
        // how to treat a missing target. `declared_model` wins over the
        // inferred `model` when both are recorded.
        let snapshot = self.inner.db.catalog_model_snapshot();
        let Some(actual_model) = snapshot
            .collections
            .iter()
            .find(|collection| collection.name == name)
            .map(|collection| collection.declared_model.unwrap_or(collection.model))
        else {
            return Ok(());
        };
        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
    }
6069
6070    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
6071    /// `tq.table` matches a registered view name with the view's stored
6072    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
6073    /// resolves correctly. Pure operation — no side effects.
6074    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
6075        // Fast path: no views registered → return original expression.
6076        if self.inner.views.read().is_empty() {
6077            return expr;
6078        }
6079        self.rewrite_view_refs_inner(expr)
6080    }
6081
6082    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
6083        use crate::storage::query::ast::{Filter, TableSource};
6084        match expr {
6085            QueryExpr::Table(mut tq) => {
6086                // 1. If the TableSource is a subquery, recurse into it so
6087                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
6088                //    The legacy `table` field (set to a synthetic
6089                //    "__subq_NNNN" sentinel) stays as-is so callers that
6090                //    read it keep compiling.
6091                if let Some(TableSource::Subquery(body)) = tq.source.take() {
6092                    tq.source = Some(TableSource::Subquery(Box::new(
6093                        self.rewrite_view_refs_inner(*body),
6094                    )));
6095                    return QueryExpr::Table(tq);
6096                }
6097
6098                // 2. Restore the source field (took it above for match).
6099                // When the source was `None` or `TableSource::Name(_)`, the
6100                // real lookup key is `tq.table` — check the view registry.
6101                let maybe_view = {
6102                    let views = self.inner.views.read();
6103                    views.get(&tq.table).cloned()
6104                };
6105                let Some(view) = maybe_view else {
6106                    return QueryExpr::Table(tq);
6107                };
6108
6109                // Recurse into the view body — views may reference other
6110                // views. The recursion yields the final QueryExpr we need
6111                // to merge the outer's filter / limit / offset into.
6112                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
6113
6114                // Phase 5: when the body is a Table we merge the outer
6115                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
6116                // views filter recursively. Non-table bodies (Search,
6117                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
6118                // with an outer Table query today — return the body
6119                // verbatim; outer predicates are lost. Full projection
6120                // merge lands in Phase 5.2.
6121                match inner_expr {
6122                    QueryExpr::Table(mut inner_tq) => {
6123                        if let Some(outer_filter) = tq.filter.take() {
6124                            inner_tq.filter = Some(match inner_tq.filter.take() {
6125                                Some(existing) => {
6126                                    Filter::And(Box::new(existing), Box::new(outer_filter))
6127                                }
6128                                None => outer_filter,
6129                            });
6130                        }
6131                        if let Some(outer_limit) = tq.limit {
6132                            inner_tq.limit = Some(match inner_tq.limit {
6133                                Some(existing) => existing.min(outer_limit),
6134                                None => outer_limit,
6135                            });
6136                        }
6137                        if let Some(outer_offset) = tq.offset {
6138                            inner_tq.offset = Some(match inner_tq.offset {
6139                                Some(existing) => existing + outer_offset,
6140                                None => outer_offset,
6141                            });
6142                        }
6143                        QueryExpr::Table(inner_tq)
6144                    }
6145                    other => other,
6146                }
6147            }
6148            QueryExpr::Join(mut jq) => {
6149                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
6150                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
6151                QueryExpr::Join(jq)
6152            }
6153            // Other variants don't carry nested QueryExpr that can reference
6154            // a view by table name. Return as-is.
6155            other => other,
6156        }
6157    }
6158
6159    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
6160    /// Shared by `execute_query` (after parse/cache) and `execute_query_expr`
6161    /// (direct call from prepared-statement handler).
6162    fn authorize_relational_table_select(
6163        &self,
6164        mut table: TableQuery,
6165        frame: &dyn super::statement_frame::ReadFrame,
6166    ) -> RedDBResult<Option<TableQuery>> {
6167        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6168            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6169            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6170            return Ok(Some(table));
6171        }
6172
6173        self.check_table_column_projection_authz(&table, frame)?;
6174
6175        if self.inner.rls_enabled_tables.read().contains(&table.table) {
6176            return Ok(inject_rls_filters(self, frame, table));
6177        }
6178
6179        Ok(Some(table))
6180    }
6181
6182    fn authorize_relational_join_select(
6183        &self,
6184        mut join: JoinQuery,
6185        frame: &dyn super::statement_frame::ReadFrame,
6186    ) -> RedDBResult<Option<JoinQuery>> {
6187        self.check_join_column_projection_authz(&join, frame)?;
6188        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
6189        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
6190        Ok(inject_rls_into_join(self, frame, join))
6191    }
6192
6193    fn authorize_relational_join_child(
6194        &self,
6195        expr: QueryExpr,
6196        frame: &dyn super::statement_frame::ReadFrame,
6197    ) -> RedDBResult<QueryExpr> {
6198        match expr {
6199            QueryExpr::Table(mut table) => {
6200                if let Some(TableSource::Subquery(inner)) = table.source.take() {
6201                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6202                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6203                }
6204                Ok(QueryExpr::Table(table))
6205            }
6206            QueryExpr::Join(join) => self
6207                .authorize_relational_join_select(join, frame)?
6208                .map(QueryExpr::Join)
6209                .ok_or_else(|| {
6210                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6211                }),
6212            other => Ok(other),
6213        }
6214    }
6215
6216    fn authorize_relational_select_expr(
6217        &self,
6218        expr: QueryExpr,
6219        frame: &dyn super::statement_frame::ReadFrame,
6220    ) -> RedDBResult<QueryExpr> {
6221        match expr {
6222            QueryExpr::Table(table) => self
6223                .authorize_relational_table_select(table, frame)?
6224                .map(QueryExpr::Table)
6225                .ok_or_else(|| {
6226                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6227                }),
6228            QueryExpr::Join(join) => self
6229                .authorize_relational_join_select(join, frame)?
6230                .map(QueryExpr::Join)
6231                .ok_or_else(|| {
6232                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6233                }),
6234            other => Ok(other),
6235        }
6236    }
6237
6238    fn check_table_column_projection_authz(
6239        &self,
6240        table: &TableQuery,
6241        frame: &dyn super::statement_frame::ReadFrame,
6242    ) -> RedDBResult<()> {
6243        let Some((username, role)) = frame.identity() else {
6244            return Ok(());
6245        };
6246        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6247            return Ok(());
6248        };
6249
6250        let columns = self.resolved_table_projection_columns(table)?;
6251        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6252        let principal = UserId::from_parts(frame.effective_scope(), username);
6253        let ctx = runtime_iam_context(role, frame.effective_scope());
6254        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6255        if outcome.allowed() {
6256            return Ok(());
6257        }
6258
6259        if let Some(denied) = outcome.first_denied_column() {
6260            return Err(RedDBError::Query(format!(
6261                "permission denied: principal=`{username}` cannot select column `{}`",
6262                denied.resource.name
6263            )));
6264        }
6265        Err(RedDBError::Query(format!(
6266            "permission denied: principal=`{username}` cannot select table `{}`",
6267            table.table
6268        )))
6269    }
6270
6271    fn check_join_column_projection_authz(
6272        &self,
6273        join: &JoinQuery,
6274        frame: &dyn super::statement_frame::ReadFrame,
6275    ) -> RedDBResult<()> {
6276        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6277        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6278        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6279
6280        for (table, columns) in by_table {
6281            let query = TableQuery {
6282                table,
6283                source: None,
6284                alias: None,
6285                select_items: Vec::new(),
6286                columns: columns.into_iter().map(Projection::Column).collect(),
6287                where_expr: None,
6288                filter: None,
6289                group_by_exprs: Vec::new(),
6290                group_by: Vec::new(),
6291                having_expr: None,
6292                having: None,
6293                order_by: Vec::new(),
6294                limit: None,
6295                limit_param: None,
6296                offset: None,
6297                offset_param: None,
6298                expand: None,
6299                as_of: None,
6300            };
6301            self.check_table_column_projection_authz(&query, frame)?;
6302        }
6303        Ok(())
6304    }
6305
6306    fn collect_join_projection_columns(
6307        &self,
6308        join: &JoinQuery,
6309        projections: &[Projection],
6310        out: &mut HashMap<String, BTreeSet<String>>,
6311    ) -> RedDBResult<()> {
6312        let left = table_side_context(join.left.as_ref());
6313        let right = table_side_context(join.right.as_ref());
6314
6315        if projections
6316            .iter()
6317            .any(|projection| matches!(projection, Projection::All))
6318        {
6319            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6320                out.entry(side.table.clone())
6321                    .or_default()
6322                    .extend(self.table_all_projection_columns(&side.table)?);
6323            }
6324            return Ok(());
6325        }
6326
6327        for projection in projections {
6328            collect_projection_columns_for_join_side(
6329                projection,
6330                left.as_ref(),
6331                right.as_ref(),
6332                out,
6333            )?;
6334        }
6335        Ok(())
6336    }
6337
6338    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6339        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6340        if projections
6341            .iter()
6342            .any(|projection| matches!(projection, Projection::All))
6343        {
6344            return self.table_all_projection_columns(&table.table);
6345        }
6346
6347        let mut columns = BTreeSet::new();
6348        for projection in &projections {
6349            collect_projection_columns_for_table(
6350                projection,
6351                &table.table,
6352                table.alias.as_deref(),
6353                &mut columns,
6354            );
6355        }
6356        Ok(columns.into_iter().collect())
6357    }
6358
6359    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
6360        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
6361            let columns: Vec<String> = contract
6362                .declared_columns
6363                .iter()
6364                .map(|column| column.name.clone())
6365                .collect();
6366            if !columns.is_empty() {
6367                return Ok(columns);
6368            }
6369        }
6370
6371        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
6372        Ok(records
6373            .first()
6374            .map(|record| {
6375                record
6376                    .column_names()
6377                    .into_iter()
6378                    .map(|column| column.to_string())
6379                    .collect()
6380            })
6381            .unwrap_or_default())
6382    }
6383
    /// Resolve every expression-position subquery inside a single-table
    /// SELECT: the derived-table source, select-list expressions, WHERE,
    /// HAVING, GROUP BY expressions and ORDER BY expressions. Each scalar
    /// subquery is executed and folded into a literal by
    /// `resolve_expr_subqueries`.
    fn resolve_table_expr_subqueries(
        &self,
        mut table: TableQuery,
        frame: &dyn super::statement_frame::ReadFrame,
    ) -> RedDBResult<TableQuery> {
        // A derived-table source recurses through the SELECT-shaped resolver.
        if let Some(TableSource::Subquery(inner)) = table.source.take() {
            let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
            table.source = Some(TableSource::Subquery(Box::new(inner)));
        }

        // Snapshot the relation scopes BEFORE any expression rewriting:
        // `execute_expr_subquery_values` uses them to detect (and reject)
        // correlated subqueries, so they must reflect the original query.
        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
        for item in &mut table.select_items {
            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
            }
        }
        if let Some(where_expr) = table.where_expr.take() {
            table.where_expr =
                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
            // Clear the lowered `filter` so it cannot disagree with the
            // rewritten WHERE (presumably re-derived downstream — note:
            // derivation happens outside this view; confirm at call sites).
            table.filter = None;
        }
        if let Some(having_expr) = table.having_expr.take() {
            table.having_expr =
                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
            // Same invalidation as `filter` above, for the HAVING clause.
            table.having = None;
        }
        for expr in &mut table.group_by_exprs {
            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
        }
        for clause in &mut table.order_by {
            if let Some(expr) = clause.expr.take() {
                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
            }
        }
        Ok(table)
    }
6420
6421    fn resolve_select_expr_subqueries(
6422        &self,
6423        expr: QueryExpr,
6424        frame: &dyn super::statement_frame::ReadFrame,
6425    ) -> RedDBResult<QueryExpr> {
6426        match expr {
6427            QueryExpr::Table(table) => self
6428                .resolve_table_expr_subqueries(table, frame)
6429                .map(QueryExpr::Table),
6430            QueryExpr::Join(mut join) => {
6431                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
6432                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
6433                Ok(QueryExpr::Join(join))
6434            }
6435            other => Ok(other),
6436        }
6437    }
6438
    /// Bottom-up rewrite of a scalar expression tree that replaces every
    /// `Expr::Subquery` with the literal value it evaluates to (via
    /// `execute_expr_subquery_values`). All composite variants recurse
    /// into their children; leaves come back untouched.
    ///
    /// `outer_scopes` carries the enclosing query's relation names so the
    /// subquery executor can reject correlated references.
    fn resolve_expr_subqueries(
        &self,
        expr: crate::storage::query::ast::Expr,
        outer_scopes: &[String],
        frame: &dyn super::statement_frame::ReadFrame,
    ) -> RedDBResult<crate::storage::query::ast::Expr> {
        use crate::storage::query::ast::Expr;

        match expr {
            // Scalar position: more than one row is an error; zero rows
            // fold to SQL NULL.
            Expr::Subquery { query, span } => {
                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
                if values.len() > 1 {
                    return Err(RedDBError::Query(
                        "scalar subquery returned more than one row".to_string(),
                    ));
                }
                Ok(Expr::Literal {
                    value: values.into_iter().next().unwrap_or(Value::Null),
                    span,
                })
            }
            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
                op,
                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
                span,
            }),
            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
                op,
                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
                span,
            }),
            Expr::Cast {
                inner,
                target,
                span,
            } => Ok(Expr::Cast {
                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
                target,
                span,
            }),
            Expr::FunctionCall { name, args, span } => {
                let args = args
                    .into_iter()
                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
                    .collect::<RedDBResult<Vec<_>>>()?;
                Ok(Expr::FunctionCall { name, args, span })
            }
            Expr::Case {
                branches,
                else_,
                span,
            } => {
                // Both the condition and the value of each WHEN arm may
                // contain subqueries, as may the ELSE arm.
                let branches = branches
                    .into_iter()
                    .map(|(cond, value)| {
                        Ok((
                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
                        ))
                    })
                    .collect::<RedDBResult<Vec<_>>>()?;
                let else_ = else_
                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
                    .transpose()?
                    .map(Box::new);
                Ok(Expr::Case {
                    branches,
                    else_,
                    span,
                })
            }
            Expr::IsNull {
                operand,
                negated,
                span,
            } => Ok(Expr::IsNull {
                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
                negated,
                span,
            }),
            Expr::InList {
                target,
                values,
                negated,
                span,
            } => {
                let target =
                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
                // `IN (SELECT ...)` is the one place a subquery may yield
                // multiple rows: every value it returns becomes a literal
                // list element. Plain expressions recurse as usual.
                let mut resolved = Vec::new();
                for value in values {
                    if let Expr::Subquery { query, .. } = value {
                        resolved.extend(
                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
                                .into_iter()
                                .map(Expr::lit),
                        );
                    } else {
                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
                    }
                }
                Ok(Expr::InList {
                    target,
                    values: resolved,
                    negated,
                    span,
                })
            }
            Expr::Between {
                target,
                low,
                high,
                negated,
                span,
            } => Ok(Expr::Between {
                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
                negated,
                span,
            }),
            // Literals, column refs and any other leaf variants need no
            // rewriting.
            other => Ok(other),
        }
    }
6563
6564    fn execute_expr_subquery_values(
6565        &self,
6566        subquery: crate::storage::query::ast::ExprSubquery,
6567        outer_scopes: &[String],
6568        frame: &dyn super::statement_frame::ReadFrame,
6569    ) -> RedDBResult<Vec<Value>> {
6570        let query = *subquery.query;
6571        if query_references_outer_scope(&query, outer_scopes) {
6572            return Err(RedDBError::Query(
6573                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
6574            ));
6575        }
6576        let query = self.rewrite_view_refs(query);
6577        let query = self.resolve_select_expr_subqueries(query, frame)?;
6578        let query = self.authorize_relational_select_expr(query, frame)?;
6579        let result = match query {
6580            QueryExpr::Table(table) => {
6581                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
6582            }
6583            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
6584            other => {
6585                return Err(RedDBError::Query(format!(
6586                    "expression subquery must be a SELECT query, got {}",
6587                    query_expr_name(&other)
6588                )))
6589            }
6590        };
6591        first_column_values(result)
6592    }
6593
    /// Route a parsed `QueryExpr` to its executor. Single-table and join
    /// SELECTs run through subquery resolution, virtual-table (`red_*`)
    /// handling and the RLS/column-authz gates; writes are rejected on
    /// virtual tables and otherwise deferred-WAL-wrapped; graph queries
    /// and unsupported shapes error out.
    fn dispatch_expr(
        &self,
        expr: QueryExpr,
        query_str: &str,
        mode: QueryMode,
    ) -> RedDBResult<RuntimeQueryResult> {
        let statement = query_expr_name(&expr);
        match expr {
            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
                // Graph queries are not cacheable as prepared statements.
                Err(RedDBError::Query(
                    "graph queries cannot be used as prepared statements".to_string(),
                ))
            }
            QueryExpr::Table(table) => {
                let scope = self.ai_scope();
                // Fold scalar/IN subqueries into literals first so the
                // virtual-table and RLS paths see a plain table query.
                let table = self.resolve_table_expr_subqueries(
                    table,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?;
                // Virtual `red_*` tables have their own read-only engine.
                if super::red_schema::is_virtual_table(&table.table) {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-red-schema",
                        result: super::red_schema::red_query(
                            self,
                            &table.table,
                            &table,
                            &scope as &dyn super::statement_frame::ReadFrame,
                        )?,
                        affected_rows: 0,
                        statement_type: "select",
                    });
                }
                // `None` from the authz gate means RLS denied the whole
                // table: answer with an empty result, not an error.
                let Some(table_with_rls) = self.authorize_relational_table_select(
                    table,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?
                else {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-table-rls",
                        result: crate::storage::query::unified::UnifiedResult::empty(),
                        affected_rows: 0,
                        statement_type: "select",
                    });
                };
                Ok(RuntimeQueryResult {
                    query: query_str.to_string(),
                    mode,
                    statement,
                    engine: "runtime-table",
                    result: execute_runtime_table_query(
                        &self.inner.db,
                        &table_with_rls,
                        Some(&self.inner.index_store),
                    )?,
                    affected_rows: 0,
                    statement_type: "select",
                })
            }
            QueryExpr::Join(join) => {
                let scope = self.ai_scope();
                // Same RLS-denial-means-empty-result convention as tables.
                let Some(join_with_rls) = self.authorize_relational_join_select(
                    join,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?
                else {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-join-rls",
                        result: crate::storage::query::unified::UnifiedResult::empty(),
                        affected_rows: 0,
                        statement_type: "select",
                    });
                };
                Ok(RuntimeQueryResult {
                    query: query_str.to_string(),
                    mode,
                    statement,
                    engine: "runtime-join",
                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
                    affected_rows: 0,
                    statement_type: "select",
                })
            }
            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
                query: query_str.to_string(),
                mode,
                statement,
                engine: "runtime-vector",
                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
                affected_rows: 0,
                statement_type: "select",
            }),
            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
                query: query_str.to_string(),
                mode,
                statement,
                engine: "runtime-hybrid",
                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
                affected_rows: 0,
                statement_type: "select",
            }),
            // Writes against virtual `red_*` tables are always rejected;
            // these guards must precede the generic write arms below.
            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
                Err(RedDBError::Query(
                    super::red_schema::READ_ONLY_ERROR.to_string(),
                ))
            }
            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
                Err(RedDBError::Query(
                    super::red_schema::READ_ONLY_ERROR.to_string(),
                ))
            }
            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
                Err(RedDBError::Query(
                    super::red_schema::READ_ONLY_ERROR.to_string(),
                ))
            }
            // Writes defer store-WAL flushes while a transaction is open.
            QueryExpr::Insert(ref insert) => self
                .with_deferred_store_wal_if_transaction(|| self.execute_insert(query_str, insert)),
            QueryExpr::Update(ref update) => self
                .with_deferred_store_wal_if_transaction(|| self.execute_update(query_str, update)),
            QueryExpr::Delete(ref delete) => self
                .with_deferred_store_wal_if_transaction(|| self.execute_delete(query_str, delete)),
            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
            _ => Err(RedDBError::Query(format!(
                "prepared-statement execution does not support {statement} statements"
            ))),
        }
    }
6732
6733    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
6734    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
6735    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
6736        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
6737        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
6738        let q = query.trim();
6739        if !q.starts_with("SELECT") && !q.starts_with("select") {
6740            return None;
6741        }
6742
6743        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
6744        let where_pos = q
6745            .find("WHERE _entity_id")
6746            .or_else(|| q.find("where _entity_id"))?;
6747        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
6748        let after_eq = after_field.strip_prefix('=')?.trim_start();
6749
6750        // Parse the entity ID number
6751        let id_str = after_eq.trim();
6752        let entity_id: u64 = id_str.parse().ok()?;
6753
6754        // Extract table name: between "FROM " and " WHERE"
6755        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
6756        let table = q[from_pos..where_pos].trim();
6757        if table.is_empty()
6758            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
6759        {
6760            return None; // complex query, fall through
6761        }
6762        let table_name = table.split_whitespace().next()?;
6763
6764        // Direct entity lookup — skips SQL parse, plan cache, result
6765        // cache, view rewriter, RLS gate. Safe because the gating in
6766        // `execute_query` guarantees no scope override / no
6767        // transaction context is active. MVCC visibility is still
6768        // honoured against the current snapshot.
6769        let store = self.inner.db.store();
6770        let entity = store
6771            .get(
6772                table_name,
6773                crate::storage::unified::EntityId::new(entity_id),
6774            )
6775            .filter(entity_visible_under_current_snapshot);
6776
6777        let count = if entity.is_some() { 1u64 } else { 0 };
6778
6779        // Materialize a record so downstream consumers that walk
6780        // `result.records` (embedded runtime API, decrypt pass, CLI)
6781        // see the row. Previously only `pre_serialized_json` was
6782        // filled, which caused those consumers to see zero rows and
6783        // skewed benchmarks.
6784        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
6785            .as_ref()
6786            .and_then(|e| runtime_table_record_from_entity(e.clone()))
6787            .into_iter()
6788            .collect();
6789
6790        let json = match entity {
6791            Some(ref e) => execute_runtime_serialize_single_entity(e),
6792            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
6793                .to_string(),
6794        };
6795
6796        Some(Ok(RuntimeQueryResult {
6797            query: query.to_string(),
6798            mode: crate::storage::query::modes::QueryMode::Sql,
6799            statement: "select",
6800            engine: "fast-entity-lookup",
6801            result: crate::storage::query::unified::UnifiedResult {
6802                columns: Vec::new(),
6803                records,
6804                stats: crate::storage::query::unified::QueryStats {
6805                    rows_scanned: count,
6806                    ..Default::default()
6807                },
6808                pre_serialized_json: Some(json),
6809            },
6810            affected_rows: 0,
6811            statement_type: "select",
6812        }))
6813    }
6814
6815    fn result_cache_backend(&self) -> RuntimeResultCacheBackend {
6816        match self
6817            .config_string(RESULT_CACHE_BACKEND_KEY, RESULT_CACHE_DEFAULT_BACKEND)
6818            .as_str()
6819        {
6820            "blob_cache" => RuntimeResultCacheBackend::BlobCache,
6821            "shadow" => RuntimeResultCacheBackend::Shadow,
6822            _ => RuntimeResultCacheBackend::Legacy,
6823        }
6824    }
6825
6826    pub(super) fn get_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
6827        match self.result_cache_backend() {
6828            RuntimeResultCacheBackend::Legacy => self.get_legacy_result_cache_entry(key),
6829            RuntimeResultCacheBackend::BlobCache => self.get_blob_result_cache_entry(key),
6830            RuntimeResultCacheBackend::Shadow => {
6831                let legacy = self.get_legacy_result_cache_entry(key);
6832                let blob = self.get_blob_result_cache_entry(key);
6833                if let (Some(ref legacy), Some(ref blob)) = (&legacy, &blob) {
6834                    if result_cache_fingerprint(legacy) != result_cache_fingerprint(blob) {
6835                        self.inner
6836                            .result_cache_shadow_divergences
6837                            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
6838                        tracing::warn!(
6839                            key,
6840                            metric = crate::runtime::METRIC_CACHE_SHADOW_DIVERGENCE_TOTAL,
6841                            "result cache shadow backend diverged from legacy"
6842                        );
6843                    }
6844                }
6845                legacy
6846            }
6847        }
6848    }
6849
6850    fn get_legacy_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
6851        let cache = self.inner.result_cache.read();
6852        cache.0.get(key).and_then(|entry| {
6853            if entry.cached_at.elapsed().as_secs() < RESULT_CACHE_TTL_SECS {
6854                Some(entry.result.clone())
6855            } else {
6856                None
6857            }
6858        })
6859    }
6860
    /// Look up `key` in the blob-backed result cache.
    ///
    /// The blob cache is the authority on presence: a miss there is a miss
    /// overall, even if a decoded copy still sits in `result_blob_entries`.
    /// On a hit, the in-memory decoded entry is preferred; otherwise the
    /// blob payload is decoded and memoized for the next read.
    fn get_blob_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
        let hit = self
            .inner
            .result_blob_cache
            .get(RESULT_CACHE_BLOB_NAMESPACE, key)?;
        // Fast path: payload already decoded on a previous read. Scoped
        // block so the read lock is released before taking the write lock.
        {
            let cache = self.inner.result_blob_entries.read();
            if let Some(entry) = cache.0.get(key) {
                return Some(entry.result.clone());
            }
        }

        // Decode failure is a miss — e.g. a fingerprint-only payload
        // written by the encode fallback in `put_blob_result_cache_entry`.
        let (result, scopes) = decode_result_cache_payload(hit.value())?;
        let mut cache = self.inner.result_blob_entries.write();
        let (ref mut map, ref mut order) = *cache;
        // Keep the FIFO order queue holding each key at most once;
        // `trim_result_cache` enforces the size bound afterwards.
        if !map.contains_key(key) {
            order.push_back(key.to_string());
        }
        map.insert(
            key.to_string(),
            RuntimeResultCacheEntry {
                result: result.clone(),
                cached_at: std::time::Instant::now(),
                scopes,
            },
        );
        trim_result_cache(map, order);
        Some(result)
    }
6890
6891    pub(super) fn put_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
6892        match self.result_cache_backend() {
6893            RuntimeResultCacheBackend::Legacy => self.put_legacy_result_cache_entry(key, entry),
6894            RuntimeResultCacheBackend::BlobCache => self.put_blob_result_cache_entry(key, entry),
6895            RuntimeResultCacheBackend::Shadow => {
6896                self.put_legacy_result_cache_entry(key, entry.clone());
6897                self.put_blob_result_cache_entry(key, entry);
6898            }
6899        }
6900    }
6901
6902    fn put_legacy_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
6903        let mut cache = self.inner.result_cache.write();
6904        let (ref mut map, ref mut order) = *cache;
6905        if !map.contains_key(key) {
6906            order.push_back(key.to_string());
6907        }
6908        map.insert(key.to_string(), entry);
6909        trim_result_cache(map, order);
6910    }
6911
6912    fn put_blob_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
6913        let policy = crate::storage::cache::BlobCachePolicy::default()
6914            .ttl_ms(RESULT_CACHE_TTL_SECS * 1000)
6915            .priority(200);
6916        let dependencies = entry.scopes.iter().cloned().collect::<Vec<_>>();
6917        let bytes = encode_result_cache_payload(&entry)
6918            .unwrap_or_else(|| result_cache_fingerprint(&entry.result).into_bytes());
6919        let put = crate::storage::cache::BlobCachePut::new(bytes)
6920            .with_dependencies(dependencies)
6921            .with_policy(policy);
6922        if self
6923            .inner
6924            .result_blob_cache
6925            .put(RESULT_CACHE_BLOB_NAMESPACE, key, put)
6926            .is_err()
6927        {
6928            return;
6929        }
6930
6931        let mut cache = self.inner.result_blob_entries.write();
6932        let (ref mut map, ref mut order) = *cache;
6933        if !map.contains_key(key) {
6934            order.push_back(key.to_string());
6935        }
6936        map.insert(key.to_string(), entry);
6937        trim_result_cache(map, order);
6938    }
6939
6940    pub fn result_cache_shadow_divergences(&self) -> u64 {
6941        self.inner
6942            .result_cache_shadow_divergences
6943            .load(std::sync::atomic::Ordering::Relaxed)
6944    }
6945
6946    /// Invalidate the result cache (call after any write operation).
6947    /// Full clear — use for DDL (DROP TABLE, schema changes) or when table is unknown.
6948    pub fn invalidate_result_cache(&self) {
6949        let mut cache = self.inner.result_cache.write();
6950        cache.0.clear();
6951        cache.1.clear();
6952        let mut blob_entries = self.inner.result_blob_entries.write();
6953        blob_entries.0.clear();
6954        blob_entries.1.clear();
6955        self.inner
6956            .result_blob_cache
6957            .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
6958        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
6959        ask_entries.0.clear();
6960        ask_entries.1.clear();
6961        self.inner
6962            .result_blob_cache
6963            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
6964    }
6965
    /// Invalidate only result cache entries that declared a dependency on `table`.
    /// Cheaper than a full clear: unrelated tables keep their cached results.
    ///
    /// Caveat: the scope-targeted retention applies to the legacy map (and
    /// to the blob mirror only in Legacy mode). When the blob backend is
    /// active (BlobCache/Shadow), the whole blob namespace is cleared —
    /// there is no per-table invalidation on that side here. The ASK answer
    /// cache is always fully cleared.
    pub(crate) fn invalidate_result_cache_for_table(&self, table: &str) {
        // Hot-path probe both backends before taking write locks. The blob
        // backend is node-local, same as the legacy result cache.
        let legacy_has_match = {
            let cache = self.inner.result_cache.read();
            let (ref map, _) = *cache;
            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
        };
        let blob_has_match = {
            let cache = self.inner.result_blob_entries.read();
            let (ref map, _) = *cache;
            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
        };
        if legacy_has_match {
            let mut cache = self.inner.result_cache.write();
            let (ref mut map, ref mut order) = *cache;
            // Drop matching entries, then prune their keys from the FIFO queue.
            map.retain(|_, entry| !entry.scopes.contains(table));
            order.retain(|key| map.contains_key(key));
        }

        if matches!(
            self.result_cache_backend(),
            RuntimeResultCacheBackend::BlobCache | RuntimeResultCacheBackend::Shadow
        ) {
            // Blob backend active: full clear of mirror + namespace (see caveat above).
            let mut blob_entries = self.inner.result_blob_entries.write();
            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
            blob_map.clear();
            blob_order.clear();
            self.inner
                .result_blob_cache
                .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
        } else if blob_has_match {
            // Legacy mode with leftover blob-mirror entries: targeted prune.
            let mut blob_entries = self.inner.result_blob_entries.write();
            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
            blob_map.retain(|_, entry| !entry.scopes.contains(table));
            blob_order.retain(|key| blob_map.contains_key(key));
        }
        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
        ask_entries.0.clear();
        ask_entries.1.clear();
        self.inner
            .result_blob_cache
            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
    }
7012
7013    pub(crate) fn invalidate_plan_cache(&self) {
7014        self.inner.query_cache.write().clear();
7015        self.inner
7016            .ddl_epoch
7017            .fetch_add(1, std::sync::atomic::Ordering::Release);
7018    }
7019
7020    /// Read the monotonic DDL epoch counter. Bumped by every
7021    /// `invalidate_plan_cache` call so prepared-statement holders can
7022    /// detect schema drift between PREPARE and EXECUTE.
7023    pub fn ddl_epoch(&self) -> u64 {
7024        self.inner
7025            .ddl_epoch
7026            .load(std::sync::atomic::Ordering::Acquire)
7027    }
7028
7029    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
7030        let store = self.inner.db.store();
7031        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7032        self.invalidate_plan_cache();
7033    }
7034
7035    /// Replay `tenant_tables.*.column` keys from red_config at boot so
7036    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
7037    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
7038    /// collection, picks the keys matching the tenant-marker shape,
7039    /// and calls `register_tenant_table` for each.
7040    ///
7041    /// Safe no-op when `red_config` doesn't exist (first boot on a
7042    /// fresh datadir).
7043    pub(crate) fn rehydrate_tenant_tables(&self) {
7044        let store = self.inner.db.store();
7045        let Some(manager) = store.get_collection("red_config") else {
7046            return;
7047        };
7048        // Replay in insertion order (SegmentManager iteration). Multiple
7049        // toggles on the same table leave several rows behind — the
7050        // last one processed wins because each register/unregister
7051        // call overwrites the in-memory state.
7052        for entity in manager.query_all(|_| true) {
7053            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
7054                continue;
7055            };
7056            let Some(named) = &row.named else { continue };
7057            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
7058                continue;
7059            };
7060            // Shape: tenant_tables.{table}.column
7061            let Some(rest) = key.strip_prefix("tenant_tables.") else {
7062                continue;
7063            };
7064            let Some((table, suffix)) = rest.rsplit_once('.') else {
7065                // Issue #205 — a `tenant_tables.*` row that doesn't
7066                // split cleanly is a schema-shape regression: the
7067                // metadata writer must always emit the `.column`
7068                // suffix, so reaching this branch means an upgrade
7069                // with incompatible state or external tampering.
7070                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7071                    collection: "red_config".to_string(),
7072                    detail: format!("malformed tenant_tables key: {key}"),
7073                }
7074                .emit_global();
7075                continue;
7076            };
7077            if suffix != "column" {
7078                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7079                    collection: "red_config".to_string(),
7080                    detail: format!("unexpected tenant_tables suffix: {key}"),
7081                }
7082                .emit_global();
7083                continue;
7084            }
7085            match named.get("value") {
7086                Some(crate::storage::schema::Value::Text(column)) => {
7087                    self.register_tenant_table(table, column);
7088                }
7089                // Null / missing value = DISABLE TENANCY marker.
7090                Some(crate::storage::schema::Value::Null) | None => {
7091                    self.unregister_tenant_table(table);
7092                }
7093                _ => {}
7094            }
7095        }
7096    }
7097
7098    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
7099    /// in-memory column mapping, the implicit RLS policy, and enables
7100    /// row-level security on the table. Idempotent — re-registering
7101    /// the same `(table, column)` replaces the prior auto-policy.
7102    pub fn register_tenant_table(&self, table: &str, column: &str) {
7103        use crate::storage::query::ast::{
7104            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
7105        };
7106        self.inner
7107            .tenant_tables
7108            .write()
7109            .insert(table.to_string(), column.to_string());
7110
7111        // Build the policy: col = CURRENT_TENANT()
7112        // Uses CompareExpr so the comparison happens at runtime against
7113        // the thread-local tenant value read by the CURRENT_TENANT
7114        // scalar. Spans are synthetic — there's no source location for
7115        // an auto-generated policy.
7116        let lhs = Expr::Column {
7117            field: FieldRef::TableColumn {
7118                table: table.to_string(),
7119                column: column.to_string(),
7120            },
7121            span: Span::synthetic(),
7122        };
7123        let rhs = Expr::FunctionCall {
7124            name: "CURRENT_TENANT".to_string(),
7125            args: Vec::new(),
7126            span: Span::synthetic(),
7127        };
7128        let policy_filter = Filter::CompareExpr {
7129            lhs,
7130            op: CompareOp::Eq,
7131            rhs,
7132        };
7133
7134        let policy = CreatePolicyQuery {
7135            name: "__tenant_iso".to_string(),
7136            table: table.to_string(),
7137            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
7138            role: None,   // None = every role
7139            using: Box::new(policy_filter),
7140            // Auto-tenancy defaults to Table targets. Collections of
7141            // other kinds (graph / vector / queue / timeseries) that
7142            // opt in via `ALTER ... ENABLE TENANCY` should use the
7143            // matching kind — but for now we keep the auto-policy
7144            // kind-agnostic so the evaluator can apply it to any
7145            // entity living in the collection.
7146            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
7147        };
7148
7149        // Replace any prior auto-policy for this table (column rename).
7150        self.inner.rls_policies.write().insert(
7151            (table.to_string(), "__tenant_iso".to_string()),
7152            Arc::new(policy),
7153        );
7154        self.inner
7155            .rls_enabled_tables
7156            .write()
7157            .insert(table.to_string());
7158
7159        // Auto-build a hash index on the tenant column. Every read/write
7160        // against a tenant-scoped table carries an implicit
7161        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
7162        // index on that column is on the hot path of every query. Without
7163        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
7164        self.ensure_tenant_index(table, column);
7165    }
7166
    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
    /// Skipped when:
    ///   * the column is dotted (nested path — flat secondary indices
    ///     don't cover those today; RLS still works via the policy)
    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
    ///   * the user already registered an index whose first column matches
    ///     (avoids redundant duplicates of a user-defined composite)
    fn ensure_tenant_index(&self, table: &str, column: &str) {
        if column.contains('.') {
            return;
        }
        let index_name = format!("__tenant_idx_{table}");
        let registry = self.inner.index_store.list_indices(table);
        if registry.iter().any(|idx| idx.name == index_name) {
            return;
        }
        // A user index whose leading column matches already serves the
        // tenant predicate — don't shadow it.
        if registry
            .iter()
            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
        {
            return;
        }

        // Backfill: snapshot every existing entity's (id, fields) pairs so
        // `create_index` can build the index over current data.
        let store = self.inner.db.store();
        let Some(manager) = store.get_collection(table) else {
            return;
        };
        let entities = manager.query_all(|_| true);
        let entity_fields: Vec<(
            crate::storage::unified::EntityId,
            Vec<(String, crate::storage::schema::Value)>,
        )> = entities
            .iter()
            .map(|e| {
                // Field extraction per entity shape: named rows win;
                // schema-ordered rows zip column names to values; graph
                // nodes expose their properties; other kinds index nothing.
                let fields = match &e.data {
                    crate::storage::EntityData::Row(row) => {
                        if let Some(ref named) = row.named {
                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
                        } else if let Some(ref schema) = row.schema {
                            schema
                                .iter()
                                .zip(row.columns.iter())
                                .map(|(k, v)| (k.clone(), v.clone()))
                                .collect()
                        } else {
                            Vec::new()
                        }
                    }
                    crate::storage::EntityData::Node(node) => node
                        .properties
                        .iter()
                        .map(|(k, v)| (k.clone(), v.clone()))
                        .collect(),
                    _ => Vec::new(),
                };
                (e.id, fields)
            })
            .collect();

        let columns = vec![column.to_string()];
        // Build first; only register the index (and invalidate plans) if
        // the build succeeded, so the registry never names a missing index.
        if self
            .inner
            .index_store
            .create_index(
                &index_name,
                table,
                &columns,
                super::index_store::IndexMethodKind::Hash,
                false,
                &entity_fields,
            )
            .is_err()
        {
            return;
        }
        self.inner
            .index_store
            .register(super::index_store::RegisteredIndex {
                name: index_name,
                collection: table.to_string(),
                columns,
                method: super::index_store::IndexMethodKind::Hash,
                unique: false,
            });
        self.invalidate_plan_cache();
    }
7253
7254    /// Drop the auto-generated tenant index, if one exists. Called from
7255    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
7256    fn drop_tenant_index(&self, table: &str) {
7257        let index_name = format!("__tenant_idx_{table}");
7258        self.inner.index_store.drop_index(&index_name, table);
7259    }
7260
7261    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
7262    /// Used by the INSERT auto-fill path to know which column to
7263    /// populate with `current_tenant()` when the user didn't name it.
7264    pub fn tenant_column(&self, table: &str) -> Option<String> {
7265        self.inner.tenant_tables.read().get(table).cloned()
7266    }
7267
7268    /// Remove a table's tenant registration (Phase 2.5.4). Called by
7269    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
7270    /// but leaves any user-installed explicit policies intact.
7271    pub fn unregister_tenant_table(&self, table: &str) {
7272        self.inner.tenant_tables.write().remove(table);
7273        self.inner
7274            .rls_policies
7275            .write()
7276            .remove(&(table.to_string(), "__tenant_iso".to_string()));
7277        self.drop_tenant_index(table);
7278        // Only clear RLS enablement if no other policies remain.
7279        let has_other_policies = self
7280            .inner
7281            .rls_policies
7282            .read()
7283            .keys()
7284            .any(|(t, _)| t == table);
7285        if !has_other_policies {
7286            self.inner.rls_enabled_tables.write().remove(table);
7287        }
7288    }
7289
7290    /// Record that the running transaction has marked `id` in `collection`
7291    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
7292    /// xid that was written into `xmax` — either the parent txn xid or
7293    /// the innermost savepoint sub-xid. Savepoint rollback filters by
7294    /// this xid to revive only its own tombstones.
7295    pub(crate) fn record_pending_tombstone(
7296        &self,
7297        conn_id: u64,
7298        collection: &str,
7299        id: crate::storage::unified::entity::EntityId,
7300        stamper_xid: crate::storage::transaction::snapshot::Xid,
7301        previous_xmax: crate::storage::transaction::snapshot::Xid,
7302    ) {
7303        self.inner
7304            .pending_tombstones
7305            .write()
7306            .entry(conn_id)
7307            .or_default()
7308            .push((collection.to_string(), id, stamper_xid, previous_xmax));
7309    }
7310
7311    pub(crate) fn record_pending_versioned_update(
7312        &self,
7313        conn_id: u64,
7314        collection: &str,
7315        old_id: crate::storage::unified::entity::EntityId,
7316        new_id: crate::storage::unified::entity::EntityId,
7317        stamper_xid: crate::storage::transaction::snapshot::Xid,
7318        previous_xmax: crate::storage::transaction::snapshot::Xid,
7319    ) {
7320        self.inner
7321            .pending_versioned_updates
7322            .write()
7323            .entry(conn_id)
7324            .or_default()
7325            .push((
7326                collection.to_string(),
7327                old_id,
7328                new_id,
7329                stamper_xid,
7330                previous_xmax,
7331            ));
7332    }
7333
7334    fn with_deferred_store_wal_if_transaction<T>(
7335        &self,
7336        f: impl FnOnce() -> RedDBResult<T>,
7337    ) -> RedDBResult<T> {
7338        let conn_id = current_connection_id();
7339        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
7340            return f();
7341        }
7342
7343        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
7344        let result = f();
7345        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
7346        match result {
7347            Ok(value) => {
7348                self.record_pending_store_wal_actions(conn_id, captured);
7349                Ok(value)
7350            }
7351            Err(err) => Err(err),
7352        }
7353    }
7354
7355    fn record_pending_store_wal_actions(
7356        &self,
7357        conn_id: u64,
7358        actions: crate::storage::unified::DeferredStoreWalActions,
7359    ) {
7360        if actions.is_empty() {
7361            return;
7362        }
7363        let mut guard = self.inner.pending_store_wal_actions.write();
7364        guard.entry(conn_id).or_default().extend(actions);
7365    }
7366
7367    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
7368        let Some(actions) = self
7369            .inner
7370            .pending_store_wal_actions
7371            .write()
7372            .remove(&conn_id)
7373        else {
7374            return Ok(());
7375        };
7376        self.inner
7377            .db
7378            .store()
7379            .append_deferred_store_wal_actions(actions)
7380            .map_err(|err| RedDBError::Internal(err.to_string()))
7381    }
7382
7383    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
7384        self.inner
7385            .pending_store_wal_actions
7386            .write()
7387            .remove(&conn_id);
7388    }
7389
7390    fn xid_conflicts_with_snapshot(
7391        &self,
7392        xid: crate::storage::transaction::snapshot::Xid,
7393        snapshot: &crate::storage::transaction::snapshot::Snapshot,
7394        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
7395    ) -> bool {
7396        xid != 0
7397            && !own_xids.contains(&xid)
7398            && !self.inner.snapshot_manager.is_aborted(xid)
7399            && !self.inner.snapshot_manager.is_active(xid)
7400            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
7401    }
7402
7403    fn conflict_error(
7404        collection: &str,
7405        logical_id: crate::storage::unified::entity::EntityId,
7406        xid: crate::storage::transaction::snapshot::Xid,
7407    ) -> RedDBError {
7408        RedDBError::Query(format!(
7409            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
7410            logical_id.raw()
7411        ))
7412    }
7413
7414    fn check_logical_row_conflict(
7415        &self,
7416        collection: &str,
7417        logical_id: crate::storage::unified::entity::EntityId,
7418        excluded_ids: &[crate::storage::unified::entity::EntityId],
7419        snapshot: &crate::storage::transaction::snapshot::Snapshot,
7420        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
7421    ) -> RedDBResult<()> {
7422        let store = self.inner.db.store();
7423        let Some(manager) = store.get_collection(collection) else {
7424            return Ok(());
7425        };
7426
7427        for candidate in manager.query_all(|_| true) {
7428            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
7429                continue;
7430            }
7431            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
7432                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
7433            }
7434            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
7435                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
7436            }
7437        }
7438        Ok(())
7439    }
7440
    /// Validate the connection's pending table-row writes (versioned
    /// updates + tombstones) against `snapshot` before commit. Errors with
    /// a serialization conflict if any touched row carries an xmin/xmax
    /// stamp from a transaction invisible to the snapshot and not in
    /// `own_xids` (see `xid_conflicts_with_snapshot`).
    pub(crate) fn check_table_row_write_conflicts(
        &self,
        conn_id: u64,
        snapshot: &crate::storage::transaction::snapshot::Snapshot,
        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
    ) -> RedDBResult<()> {
        // Clone the pending sets out so the read locks are released before
        // the store scans below.
        let versioned_updates = self
            .inner
            .pending_versioned_updates
            .read()
            .get(&conn_id)
            .cloned()
            .unwrap_or_default();
        let tombstones = self
            .inner
            .pending_tombstones
            .read()
            .get(&conn_id)
            .cloned()
            .unwrap_or_default();

        let store = self.inner.db.store();
        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
            // Collection or old version gone — nothing left to validate here.
            let Some(manager) = store.get_collection(&collection) else {
                continue;
            };
            let Some(old) = manager.get(old_id) else {
                continue;
            };
            let logical_id = old.logical_id();
            // The xmax we overwrote must have been visible to our snapshot.
            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
            }
            // If someone re-stamped the old version after us, that stamp
            // must be visible too (our own stamp `xid` is exempt).
            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
            }
            // Also sweep sibling versions of the logical row, excluding
            // our own old/new versions.
            self.check_logical_row_conflict(
                &collection,
                logical_id,
                &[old_id, new_id],
                snapshot,
                own_xids,
            )?;
        }

        // Same validation for deletes: previous xmax, any re-stamp by a
        // foreign xid, and sibling versions of the logical row.
        for (collection, id, xid, previous_xmax) in tombstones {
            let Some(manager) = store.get_collection(&collection) else {
                continue;
            };
            let Some(entity) = manager.get(id) else {
                continue;
            };
            let logical_id = entity.logical_id();
            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
            }
            if entity.xmax != xid
                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
            {
                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
            }
            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
        }

        Ok(())
    }
7507
7508    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
7509        let versioned_updates = self
7510            .inner
7511            .pending_versioned_updates
7512            .read()
7513            .get(&conn_id)
7514            .cloned()
7515            .unwrap_or_default();
7516        let tombstones = self
7517            .inner
7518            .pending_tombstones
7519            .read()
7520            .get(&conn_id)
7521            .cloned()
7522            .unwrap_or_default();
7523
7524        let store = self.inner.db.store();
7525        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
7526            if let Some(manager) = store.get_collection(&collection) {
7527                if let Some(mut entity) = manager.get(old_id) {
7528                    entity.set_xmax(xid);
7529                    let _ = manager.update(entity);
7530                }
7531            }
7532        }
7533        for (collection, id, xid, _previous_xmax) in tombstones {
7534            if let Some(manager) = store.get_collection(&collection) {
7535                if let Some(mut entity) = manager.get(id) {
7536                    entity.set_xmax(xid);
7537                    let _ = manager.update(entity);
7538                }
7539            }
7540        }
7541    }
7542
7543    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
7544        self.inner
7545            .pending_versioned_updates
7546            .write()
7547            .remove(&conn_id);
7548    }
7549
7550    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
7551        let Some(pending) = self
7552            .inner
7553            .pending_versioned_updates
7554            .write()
7555            .remove(&conn_id)
7556        else {
7557            return;
7558        };
7559
7560        let store = self.inner.db.store();
7561        for (collection, old_id, new_id, xid, previous_xmax) in pending {
7562            if let Some(manager) = store.get_collection(&collection) {
7563                if let Some(mut old) = manager.get(old_id) {
7564                    if old.xmax == xid {
7565                        old.set_xmax(previous_xmax);
7566                        let _ = manager.update(old);
7567                    }
7568                }
7569            }
7570            let _ = store.delete_batch(&collection, &[new_id]);
7571        }
7572    }
7573
7574    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
7575        let mut guard = self.inner.pending_versioned_updates.write();
7576        let Some(pending) = guard.get_mut(&conn_id) else {
7577            return 0;
7578        };
7579
7580        let store = self.inner.db.store();
7581        let mut reverted = 0usize;
7582        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
7583            if *xid < stamper_xid {
7584                return true;
7585            }
7586            if let Some(manager) = store.get_collection(collection) {
7587                if let Some(mut old) = manager.get(*old_id) {
7588                    if old.xmax == *xid {
7589                        old.set_xmax(*previous_xmax);
7590                        let _ = manager.update(old);
7591                    }
7592                }
7593            }
7594            let _ = store.delete_batch(collection, &[*new_id]);
7595            reverted += 1;
7596            false
7597        });
7598        if pending.is_empty() {
7599            guard.remove(&conn_id);
7600        }
7601        reverted
7602    }
7603
7604    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
7605    /// delete marker; commit only drops the rollback journal and emits
7606    /// side effects. Physical reclamation is left for VACUUM so old
7607    /// snapshots can still resolve the pre-delete row version.
7608    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
7609        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
7610            return;
7611        };
7612        if pending.is_empty() {
7613            return;
7614        }
7615
7616        let store = self.inner.db.store();
7617        for (collection, id, _xid, _previous_xmax) in pending {
7618            store.context_index().remove_entity(id);
7619            self.cdc_emit(
7620                crate::replication::cdc::ChangeOperation::Delete,
7621                &collection,
7622                id.raw(),
7623                "entity",
7624            );
7625        }
7626    }
7627
7628    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
7629    /// become visible again to future snapshots. Best-effort: a row
7630    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
7631    /// never reclaims tuples whose xmax is still referenced by any
7632    /// active snapshot, so this case is only reachable via external
7633    /// storage corruption.
7634    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
7635        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
7636            return;
7637        };
7638
7639        let store = self.inner.db.store();
7640        for (collection, id, xid, previous_xmax) in pending {
7641            let Some(manager) = store.get_collection(&collection) else {
7642                continue;
7643            };
7644            if let Some(mut entity) = manager.get(id) {
7645                if entity.xmax == xid {
7646                    entity.set_xmax(previous_xmax);
7647                    let _ = manager.update(entity);
7648                }
7649            }
7650        }
7651    }
7652
7653    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
7654        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
7655            return;
7656        };
7657        for event in pending {
7658            self.cdc_emit_kv(
7659                event.op,
7660                &event.collection,
7661                &event.key,
7662                0,
7663                event.before,
7664                event.after,
7665            );
7666        }
7667    }
7668
7669    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
7670        self.inner.pending_kv_watch_events.write().remove(&conn_id);
7671    }
7672
7673    /// Materialise the entire graph store while applying MVCC visibility
7674    /// AND per-collection RLS to each candidate node and edge. Mirrors
7675    /// `materialize_graph` but routes every entity through the same
7676    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
7677    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
7678    /// edges). Returns the filtered `GraphStore` plus the
7679    /// `node_id → properties` map the executor needs for `RETURN n.*`
7680    /// projections.
7681    fn materialize_graph_with_rls(
7682        &self,
7683    ) -> RedDBResult<(
7684        crate::storage::engine::GraphStore,
7685        std::collections::HashMap<
7686            String,
7687            std::collections::HashMap<String, crate::storage::schema::Value>,
7688        >,
7689    )> {
7690        use crate::storage::engine::GraphStore;
7691        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
7692        use crate::storage::unified::entity::{EntityData, EntityKind};
7693        use std::collections::{HashMap, HashSet};
7694
7695        let store = self.inner.db.store();
7696        let snap_ctx = capture_current_snapshot();
7697        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
7698
7699        let graph = GraphStore::new();
7700        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
7701            HashMap::new();
7702        let mut allowed_nodes: HashSet<String> = HashSet::new();
7703
7704        // Per-collection cached compiled filters — Nodes-kind for
7705        // first pass, Edges-kind for the second. None entries mean
7706        // "RLS enabled, zero matching policy → deny all of this kind".
7707        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
7708            HashMap::new();
7709        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
7710            HashMap::new();
7711
7712        let collections = store.list_collections();
7713
7714        // First pass — gather nodes.
7715        for collection in &collections {
7716            let Some(manager) = store.get_collection(collection) else {
7717                continue;
7718            };
7719            let entities = manager.query_all(|_| true);
7720            for entity in entities {
7721                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
7722                    continue;
7723                }
7724                let EntityKind::GraphNode(ref node) = entity.kind else {
7725                    continue;
7726                };
7727                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
7728                    continue;
7729                }
7730                let id_str = entity.id.raw().to_string();
7731                graph
7732                    .add_node_with_label(
7733                        &id_str,
7734                        &node.label,
7735                        &super::graph_node_label(&node.node_type),
7736                    )
7737                    .map_err(|err| RedDBError::Query(err.to_string()))?;
7738                allowed_nodes.insert(id_str.clone());
7739                if let EntityData::Node(node_data) = &entity.data {
7740                    node_properties.insert(id_str, node_data.properties.clone());
7741                }
7742            }
7743        }
7744
7745        // Second pass — gather edges. An edge appears only when both
7746        // endpoint nodes survived the RLS pass AND the edge itself
7747        // passes its own RLS gate.
7748        for collection in &collections {
7749            let Some(manager) = store.get_collection(collection) else {
7750                continue;
7751            };
7752            let entities = manager.query_all(|_| true);
7753            for entity in entities {
7754                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
7755                    continue;
7756                }
7757                let EntityKind::GraphEdge(ref edge) = entity.kind else {
7758                    continue;
7759                };
7760                if !allowed_nodes.contains(&edge.from_node)
7761                    || !allowed_nodes.contains(&edge.to_node)
7762                {
7763                    continue;
7764                }
7765                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
7766                    continue;
7767                }
7768                let weight = match &entity.data {
7769                    EntityData::Edge(e) => e.weight,
7770                    _ => edge.weight as f32 / 1000.0,
7771                };
7772                graph
7773                    .add_edge_with_label(
7774                        &edge.from_node,
7775                        &edge.to_node,
7776                        &super::graph_edge_label(&edge.label),
7777                        weight,
7778                    )
7779                    .map_err(|err| RedDBError::Query(err.to_string()))?;
7780            }
7781        }
7782
7783        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
7784        // are used inside the helper closures via the per-kind helpers
7785        // declared at the bottom of this file.
7786        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
7787
7788        Ok((graph, node_properties))
7789    }
7790
7791    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
7792    /// freshly-inserted entity when the current connection holds an
7793    /// open transaction. Used by graph / vector / queue / timeseries
7794    /// write paths that go through the DevX builder API (`db.node(...)
7795    /// .save()` and friends) — those live in the storage crate and
7796    /// can't reach `current_xid()` without crossing layers, so the
7797    /// application layer calls this helper right after `save()` to
7798    /// finalise the MVCC stamp.
7799    ///
7800    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
7801    /// write, so the non-transactional hot path stays untouched.
7802    ///
7803    /// Best-effort: if the collection or entity disappears between
7804    /// the save and the stamp (concurrent DROP), we silently skip.
7805    pub(crate) fn stamp_xmin_if_in_txn(
7806        &self,
7807        collection: &str,
7808        id: crate::storage::unified::entity::EntityId,
7809    ) {
7810        let Some(xid) = self.current_xid() else {
7811            return;
7812        };
7813        let store = self.inner.db.store();
7814        let Some(manager) = store.get_collection(collection) else {
7815            return;
7816        };
7817        if let Some(mut entity) = manager.get(id) {
7818            entity.set_xmin(xid);
7819            let _ = manager.update(entity);
7820        }
7821    }
7822
7823    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
7824    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
7825    /// pending entries with `xid < stamper_xid` stay queued because
7826    /// they belong to the enclosing scope — they'll either flush on
7827    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
7828    ///
7829    /// Returns the number of tuples whose `xmax` was wiped back to 0.
7830    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
7831        let mut guard = self.inner.pending_tombstones.write();
7832        let Some(pending) = guard.get_mut(&conn_id) else {
7833            return 0;
7834        };
7835
7836        let store = self.inner.db.store();
7837        let mut revived = 0usize;
7838        pending.retain(|(collection, id, xid, previous_xmax)| {
7839            if *xid < stamper_xid {
7840                // Stamped before the savepoint — keep in queue.
7841                return true;
7842            }
7843            if let Some(manager) = store.get_collection(collection) {
7844                if let Some(mut entity) = manager.get(*id) {
7845                    if entity.xmax == *xid {
7846                        entity.set_xmax(*previous_xmax);
7847                        let _ = manager.update(entity);
7848                        revived += 1;
7849                    }
7850                }
7851            }
7852            false
7853        });
7854        if pending.is_empty() {
7855            guard.remove(&conn_id);
7856        }
7857        revived
7858    }
7859
7860    /// Return the snapshot the current connection should use for visibility
7861    /// checks (Phase 2.3 PG parity).
7862    ///
7863    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
7864    ///   the snapshot stored in its `TxnContext`.
7865    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
7866    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
7867    ///   visible so this degrades to "see everything committed".
7868    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
7869        let conn_id = current_connection_id();
7870        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
7871            return ctx.snapshot;
7872        }
7873        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
7874        // every already-committed xid (which is strictly less) passes the
7875        // `xmin <= snap.xid` gate, while concurrently-active xids land in
7876        // the `in_progress` set and stay hidden until they commit. Using
7877        // xid=0 would incorrectly hide every MVCC-stamped tuple.
7878        let high_water = self.inner.snapshot_manager.peek_next_xid();
7879        self.inner.snapshot_manager.snapshot(high_water)
7880    }
7881
7882    /// Xid of the current connection's active transaction, or `None` when
7883    /// running outside a BEGIN/COMMIT block. Write paths call this to
7884    /// decide whether to stamp `xmin`/`xmax` on tuples.
7885    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
7886    /// sub-xid so new writes can be selectively rolled back. Otherwise
7887    /// the parent txn's xid is returned, matching pre-savepoint
7888    /// behaviour. Callers that need the enclosing *transaction* xid
7889    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
7890    /// directly.
7891    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
7892        let conn_id = current_connection_id();
7893        self.inner
7894            .tx_contexts
7895            .read()
7896            .get(&conn_id)
7897            .map(|ctx| ctx.writer_xid())
7898    }
7899
7900    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
7901    /// the oldest-active xid when reclaiming dead tuples.
7902    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
7903        Arc::clone(&self.inner.snapshot_manager)
7904    }
7905
7906    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
7907        let manager = &self.inner.snapshot_manager;
7908        let next_xid = manager.peek_next_xid();
7909        let mut cutoff = next_xid;
7910        if let Some(oldest_active) = manager.oldest_active_xid() {
7911            cutoff = cutoff.min(oldest_active);
7912        }
7913        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
7914            cutoff = cutoff.min(oldest_pinned);
7915        }
7916        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
7917        if retention_xids > 0 {
7918            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
7919        }
7920        cutoff
7921    }
7922
    /// Rebuild every registered runtime index on `table` from the
    /// current set of `TableRow` entities.
    ///
    /// Each registered index is dropped, re-created from a fresh scan
    /// of the collection's table rows, then re-registered in the index
    /// catalog; the plan cache is invalidated at the end so cached
    /// plans pick up the rebuilt indexes. No-op when the table has no
    /// registered indexes or the collection no longer exists.
    ///
    /// NOTE(review): if `create_index` fails partway through, the
    /// current index has already been dropped but is neither recreated
    /// nor re-registered, and the plan cache is not invalidated before
    /// the early return — confirm callers treat this error as fatal
    /// for the table's index set.
    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
        let registered = self.inner.index_store.list_indices(table);
        if registered.is_empty() {
            return Ok(());
        }
        let store = self.inner.db.store();
        let Some(manager) = store.get_collection(table) else {
            return Ok(());
        };
        // Snapshot (entity id, indexable fields) for every table row;
        // non-TableRow entity kinds in the collection are skipped.
        let entity_fields = manager
            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
            .into_iter()
            .map(|entity| (entity.id, table_row_index_fields(&entity)))
            .collect::<Vec<_>>();

        for index in registered {
            // Drop-then-recreate so the rebuilt index reflects exactly
            // the rows scanned above.
            self.inner.index_store.drop_index(&index.name, table);
            self.inner
                .index_store
                .create_index(
                    &index.name,
                    table,
                    &index.columns,
                    index.method,
                    index.unique,
                    &entity_fields,
                )
                .map_err(RedDBError::Internal)?;
            self.inner.index_store.register(index);
        }
        self.invalidate_plan_cache();
        Ok(())
    }
7956
7957    /// Own-tx xids (parent + open/released savepoints) for the current
7958    /// connection. Transports + tests that build a `SnapshotContext`
7959    /// manually (outside the `execute_query` scope) need this set so
7960    /// the writer's own uncommitted tuples stay visible to self.
7961    pub fn current_txn_own_xids(
7962        &self,
7963    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
7964        let mut set = std::collections::HashSet::new();
7965        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
7966            set.insert(ctx.xid);
7967            for (_, sub) in &ctx.savepoints {
7968                set.insert(*sub);
7969            }
7970            for sub in &ctx.released_sub_xids {
7971                set.insert(*sub);
7972            }
7973        }
7974        set
7975    }
7976
7977    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
7978    ///
7979    /// Callers use this to check whether a table name is a registered
7980    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
7981    /// scan it (`registry.scan(name)`). The read-path rewriter consults
7982    /// this before dispatching into native-collection lookup.
7983    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
7984        Arc::clone(&self.inner.foreign_tables)
7985    }
7986
7987    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
7988    pub fn is_rls_enabled(&self, table: &str) -> bool {
7989        self.inner.rls_enabled_tables.read().contains(table)
7990    }
7991
7992    /// Collect the USING predicates that apply to this `(table, role, action)`.
7993    ///
7994    /// Returned filters should be OR-combined (a row passes RLS when *any*
7995    /// matching policy accepts it) and then AND-ed into the query's WHERE.
7996    /// When the table has RLS disabled this returns an empty Vec — callers
7997    /// can fast-path back to the unfiltered read.
7998    pub fn matching_rls_policies(
7999        &self,
8000        table: &str,
8001        role: Option<&str>,
8002        action: crate::storage::query::ast::PolicyAction,
8003    ) -> Vec<crate::storage::query::ast::Filter> {
8004        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
8005        // callers that don't name a kind only see Table-scoped
8006        // policies (which is what execute SELECT / UPDATE / DELETE
8007        // expect).
8008        self.matching_rls_policies_for_kind(
8009            table,
8010            role,
8011            action,
8012            crate::storage::query::ast::PolicyTargetKind::Table,
8013        )
8014    }
8015
8016    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
8017    ///
8018    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
8019    /// `Vectors`, queue consumers request `Messages`, and timeseries
8020    /// range scans request `Points`. Policies tagged with a
8021    /// different kind are skipped so a graph-scoped policy doesn't
8022    /// accidentally gate a table SELECT on the same collection.
8023    pub fn matching_rls_policies_for_kind(
8024        &self,
8025        table: &str,
8026        role: Option<&str>,
8027        action: crate::storage::query::ast::PolicyAction,
8028        kind: crate::storage::query::ast::PolicyTargetKind,
8029    ) -> Vec<crate::storage::query::ast::Filter> {
8030        if !self.is_rls_enabled(table) {
8031            return Vec::new();
8032        }
8033        let policies = self.inner.rls_policies.read();
8034        policies
8035            .iter()
8036            .filter_map(|((t, _), p)| {
8037                if t != table {
8038                    return None;
8039                }
8040                // Kind gate — Table policies also apply to every
8041                // other kind *iff* the policy predicate evaluates
8042                // against entity fields that exist uniformly; the
8043                // caller's kind filter is the stricter check, so
8044                // match literally. Auto-tenancy policies stamp
8045                // Table and the caller passes the concrete kind —
8046                // we allow Table policies to apply cross-kind for
8047                // backwards compat.
8048                if p.target_kind != kind
8049                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
8050                {
8051                    return None;
8052                }
8053                // Action gate — `None` means "ALL" actions.
8054                if let Some(a) = p.action {
8055                    if a != action {
8056                        return None;
8057                    }
8058                }
8059                // Role gate — `None` means "any role".
8060                if let Some(p_role) = p.role.as_deref() {
8061                    match role {
8062                        Some(r) if r == p_role => {}
8063                        _ => return None,
8064                    }
8065                }
8066                Some((*p.using).clone())
8067            })
8068            .collect()
8069    }
8070
8071    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
8072        let store = self.inner.db.store();
8073        if let Some(stats) =
8074            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
8075        {
8076            crate::storage::query::planner::stats_catalog::persist_table_stats(
8077                store.as_ref(),
8078                &stats,
8079            );
8080        } else {
8081            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
8082        }
8083        self.invalidate_plan_cache();
8084    }
8085
8086    pub(crate) fn note_table_write(&self, table: &str) {
8087        // Skip the write lock when the table is already marked
8088        // dirty. With single-row UPDATEs in a loop this used to
8089        // grab the planner_dirty_tables write lock N times even
8090        // though the first call already flipped the flag.
8091        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
8092        if !already_dirty {
8093            self.inner
8094                .planner_dirty_tables
8095                .write()
8096                .insert(table.to_string());
8097        }
8098        self.invalidate_result_cache_for_table(table);
8099    }
8100
8101    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
8102    /// `RuntimeQueryResult` so callers over the SQL interface see the
8103    /// plan tree in the same shape a SELECT produces.
8104    ///
8105    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
8106    /// Nodes are walked depth-first; `depth` counts from 0 at the
8107    /// root so a text renderer can indent without re-walking.
8108    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
8109        let explain = self.explain_query(inner_sql)?;
8110
8111        let columns = vec![
8112            "op".to_string(),
8113            "source".to_string(),
8114            "est_rows".to_string(),
8115            "est_cost".to_string(),
8116            "depth".to_string(),
8117        ];
8118
8119        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
8120
8121        // Prepend `CteScan` markers when the query carried a leading
8122        // WITH clause. The CTE bodies are already inlined into the
8123        // main plan tree, but operators reading EXPLAIN need to see
8124        // which named CTEs were resolved — without this row the plan
8125        // would look indistinguishable from a hand-inlined query.
8126        for name in &explain.cte_materializations {
8127            use std::sync::Arc;
8128            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
8129            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
8130            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
8131            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
8132            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
8133            rec.set_arc(Arc::from("depth"), Value::Integer(0));
8134            records.push(rec);
8135        }
8136
8137        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
8138
8139        let result = crate::storage::query::unified::UnifiedResult {
8140            columns,
8141            records,
8142            stats: Default::default(),
8143            pre_serialized_json: None,
8144        };
8145
8146        Ok(RuntimeQueryResult {
8147            query: raw_query.to_string(),
8148            mode: explain.mode,
8149            statement: "explain",
8150            engine: "runtime-explain",
8151            result,
8152            affected_rows: 0,
8153            statement_type: "select",
8154        })
8155    }
8156
8157    // -----------------------------------------------------------------
8158    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
8159    // -----------------------------------------------------------------
8160
8161    /// Project a `QueryExpr` to the (action, resource) pair the
8162    /// privilege engine cares about. Returns `Ok(())` for statements
8163    /// that don't touch user data (transaction control, SHOW, SET, etc.).
8164    pub(super) fn check_query_privilege(
8165        &self,
8166        expr: &crate::storage::query::ast::QueryExpr,
8167    ) -> Result<(), String> {
8168        use crate::auth::privileges::{Action, AuthzContext, Resource};
8169        use crate::auth::UserId;
8170        use crate::storage::query::ast::QueryExpr;
8171
8172        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
8173        // The bootstrap path itself goes through `execute_query` so this
8174        // is the only sensible default; once auth is wired, the gate
8175        // becomes active.
8176        let auth_store = match self.inner.auth_store.read().clone() {
8177            Some(s) => s,
8178            None => return Ok(()),
8179        };
8180
8181        // Resolve principal + role from the thread-local identity.
8182        // Anonymous (no identity) is allowed to read the bootstrap path
8183        // only when auth_store says so; we treat missing identity as
8184        // platform-admin-equivalent here so embedded test harnesses
8185        // continue to work without setting an identity.
8186        let (username, role) = match current_auth_identity() {
8187            Some(p) => p,
8188            None => return Ok(()),
8189        };
8190        let tenant = current_tenant();
8191
8192        let ctx = AuthzContext {
8193            principal: &username,
8194            effective_role: role,
8195            tenant: tenant.as_deref(),
8196        };
8197        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
8198
8199        // Map QueryExpr → (Action, Resource).
8200        let (action, resource) = match expr {
8201            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
8202            QueryExpr::QueueSelect(q) => (Action::Select, Resource::table_from_name(&q.queue)),
8203            QueryExpr::Graph(g) => {
8204                if auth_store.iam_authorization_enabled() {
8205                    self.check_graph_property_projection_privilege(
8206                        &auth_store,
8207                        &principal_id,
8208                        role,
8209                        tenant.as_deref(),
8210                        g,
8211                    )?;
8212                    return Ok(());
8213                }
8214                return Ok(());
8215            }
8216            QueryExpr::Vector(v) => {
8217                if auth_store.iam_authorization_enabled() {
8218                    self.check_table_like_column_projection_privilege(
8219                        &auth_store,
8220                        &principal_id,
8221                        role,
8222                        tenant.as_deref(),
8223                        &v.collection,
8224                        &["content".to_string()],
8225                    )?;
8226                    return Ok(());
8227                }
8228                return Ok(());
8229            }
8230            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
8231            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
8232            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
8233            // Joins inherit the read privilege from any constituent
8234            // table — for now we emit a single Select on the database
8235            // (admins bypass; non-admins need a Database/Schema grant).
8236            QueryExpr::Join(_) => (Action::Select, Resource::Database),
8237            // GRANT / REVOKE / ALTER USER are authority statements;
8238            // require Admin (the helper methods enforce).
8239            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
8240                return if role == crate::auth::Role::Admin {
8241                    Ok(())
8242                } else {
8243                    Err(format!(
8244                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
8245                        username, role
8246                    ))
8247                };
8248            }
8249            QueryExpr::CreateIamPolicy { id, .. } => {
8250                return self.check_policy_management_privilege(
8251                    &auth_store,
8252                    &principal_id,
8253                    role,
8254                    tenant.as_deref(),
8255                    "policy:put",
8256                    "policy",
8257                    id,
8258                );
8259            }
8260            QueryExpr::DropIamPolicy { id } => {
8261                return self.check_policy_management_privilege(
8262                    &auth_store,
8263                    &principal_id,
8264                    role,
8265                    tenant.as_deref(),
8266                    "policy:drop",
8267                    "policy",
8268                    id,
8269                );
8270            }
8271            QueryExpr::AttachPolicy { policy_id, .. } => {
8272                return self.check_policy_management_privilege(
8273                    &auth_store,
8274                    &principal_id,
8275                    role,
8276                    tenant.as_deref(),
8277                    "policy:attach",
8278                    "policy",
8279                    policy_id,
8280                );
8281            }
8282            QueryExpr::DetachPolicy { policy_id, .. } => {
8283                return self.check_policy_management_privilege(
8284                    &auth_store,
8285                    &principal_id,
8286                    role,
8287                    tenant.as_deref(),
8288                    "policy:detach",
8289                    "policy",
8290                    policy_id,
8291                );
8292            }
8293            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
8294                return Ok(());
8295            }
8296            QueryExpr::SimulatePolicy { .. } => {
8297                return self.check_policy_management_privilege(
8298                    &auth_store,
8299                    &principal_id,
8300                    role,
8301                    tenant.as_deref(),
8302                    "policy:simulate",
8303                    "policy",
8304                    "*",
8305                );
8306            }
8307            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
8308            // when IAM mode is active. Other DDL stays role-only for now.
8309            QueryExpr::DropTable(q) => {
8310                return self.check_ddl_collection_privilege(
8311                    &auth_store,
8312                    &principal_id,
8313                    role,
8314                    tenant.as_deref(),
8315                    &username,
8316                    "drop",
8317                    &q.name,
8318                );
8319            }
8320            QueryExpr::DropGraph(q) => {
8321                return self.check_ddl_collection_privilege(
8322                    &auth_store,
8323                    &principal_id,
8324                    role,
8325                    tenant.as_deref(),
8326                    &username,
8327                    "drop",
8328                    &q.name,
8329                );
8330            }
8331            QueryExpr::DropVector(q) => {
8332                return self.check_ddl_collection_privilege(
8333                    &auth_store,
8334                    &principal_id,
8335                    role,
8336                    tenant.as_deref(),
8337                    &username,
8338                    "drop",
8339                    &q.name,
8340                );
8341            }
8342            QueryExpr::DropDocument(q) => {
8343                return self.check_ddl_collection_privilege(
8344                    &auth_store,
8345                    &principal_id,
8346                    role,
8347                    tenant.as_deref(),
8348                    &username,
8349                    "drop",
8350                    &q.name,
8351                );
8352            }
8353            QueryExpr::DropKv(q) => {
8354                return self.check_ddl_collection_privilege(
8355                    &auth_store,
8356                    &principal_id,
8357                    role,
8358                    tenant.as_deref(),
8359                    &username,
8360                    "drop",
8361                    &q.name,
8362                );
8363            }
8364            QueryExpr::DropCollection(q) => {
8365                return self.check_ddl_collection_privilege(
8366                    &auth_store,
8367                    &principal_id,
8368                    role,
8369                    tenant.as_deref(),
8370                    &username,
8371                    "drop",
8372                    &q.name,
8373                );
8374            }
8375            QueryExpr::Truncate(q) => {
8376                return self.check_ddl_collection_privilege(
8377                    &auth_store,
8378                    &principal_id,
8379                    role,
8380                    tenant.as_deref(),
8381                    &username,
8382                    "truncate",
8383                    &q.name,
8384                );
8385            }
8386            // Remaining DDL — gate on Write role. Fine-grained grants TBD.
8387            QueryExpr::CreateTable(_)
8388            | QueryExpr::CreateCollection(_)
8389            | QueryExpr::CreateVector(_)
8390            | QueryExpr::AlterTable(_)
8391            | QueryExpr::CreateIndex(_)
8392            | QueryExpr::DropIndex(_)
8393            | QueryExpr::CreateSchema(_)
8394            | QueryExpr::DropSchema(_)
8395            | QueryExpr::CreateSequence(_)
8396            | QueryExpr::DropSequence(_)
8397            | QueryExpr::CreateView(_)
8398            | QueryExpr::DropView(_)
8399            | QueryExpr::RefreshMaterializedView(_)
8400            | QueryExpr::CreatePolicy(_)
8401            | QueryExpr::DropPolicy(_)
8402            | QueryExpr::CreateServer(_)
8403            | QueryExpr::DropServer(_)
8404            | QueryExpr::CreateForeignTable(_)
8405            | QueryExpr::DropForeignTable(_)
8406            | QueryExpr::CreateTimeSeries(_)
8407            | QueryExpr::DropTimeSeries(_)
8408            | QueryExpr::CreateQueue(_)
8409            | QueryExpr::AlterQueue(_)
8410            | QueryExpr::DropQueue(_)
8411            | QueryExpr::CreateTree(_)
8412            | QueryExpr::DropTree(_) => {
8413                return if role >= crate::auth::Role::Write {
8414                    Ok(())
8415                } else {
8416                    Err(format!(
8417                        "principal=`{}` role=`{:?}` cannot issue DDL",
8418                        username, role
8419                    ))
8420                };
8421            }
8422            // Migration DDL — CREATE MIGRATION requires Write role (schema author).
8423            QueryExpr::CreateMigration(_) => {
8424                return if role >= crate::auth::Role::Write {
8425                    Ok(())
8426                } else {
8427                    Err(format!(
8428                        "principal=`{}` role=`{:?}` cannot issue CREATE MIGRATION",
8429                        username, role
8430                    ))
8431                };
8432            }
8433            // APPLY / ROLLBACK change data and schema — require Admin.
8434            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
8435                return if role == crate::auth::Role::Admin {
8436                    Ok(())
8437                } else {
8438                    Err(format!(
8439                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
8440                        username, role
8441                    ))
8442                };
8443            }
8444            // EXPLAIN MIGRATION is read-only — any authenticated principal.
8445            QueryExpr::ExplainMigration(_) => return Ok(()),
8446            // Everything else (SET, SHOW, transaction control, graph
8447            // commands, queue/tree commands, MaintenanceCommand …)
8448            // is allowed for any authenticated principal.
8449            _ => return Ok(()),
8450        };
8451
8452        if auth_store.iam_authorization_enabled() {
8453            let iam_action = legacy_action_to_iam(action);
8454            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
8455            let iam_ctx = runtime_iam_context(role, tenant.as_deref());
8456            if !auth_store.check_policy_authz(&principal_id, iam_action, &iam_resource, &iam_ctx) {
8457                return Err(format!(
8458                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
8459                    username, iam_action, iam_resource.kind, iam_resource.name
8460                ));
8461            }
8462
8463            if let QueryExpr::Table(table) = expr {
8464                self.check_table_column_projection_privilege(
8465                    &auth_store,
8466                    &principal_id,
8467                    &iam_ctx,
8468                    table,
8469                )?;
8470            }
8471
8472            if let QueryExpr::Update(update) = expr {
8473                let columns = update_set_target_columns(update);
8474                if !columns.is_empty() {
8475                    let request = column_access_request_for_table_update(&update.table, columns);
8476                    let outcome =
8477                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
8478                    if let Some(denied) = outcome.first_denied_column() {
8479                        return Err(format!(
8480                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
8481                            username, iam_action, denied.resource.kind, denied.resource.name
8482                        ));
8483                    }
8484                    if !outcome.allowed() {
8485                        return Err(format!(
8486                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
8487                            username,
8488                            iam_action,
8489                            outcome.table_resource.kind,
8490                            outcome.table_resource.name
8491                        ));
8492                    }
8493                }
8494            }
8495
8496            Ok(())
8497        } else {
8498            auth_store
8499                .check_grant(&ctx, action, &resource)
8500                .map_err(|e| e.to_string())
8501        }
8502    }
8503
8504    fn check_table_column_projection_privilege(
8505        &self,
8506        auth_store: &Arc<crate::auth::store::AuthStore>,
8507        principal: &crate::auth::UserId,
8508        ctx: &crate::auth::policies::EvalContext,
8509        table: &crate::storage::query::ast::TableQuery,
8510    ) -> Result<(), String> {
8511        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
8512
8513        let columns = requested_table_columns_for_policy(table);
8514        if columns.is_empty() {
8515            return Ok(());
8516        }
8517
8518        let request = ColumnAccessRequest::select(table.table.clone(), columns);
8519        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
8520        if outcome.allowed() {
8521            return Ok(());
8522        }
8523
8524        if !matches!(
8525            outcome.table_decision,
8526            crate::auth::policies::Decision::Allow { .. }
8527                | crate::auth::policies::Decision::AdminBypass
8528        ) {
8529            return Err(format!(
8530                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
8531                principal, outcome.table_resource.kind, outcome.table_resource.name
8532            ));
8533        }
8534
8535        let denied = outcome
8536            .first_denied_column()
8537            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
8538        match denied {
8539            Some(decision) => Err(format!(
8540                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
8541                principal, decision.resource.kind, decision.resource.name
8542            )),
8543            None => Ok(()),
8544        }
8545    }
8546
8547    fn check_graph_property_projection_privilege(
8548        &self,
8549        auth_store: &Arc<crate::auth::store::AuthStore>,
8550        principal: &crate::auth::UserId,
8551        role: crate::auth::Role,
8552        tenant: Option<&str>,
8553        query: &crate::storage::query::ast::GraphQuery,
8554    ) -> Result<(), String> {
8555        let columns = explicit_graph_projection_properties(query);
8556        if columns.is_empty() {
8557            return Ok(());
8558        }
8559        self.check_table_like_column_projection_privilege(
8560            auth_store, principal, role, tenant, "graph", &columns,
8561        )
8562    }
8563
8564    fn check_table_like_column_projection_privilege(
8565        &self,
8566        auth_store: &Arc<crate::auth::store::AuthStore>,
8567        principal: &crate::auth::UserId,
8568        role: crate::auth::Role,
8569        tenant: Option<&str>,
8570        table: &str,
8571        columns: &[String],
8572    ) -> Result<(), String> {
8573        let iam_ctx = runtime_iam_context(role, tenant);
8574        let request =
8575            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
8576        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
8577        if outcome.allowed() {
8578            return Ok(());
8579        }
8580        let denied = outcome
8581            .first_denied_column()
8582            .map(|d| d.resource.name.clone())
8583            .unwrap_or_else(|| format!("{table}.<unknown>"));
8584        Err(format!(
8585            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
8586            principal, denied
8587        ))
8588    }
8589
8590    fn check_policy_management_privilege(
8591        &self,
8592        auth_store: &Arc<crate::auth::store::AuthStore>,
8593        principal: &crate::auth::UserId,
8594        role: crate::auth::Role,
8595        tenant: Option<&str>,
8596        action: &str,
8597        resource_kind: &str,
8598        resource_name: &str,
8599    ) -> Result<(), String> {
8600        if !auth_store.iam_authorization_enabled() {
8601            return if role == crate::auth::Role::Admin {
8602                Ok(())
8603            } else {
8604                Err(format!(
8605                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
8606                    principal, role
8607                ))
8608            };
8609        }
8610
8611        let mut resource = crate::auth::policies::ResourceRef::new(
8612            resource_kind.to_string(),
8613            resource_name.to_string(),
8614        );
8615        if let Some(t) = tenant {
8616            resource = resource.with_tenant(t.to_string());
8617        }
8618        let ctx = runtime_iam_context(role, tenant);
8619        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
8620            Ok(())
8621        } else {
8622            Err(format!(
8623                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
8624                principal, action, resource.kind, resource.name
8625            ))
8626        }
8627    }
8628
8629    /// IAM privilege check for DROP / TRUNCATE on a named collection.
8630    ///
8631    /// In legacy mode (IAM not enabled): requires Write role.
8632    /// In IAM mode: requires an explicit `drop` / `truncate` policy on
8633    /// `collection:<name>` (Admin role auto-passes via AdminBypass).
8634    /// Records an audit log entry for both allow and deny outcomes.
8635    fn check_ddl_collection_privilege(
8636        &self,
8637        auth_store: &Arc<crate::auth::store::AuthStore>,
8638        principal: &crate::auth::UserId,
8639        role: crate::auth::Role,
8640        tenant: Option<&str>,
8641        username: &str,
8642        action: &str,
8643        collection: &str,
8644    ) -> Result<(), String> {
8645        if role < crate::auth::Role::Write {
8646            let msg = format!(
8647                "principal=`{}` role=`{:?}` cannot issue DDL",
8648                username, role
8649            );
8650            self.inner.audit_log.record(
8651                action,
8652                username,
8653                collection,
8654                "denied",
8655                crate::json::Value::Null,
8656            );
8657            return Err(msg);
8658        }
8659
8660        if !auth_store.iam_authorization_enabled() {
8661            self.inner.audit_log.record(
8662                action,
8663                username,
8664                collection,
8665                "ok",
8666                crate::json::Value::Null,
8667            );
8668            return Ok(());
8669        }
8670
8671        let resource_name = collection.to_string();
8672        let mut resource = crate::auth::policies::ResourceRef::new(
8673            "collection".to_string(),
8674            resource_name.clone(),
8675        );
8676        if let Some(t) = tenant {
8677            resource = resource.with_tenant(t.to_string());
8678        }
8679        let ctx = runtime_iam_context(role, tenant);
8680        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
8681            self.inner.audit_log.record(
8682                action,
8683                username,
8684                &resource_name,
8685                "ok",
8686                crate::json::Value::Null,
8687            );
8688            Ok(())
8689        } else {
8690            self.inner.audit_log.record(
8691                action,
8692                username,
8693                &resource_name,
8694                "denied",
8695                crate::json::Value::Null,
8696            );
8697            Err(format!(
8698                "principal=`{}` action=`{}` resource=`collection:{}` denied by IAM policy",
8699                username, action, resource_name
8700            ))
8701        }
8702    }
8703
8704    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
8705    fn execute_grant_statement(
8706        &self,
8707        query: &str,
8708        stmt: &crate::storage::query::ast::GrantStmt,
8709    ) -> RedDBResult<RuntimeQueryResult> {
8710        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
8711        use crate::auth::UserId;
8712        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
8713
8714        let auth_store = self
8715            .inner
8716            .auth_store
8717            .read()
8718            .clone()
8719            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8720
8721        // Granter identity + role.
8722        let (gname, grole) = current_auth_identity().ok_or_else(|| {
8723            RedDBError::Query("GRANT requires an authenticated principal".to_string())
8724        })?;
8725        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
8726        let granter_role = grole;
8727
8728        // Build the action set.
8729        let mut actions: Vec<Action> = Vec::new();
8730        if stmt.all {
8731            actions.push(Action::All);
8732        } else {
8733            for kw in &stmt.actions {
8734                let a = Action::from_keyword(kw).ok_or_else(|| {
8735                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
8736                })?;
8737                actions.push(a);
8738            }
8739        }
8740
8741        // Audit emit (printed; structured emission is Agent #4's lane).
8742        let mut applied = 0usize;
8743        for obj in &stmt.objects {
8744            let resource = match stmt.object_kind {
8745                GrantObjectKind::Table => Resource::Table {
8746                    schema: obj.schema.clone(),
8747                    table: obj.name.clone(),
8748                },
8749                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
8750                GrantObjectKind::Database => Resource::Database,
8751                GrantObjectKind::Function => Resource::Function {
8752                    schema: obj.schema.clone(),
8753                    name: obj.name.clone(),
8754                },
8755            };
8756            for principal in &stmt.principals {
8757                let p = match principal {
8758                    GrantPrincipalRef::Public => GrantPrincipal::Public,
8759                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
8760                    GrantPrincipalRef::User { tenant, name } => {
8761                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
8762                    }
8763                };
8764                // Tenant of the grant follows the granter's tenant
8765                // (cross-tenant guard inside `AuthStore::grant`).
8766                let tenant = granter.tenant.clone();
8767                auth_store
8768                    .grant(
8769                        &granter,
8770                        granter_role,
8771                        p.clone(),
8772                        resource.clone(),
8773                        actions.clone(),
8774                        stmt.with_grant_option,
8775                        tenant.clone(),
8776                    )
8777                    .map_err(|e| RedDBError::Query(e.to_string()))?;
8778
8779                // IAM policy translation: every GRANT also lands as a
8780                // synthetic `_grant_<id>` policy attached to the
8781                // principal so the new evaluator sees it.
8782                if let Some(policy) =
8783                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
8784                {
8785                    let pid = policy.id.clone();
8786                    auth_store
8787                        .put_policy_internal(policy)
8788                        .map_err(|e| RedDBError::Query(e.to_string()))?;
8789                    let attachment = match &p {
8790                        GrantPrincipal::User(uid) => {
8791                            crate::auth::store::PrincipalRef::User(uid.clone())
8792                        }
8793                        GrantPrincipal::Group(group) => {
8794                            crate::auth::store::PrincipalRef::Group(group.clone())
8795                        }
8796                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
8797                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
8798                        ),
8799                    };
8800                    auth_store
8801                        .attach_policy(attachment, &pid)
8802                        .map_err(|e| RedDBError::Query(e.to_string()))?;
8803                }
8804                applied += 1;
8805                tracing::info!(
8806                    target: "audit",
8807                    principal = %granter,
8808                    action = "grant",
8809                    "GRANT applied"
8810                );
8811            }
8812        }
8813
8814        self.invalidate_result_cache();
8815        Ok(RuntimeQueryResult::ok_message(
8816            query.to_string(),
8817            &format!("GRANT applied to {} target(s)", applied),
8818            "grant",
8819        ))
8820    }
8821
8822    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
8823    fn execute_revoke_statement(
8824        &self,
8825        query: &str,
8826        stmt: &crate::storage::query::ast::RevokeStmt,
8827    ) -> RedDBResult<RuntimeQueryResult> {
8828        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
8829        use crate::auth::UserId;
8830        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
8831
8832        let auth_store = self
8833            .inner
8834            .auth_store
8835            .read()
8836            .clone()
8837            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8838
8839        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
8840            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
8841        })?;
8842        let granter_role = grole;
8843
8844        let actions: Vec<Action> = if stmt.all {
8845            vec![Action::All]
8846        } else {
8847            stmt.actions
8848                .iter()
8849                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
8850                .collect()
8851        };
8852
8853        let mut total_removed = 0usize;
8854        for obj in &stmt.objects {
8855            let resource = match stmt.object_kind {
8856                GrantObjectKind::Table => Resource::Table {
8857                    schema: obj.schema.clone(),
8858                    table: obj.name.clone(),
8859                },
8860                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
8861                GrantObjectKind::Database => Resource::Database,
8862                GrantObjectKind::Function => Resource::Function {
8863                    schema: obj.schema.clone(),
8864                    name: obj.name.clone(),
8865                },
8866            };
8867            for principal in &stmt.principals {
8868                let p = match principal {
8869                    GrantPrincipalRef::Public => GrantPrincipal::Public,
8870                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
8871                    GrantPrincipalRef::User { tenant, name } => {
8872                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
8873                    }
8874                };
8875                let removed = auth_store
8876                    .revoke(granter_role, &p, &resource, &actions)
8877                    .map_err(|e| RedDBError::Query(e.to_string()))?;
8878                let _removed_policies =
8879                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
8880                total_removed += removed;
8881            }
8882        }
8883
8884        self.invalidate_result_cache();
8885        Ok(RuntimeQueryResult::ok_message(
8886            query.to_string(),
8887            &format!("REVOKE removed {} grant(s)", total_removed),
8888            "revoke",
8889        ))
8890    }
8891
8892    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
8893    fn execute_alter_user_statement(
8894        &self,
8895        query: &str,
8896        stmt: &crate::storage::query::ast::AlterUserStmt,
8897    ) -> RedDBResult<RuntimeQueryResult> {
8898        use crate::auth::privileges::UserAttributes;
8899        use crate::auth::UserId;
8900        use crate::storage::query::ast::AlterUserAttribute;
8901
8902        let auth_store = self
8903            .inner
8904            .auth_store
8905            .read()
8906            .clone()
8907            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8908
8909        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
8910            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
8911        })?;
8912        if grole != crate::auth::Role::Admin {
8913            return Err(RedDBError::Query(
8914                "ALTER USER requires Admin role".to_string(),
8915            ));
8916        }
8917
8918        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
8919
8920        // Apply attributes incrementally — each one reads the current
8921        // record, mutates the relevant field, writes back.
8922        let mut attrs = auth_store.user_attributes(&target);
8923        let mut enable_change: Option<bool> = None;
8924
8925        for a in &stmt.attributes {
8926            match a {
8927                AlterUserAttribute::ValidUntil(ts) => {
8928                    // Parse ISO-ish timestamp → ms since epoch. Fall
8929                    // back to integer-ms parsing for callers that pass
8930                    // `'1234567890123'`.
8931                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
8932                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
8933                    })?;
8934                    attrs.valid_until = Some(ms);
8935                }
8936                AlterUserAttribute::ConnectionLimit(n) => {
8937                    if *n < 0 {
8938                        return Err(RedDBError::Query(
8939                            "CONNECTION LIMIT must be non-negative".to_string(),
8940                        ));
8941                    }
8942                    attrs.connection_limit = Some(*n as u32);
8943                }
8944                AlterUserAttribute::SetSearchPath(p) => {
8945                    attrs.search_path = Some(p.clone());
8946                }
8947                AlterUserAttribute::AddGroup(g) => {
8948                    if !attrs.groups.iter().any(|existing| existing == g) {
8949                        attrs.groups.push(g.clone());
8950                        attrs.groups.sort();
8951                    }
8952                }
8953                AlterUserAttribute::DropGroup(g) => {
8954                    attrs.groups.retain(|existing| existing != g);
8955                }
8956                AlterUserAttribute::Enable => enable_change = Some(true),
8957                AlterUserAttribute::Disable => enable_change = Some(false),
8958                AlterUserAttribute::Password(_) => {
8959                    // Out of scope — accept the AST but no-op so the
8960                    // parser stays compatible with future password
8961                    // rotation work.
8962                }
8963            }
8964        }
8965
8966        auth_store
8967            .set_user_attributes(&target, attrs)
8968            .map_err(|e| RedDBError::Query(e.to_string()))?;
8969        if let Some(en) = enable_change {
8970            auth_store
8971                .set_user_enabled(&target, en)
8972                .map_err(|e| RedDBError::Query(e.to_string()))?;
8973        }
8974        self.invalidate_result_cache();
8975        tracing::info!(
8976            target: "audit",
8977            principal = %target,
8978            action = "alter_user",
8979            "ALTER USER applied"
8980        );
8981
8982        Ok(RuntimeQueryResult::ok_message(
8983            query.to_string(),
8984            &format!("ALTER USER {} applied", target),
8985            "alter_user",
8986        ))
8987    }
8988
8989    // -----------------------------------------------------------------
8990    // IAM policy executors
8991    // -----------------------------------------------------------------
8992
8993    fn execute_create_iam_policy(
8994        &self,
8995        query: &str,
8996        id: &str,
8997        json: &str,
8998    ) -> RedDBResult<RuntimeQueryResult> {
8999        use crate::auth::policies::Policy;
9000
9001        let auth_store = self
9002            .inner
9003            .auth_store
9004            .read()
9005            .clone()
9006            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9007
9008        // Parse + validate. The kernel rejects oversize / bad shape /
9009        // bad action keywords. If the supplied id differs from the JSON
9010        // id, override it with the SQL-provided id (the JSON id is
9011        // optional context — the SQL DDL form is authoritative).
9012        let mut policy = Policy::from_json_str(json)
9013            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
9014        if policy.id != id {
9015            policy.id = id.to_string();
9016        }
9017        let pid = policy.id.clone();
9018        auth_store
9019            .put_policy(policy)
9020            .map_err(|e| RedDBError::Query(e.to_string()))?;
9021
9022        let principal = current_auth_identity()
9023            .map(|(u, _)| u)
9024            .unwrap_or_else(|| "anonymous".into());
9025        tracing::info!(
9026            target: "audit",
9027            principal = %principal,
9028            action = "iam:policy.put",
9029            matched_policy_id = %pid,
9030            "CREATE POLICY applied"
9031        );
9032        self.inner.audit_log.record(
9033            "iam/policy.put",
9034            &principal,
9035            &pid,
9036            "ok",
9037            crate::json::Value::Null,
9038        );
9039
9040        self.invalidate_result_cache();
9041        Ok(RuntimeQueryResult::ok_message(
9042            query.to_string(),
9043            &format!("policy `{pid}` stored"),
9044            "create_iam_policy",
9045        ))
9046    }
9047
9048    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
9049        let auth_store = self
9050            .inner
9051            .auth_store
9052            .read()
9053            .clone()
9054            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9055        auth_store
9056            .delete_policy(id)
9057            .map_err(|e| RedDBError::Query(e.to_string()))?;
9058
9059        let principal = current_auth_identity()
9060            .map(|(u, _)| u)
9061            .unwrap_or_else(|| "anonymous".into());
9062        tracing::info!(
9063            target: "audit",
9064            principal = %principal,
9065            action = "iam:policy.drop",
9066            matched_policy_id = %id,
9067            "DROP POLICY applied"
9068        );
9069        self.inner.audit_log.record(
9070            "iam/policy.drop",
9071            &principal,
9072            id,
9073            "ok",
9074            crate::json::Value::Null,
9075        );
9076
9077        self.invalidate_result_cache();
9078        Ok(RuntimeQueryResult::ok_message(
9079            query.to_string(),
9080            &format!("policy `{id}` dropped"),
9081            "drop_iam_policy",
9082        ))
9083    }
9084
9085    fn execute_attach_policy(
9086        &self,
9087        query: &str,
9088        policy_id: &str,
9089        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9090    ) -> RedDBResult<RuntimeQueryResult> {
9091        use crate::auth::store::PrincipalRef;
9092        use crate::auth::UserId;
9093        use crate::storage::query::ast::PolicyPrincipalRef;
9094
9095        let auth_store = self
9096            .inner
9097            .auth_store
9098            .read()
9099            .clone()
9100            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9101        let p = match principal {
9102            PolicyPrincipalRef::User(u) => {
9103                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9104            }
9105            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9106        };
9107        let pretty_target = principal_label(principal);
9108        auth_store
9109            .attach_policy(p, policy_id)
9110            .map_err(|e| RedDBError::Query(e.to_string()))?;
9111
9112        let principal_str = current_auth_identity()
9113            .map(|(u, _)| u)
9114            .unwrap_or_else(|| "anonymous".into());
9115        tracing::info!(
9116            target: "audit",
9117            principal = %principal_str,
9118            action = "iam:policy.attach",
9119            matched_policy_id = %policy_id,
9120            target = %pretty_target,
9121            "ATTACH POLICY applied"
9122        );
9123        self.inner.audit_log.record(
9124            "iam/policy.attach",
9125            &principal_str,
9126            &pretty_target,
9127            "ok",
9128            crate::json::Value::Null,
9129        );
9130
9131        self.invalidate_result_cache();
9132        Ok(RuntimeQueryResult::ok_message(
9133            query.to_string(),
9134            &format!("policy `{policy_id}` attached to {pretty_target}"),
9135            "attach_policy",
9136        ))
9137    }
9138
9139    fn execute_detach_policy(
9140        &self,
9141        query: &str,
9142        policy_id: &str,
9143        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9144    ) -> RedDBResult<RuntimeQueryResult> {
9145        use crate::auth::store::PrincipalRef;
9146        use crate::auth::UserId;
9147        use crate::storage::query::ast::PolicyPrincipalRef;
9148
9149        let auth_store = self
9150            .inner
9151            .auth_store
9152            .read()
9153            .clone()
9154            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9155        let p = match principal {
9156            PolicyPrincipalRef::User(u) => {
9157                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9158            }
9159            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9160        };
9161        let pretty_target = principal_label(principal);
9162        auth_store
9163            .detach_policy(p, policy_id)
9164            .map_err(|e| RedDBError::Query(e.to_string()))?;
9165
9166        let principal_str = current_auth_identity()
9167            .map(|(u, _)| u)
9168            .unwrap_or_else(|| "anonymous".into());
9169        tracing::info!(
9170            target: "audit",
9171            principal = %principal_str,
9172            action = "iam:policy.detach",
9173            matched_policy_id = %policy_id,
9174            target = %pretty_target,
9175            "DETACH POLICY applied"
9176        );
9177        self.inner.audit_log.record(
9178            "iam/policy.detach",
9179            &principal_str,
9180            &pretty_target,
9181            "ok",
9182            crate::json::Value::Null,
9183        );
9184
9185        self.invalidate_result_cache();
9186        Ok(RuntimeQueryResult::ok_message(
9187            query.to_string(),
9188            &format!("policy `{policy_id}` detached from {pretty_target}"),
9189            "detach_policy",
9190        ))
9191    }
9192
9193    fn execute_show_policies(
9194        &self,
9195        query: &str,
9196        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
9197    ) -> RedDBResult<RuntimeQueryResult> {
9198        use crate::auth::UserId;
9199        use crate::storage::query::ast::PolicyPrincipalRef;
9200        use crate::storage::query::unified::UnifiedRecord;
9201        use crate::storage::schema::Value as SchemaValue;
9202        use std::sync::Arc;
9203
9204        let auth_store = self
9205            .inner
9206            .auth_store
9207            .read()
9208            .clone()
9209            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9210
9211        let pols = match filter {
9212            None => auth_store.list_policies(),
9213            Some(PolicyPrincipalRef::User(u)) => {
9214                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
9215                auth_store.effective_policies(&id)
9216            }
9217            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
9218        };
9219
9220        let mut records = Vec::with_capacity(pols.len());
9221        for p in pols.iter() {
9222            let mut rec = UnifiedRecord::default();
9223            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
9224            rec.set_arc(
9225                Arc::from("statements"),
9226                SchemaValue::Integer(p.statements.len() as i64),
9227            );
9228            rec.set_arc(
9229                Arc::from("tenant"),
9230                p.tenant
9231                    .as_deref()
9232                    .map(|t| SchemaValue::text(t.to_string()))
9233                    .unwrap_or(SchemaValue::Null),
9234            );
9235            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
9236            records.push(rec);
9237        }
9238        let mut result = crate::storage::query::unified::UnifiedResult::empty();
9239        result.records = records;
9240        Ok(RuntimeQueryResult {
9241            query: query.to_string(),
9242            mode: crate::storage::query::modes::QueryMode::Sql,
9243            statement: "show_policies",
9244            engine: "iam-policies",
9245            result,
9246            affected_rows: 0,
9247            statement_type: "select",
9248        })
9249    }
9250
9251    fn execute_show_effective_permissions(
9252        &self,
9253        query: &str,
9254        user: &crate::storage::query::ast::PolicyUserRef,
9255        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
9256    ) -> RedDBResult<RuntimeQueryResult> {
9257        use crate::auth::UserId;
9258        use crate::storage::query::unified::UnifiedRecord;
9259        use crate::storage::schema::Value as SchemaValue;
9260        use std::sync::Arc;
9261
9262        let auth_store = self
9263            .inner
9264            .auth_store
9265            .read()
9266            .clone()
9267            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9268        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
9269        let pols = auth_store.effective_policies(&id);
9270
9271        // Show one row per (policy, statement) tuple, plus any
9272        // resource-level filter passed by the caller.
9273        let mut records = Vec::new();
9274        for p in pols.iter() {
9275            for (idx, st) in p.statements.iter().enumerate() {
9276                if let Some(_r) = resource {
9277                    // Naive filter: render statement targets to strings
9278                    // and skip if no match. Conservative default = include
9279                    // (the simulator handles fine-grained matching).
9280                }
9281                let mut rec = UnifiedRecord::default();
9282                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
9283                rec.set_arc(
9284                    Arc::from("statement_index"),
9285                    SchemaValue::Integer(idx as i64),
9286                );
9287                rec.set_arc(
9288                    Arc::from("sid"),
9289                    st.sid
9290                        .as_deref()
9291                        .map(|s| SchemaValue::text(s.to_string()))
9292                        .unwrap_or(SchemaValue::Null),
9293                );
9294                rec.set_arc(
9295                    Arc::from("effect"),
9296                    SchemaValue::text(match st.effect {
9297                        crate::auth::policies::Effect::Allow => "allow",
9298                        crate::auth::policies::Effect::Deny => "deny",
9299                    }),
9300                );
9301                rec.set_arc(
9302                    Arc::from("actions"),
9303                    SchemaValue::Integer(st.actions.len() as i64),
9304                );
9305                rec.set_arc(
9306                    Arc::from("resources"),
9307                    SchemaValue::Integer(st.resources.len() as i64),
9308                );
9309                records.push(rec);
9310            }
9311        }
9312        let mut result = crate::storage::query::unified::UnifiedResult::empty();
9313        result.records = records;
9314        Ok(RuntimeQueryResult {
9315            query: query.to_string(),
9316            mode: crate::storage::query::modes::QueryMode::Sql,
9317            statement: "show_effective_permissions",
9318            engine: "iam-policies",
9319            result,
9320            affected_rows: 0,
9321            statement_type: "select",
9322        })
9323    }
9324
9325    fn execute_simulate_policy(
9326        &self,
9327        query: &str,
9328        user: &crate::storage::query::ast::PolicyUserRef,
9329        action: &str,
9330        resource: &crate::storage::query::ast::PolicyResourceRef,
9331    ) -> RedDBResult<RuntimeQueryResult> {
9332        use crate::auth::policies::ResourceRef;
9333        use crate::auth::store::SimCtx;
9334        use crate::auth::UserId;
9335        use crate::storage::query::unified::UnifiedRecord;
9336        use crate::storage::schema::Value as SchemaValue;
9337        use std::sync::Arc;
9338
9339        let auth_store = self
9340            .inner
9341            .auth_store
9342            .read()
9343            .clone()
9344            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9345        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
9346        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
9347        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
9348
9349        let principal_str = current_auth_identity()
9350            .map(|(u, _)| u)
9351            .unwrap_or_else(|| "anonymous".into());
9352        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
9353        tracing::info!(
9354            target: "audit",
9355            principal = %principal_str,
9356            action = "iam:policy.simulate",
9357            decision = %decision_str,
9358            matched_policy_id = ?matched_pid,
9359            matched_sid = ?matched_sid,
9360            "SIMULATE issued"
9361        );
9362        self.inner.audit_log.record(
9363            "iam/policy.simulate",
9364            &principal_str,
9365            &id.to_string(),
9366            "ok",
9367            crate::json::Value::Null,
9368        );
9369
9370        let mut rec = UnifiedRecord::default();
9371        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
9372        rec.set_arc(
9373            Arc::from("matched_policy_id"),
9374            matched_pid
9375                .map(SchemaValue::text)
9376                .unwrap_or(SchemaValue::Null),
9377        );
9378        rec.set_arc(
9379            Arc::from("matched_sid"),
9380            matched_sid
9381                .map(SchemaValue::text)
9382                .unwrap_or(SchemaValue::Null),
9383        );
9384        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
9385        rec.set_arc(
9386            Arc::from("trail_len"),
9387            SchemaValue::Integer(outcome.trail.len() as i64),
9388        );
9389        let mut result = crate::storage::query::unified::UnifiedResult::empty();
9390        result.records = vec![rec];
9391        Ok(RuntimeQueryResult {
9392            query: query.to_string(),
9393            mode: crate::storage::query::modes::QueryMode::Sql,
9394            statement: "simulate_policy",
9395            engine: "iam-policies",
9396            result,
9397            affected_rows: 0,
9398            statement_type: "select",
9399        })
9400    }
9401}
9402
9403/// Translate a parsed GRANT into a synthetic IAM policy whose id
9404/// starts with `_grant_<unique>`. PUBLIC is represented as an
9405/// implicit IAM group; legacy GROUP grants are still rejected by the
9406/// grant store and are not translated here.
9407fn grant_to_iam_policy(
9408    principal: &crate::auth::privileges::GrantPrincipal,
9409    resource: &crate::auth::privileges::Resource,
9410    actions: &[crate::auth::privileges::Action],
9411    tenant: Option<&str>,
9412) -> Option<crate::auth::policies::Policy> {
9413    use crate::auth::policies::{
9414        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
9415    };
9416    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
9417
9418    if matches!(principal, GrantPrincipal::Group(_)) {
9419        return None;
9420    }
9421
9422    let now = crate::auth::now_ms();
9423    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
9424
9425    let resource_str = match resource {
9426        Resource::Database => "table:*".to_string(),
9427        Resource::Schema(s) => format!("table:{s}.*"),
9428        Resource::Table { schema, table } => match schema {
9429            Some(s) => format!("table:{s}.{table}"),
9430            None => format!("table:{table}"),
9431        },
9432        Resource::Function { schema, name } => match schema {
9433            Some(s) => format!("function:{s}.{name}"),
9434            None => format!("function:{name}"),
9435        },
9436    };
9437
9438    // Compile actions — fall back to `*` only when the grant included
9439    // `Action::All`. Map every other action keyword to its lowercase
9440    // form so it lines up with the kernel's allowlist.
9441    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
9442        vec![ActionPattern::Wildcard]
9443    } else {
9444        actions
9445            .iter()
9446            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
9447            .collect()
9448    };
9449    if action_patterns.is_empty() {
9450        return None;
9451    }
9452
9453    // Inline resource compilation matching the kernel's `compile_resource`:
9454    //   * `*` → wildcard
9455    //   * contains `*` → glob
9456    //   * `kind:name` → exact
9457    let resource_patterns = if resource_str == "*" {
9458        vec![ResourcePattern::Wildcard]
9459    } else if resource_str.contains('*') {
9460        vec![ResourcePattern::Glob(resource_str.clone())]
9461    } else if let Some((kind, name)) = resource_str.split_once(':') {
9462        vec![ResourcePattern::Exact {
9463            kind: kind.to_string(),
9464            name: name.to_string(),
9465        }]
9466    } else {
9467        vec![ResourcePattern::Wildcard]
9468    };
9469
9470    let policy = Policy {
9471        id,
9472        version: 1,
9473        tenant: tenant.map(|t| t.to_string()),
9474        created_at: now,
9475        updated_at: now,
9476        statements: vec![Statement {
9477            sid: None,
9478            effect: Effect::Allow,
9479            actions: action_patterns,
9480            resources: resource_patterns,
9481            condition: None,
9482        }],
9483    };
9484    if policy.validate().is_err() {
9485        return None;
9486    }
9487    Some(policy)
9488}
9489
9490fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
9491    use crate::auth::privileges::Action;
9492    match action {
9493        Action::Select => "select",
9494        Action::Insert => "insert",
9495        Action::Update => "update",
9496        Action::Delete => "delete",
9497        Action::Truncate => "truncate",
9498        Action::References => "references",
9499        Action::Execute => "execute",
9500        Action::Usage => "usage",
9501        Action::All => "*",
9502    }
9503}
9504
9505fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
9506    let mut columns = Vec::new();
9507    for (column, _) in &query.assignment_exprs {
9508        if !columns.iter().any(|seen| seen == column) {
9509            columns.push(column.clone());
9510        }
9511    }
9512    columns
9513}
9514
9515fn column_access_request_for_table_update(
9516    table_name: &str,
9517    columns: Vec<String>,
9518) -> crate::auth::ColumnAccessRequest {
9519    match table_name.split_once('.') {
9520        Some((schema, table)) => {
9521            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
9522                .with_schema(schema.to_string())
9523        }
9524        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
9525    }
9526}
9527
9528fn requested_table_columns_for_policy(
9529    table: &crate::storage::query::ast::TableQuery,
9530) -> Vec<String> {
9531    use crate::storage::query::sql_lowering::{
9532        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
9533        effective_table_projections,
9534    };
9535
9536    let table_name = table.table.as_str();
9537    let table_alias = table.alias.as_deref();
9538    let mut columns = std::collections::BTreeSet::new();
9539
9540    for projection in effective_table_projections(table) {
9541        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
9542    }
9543    if let Some(filter) = effective_table_filter(table) {
9544        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
9545    }
9546    for expr in effective_table_group_by_exprs(table) {
9547        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
9548    }
9549    if let Some(filter) = effective_table_having_filter(table) {
9550        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
9551    }
9552    for order in &table.order_by {
9553        if let Some(expr) = order.expr.as_ref() {
9554            collect_expr_columns(expr, table_name, table_alias, &mut columns);
9555        } else {
9556            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
9557        }
9558    }
9559
9560    columns.into_iter().collect()
9561}
9562
9563fn collect_projection_columns(
9564    projection: &crate::storage::query::ast::Projection,
9565    table_name: &str,
9566    table_alias: Option<&str>,
9567    columns: &mut std::collections::BTreeSet<String>,
9568) {
9569    use crate::storage::query::ast::Projection;
9570    match projection {
9571        Projection::All => {
9572            columns.insert("*".to_string());
9573        }
9574        Projection::Column(column) | Projection::Alias(column, _) => {
9575            if column != "*" {
9576                columns.insert(column.clone());
9577            }
9578        }
9579        Projection::Function(_, args) => {
9580            for arg in args {
9581                collect_projection_columns(arg, table_name, table_alias, columns);
9582            }
9583        }
9584        Projection::Expression(filter, _) => {
9585            collect_filter_columns(filter, table_name, table_alias, columns);
9586        }
9587        Projection::Field(field, _) => {
9588            collect_field_ref_column(field, table_name, table_alias, columns);
9589        }
9590    }
9591}
9592
9593fn collect_filter_columns(
9594    filter: &crate::storage::query::ast::Filter,
9595    table_name: &str,
9596    table_alias: Option<&str>,
9597    columns: &mut std::collections::BTreeSet<String>,
9598) {
9599    use crate::storage::query::ast::Filter;
9600    match filter {
9601        Filter::Compare { field, .. }
9602        | Filter::IsNull(field)
9603        | Filter::IsNotNull(field)
9604        | Filter::In { field, .. }
9605        | Filter::Between { field, .. }
9606        | Filter::Like { field, .. }
9607        | Filter::StartsWith { field, .. }
9608        | Filter::EndsWith { field, .. }
9609        | Filter::Contains { field, .. } => {
9610            collect_field_ref_column(field, table_name, table_alias, columns);
9611        }
9612        Filter::CompareFields { left, right, .. } => {
9613            collect_field_ref_column(left, table_name, table_alias, columns);
9614            collect_field_ref_column(right, table_name, table_alias, columns);
9615        }
9616        Filter::CompareExpr { lhs, rhs, .. } => {
9617            collect_expr_columns(lhs, table_name, table_alias, columns);
9618            collect_expr_columns(rhs, table_name, table_alias, columns);
9619        }
9620        Filter::And(left, right) | Filter::Or(left, right) => {
9621            collect_filter_columns(left, table_name, table_alias, columns);
9622            collect_filter_columns(right, table_name, table_alias, columns);
9623        }
9624        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
9625    }
9626}
9627
9628fn collect_expr_columns(
9629    expr: &crate::storage::query::ast::Expr,
9630    table_name: &str,
9631    table_alias: Option<&str>,
9632    columns: &mut std::collections::BTreeSet<String>,
9633) {
9634    use crate::storage::query::ast::Expr;
9635    match expr {
9636        Expr::Column { field, .. } => {
9637            collect_field_ref_column(field, table_name, table_alias, columns);
9638        }
9639        Expr::Literal { .. } | Expr::Parameter { .. } => {}
9640        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
9641            collect_expr_columns(operand, table_name, table_alias, columns);
9642        }
9643        Expr::BinaryOp { lhs, rhs, .. } => {
9644            collect_expr_columns(lhs, table_name, table_alias, columns);
9645            collect_expr_columns(rhs, table_name, table_alias, columns);
9646        }
9647        Expr::FunctionCall { args, .. } => {
9648            for arg in args {
9649                collect_expr_columns(arg, table_name, table_alias, columns);
9650            }
9651        }
9652        Expr::Case {
9653            branches, else_, ..
9654        } => {
9655            for (condition, value) in branches {
9656                collect_expr_columns(condition, table_name, table_alias, columns);
9657                collect_expr_columns(value, table_name, table_alias, columns);
9658            }
9659            if let Some(value) = else_ {
9660                collect_expr_columns(value, table_name, table_alias, columns);
9661            }
9662        }
9663        Expr::IsNull { operand, .. } => {
9664            collect_expr_columns(operand, table_name, table_alias, columns);
9665        }
9666        Expr::InList { target, values, .. } => {
9667            collect_expr_columns(target, table_name, table_alias, columns);
9668            for value in values {
9669                collect_expr_columns(value, table_name, table_alias, columns);
9670            }
9671        }
9672        Expr::Between {
9673            target, low, high, ..
9674        } => {
9675            collect_expr_columns(target, table_name, table_alias, columns);
9676            collect_expr_columns(low, table_name, table_alias, columns);
9677            collect_expr_columns(high, table_name, table_alias, columns);
9678        }
9679        Expr::Subquery { .. } => {}
9680    }
9681}
9682
9683fn collect_field_ref_column(
9684    field: &crate::storage::query::ast::FieldRef,
9685    table_name: &str,
9686    table_alias: Option<&str>,
9687    columns: &mut std::collections::BTreeSet<String>,
9688) {
9689    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
9690        if column != "*" {
9691            columns.insert(column);
9692        }
9693    }
9694}
9695
9696fn policy_column_name_from_field_ref(
9697    field: &crate::storage::query::ast::FieldRef,
9698    table_name: &str,
9699    table_alias: Option<&str>,
9700) -> Option<String> {
9701    match field {
9702        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
9703            if column == "*" {
9704                return Some("*".to_string());
9705            }
9706            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
9707                Some(column.clone())
9708            } else {
9709                Some(format!("{table}.{column}"))
9710            }
9711        }
9712        _ => None,
9713    }
9714}
9715
9716fn legacy_resource_to_iam(
9717    resource: &crate::auth::privileges::Resource,
9718    tenant: Option<&str>,
9719) -> crate::auth::policies::ResourceRef {
9720    use crate::auth::privileges::Resource;
9721
9722    let (kind, name) = match resource {
9723        Resource::Database => ("database".to_string(), "*".to_string()),
9724        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
9725        Resource::Table { schema, table } => (
9726            "table".to_string(),
9727            match schema {
9728                Some(s) => format!("{s}.{table}"),
9729                None => table.clone(),
9730            },
9731        ),
9732        Resource::Function { schema, name } => (
9733            "function".to_string(),
9734            match schema {
9735                Some(s) => format!("{s}.{name}"),
9736                None => name.clone(),
9737            },
9738        ),
9739    };
9740
9741    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
9742    if let Some(t) = tenant {
9743        out = out.with_tenant(t.to_string());
9744    }
9745    out
9746}
9747
/// One side of a join for column-attribution purposes.
#[derive(Debug)]
struct JoinTableSide {
    // Underlying table name as written in the FROM clause.
    table: String,
    // Effective alias: the SQL alias when present, otherwise the table
    // name itself (see `table_side_context`).
    alias: String,
}
9753
9754fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
9755    match expr {
9756        QueryExpr::Table(table) => Some(JoinTableSide {
9757            table: table.table.clone(),
9758            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
9759        }),
9760        _ => None,
9761    }
9762}
9763
9764fn collect_projection_columns_for_table(
9765    projection: &Projection,
9766    table: &str,
9767    alias: Option<&str>,
9768    out: &mut BTreeSet<String>,
9769) {
9770    match projection {
9771        Projection::Column(column) | Projection::Alias(column, _) => {
9772            match split_qualified_column(column) {
9773                Some((qualifier, column))
9774                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
9775                {
9776                    push_policy_column(column, out);
9777                }
9778                Some(_) => {}
9779                None => push_policy_column(column, out),
9780            }
9781        }
9782        Projection::Field(
9783            FieldRef::TableColumn {
9784                table: qualifier,
9785                column,
9786            },
9787            _,
9788        ) => {
9789            if qualifier.is_empty()
9790                || qualifier == table
9791                || alias.is_some_and(|alias| qualifier == alias)
9792            {
9793                push_policy_column(column, out);
9794            }
9795        }
9796        Projection::Field(
9797            FieldRef::NodeProperty {
9798                alias: qualifier,
9799                property,
9800            },
9801            _,
9802        )
9803        | Projection::Field(
9804            FieldRef::EdgeProperty {
9805                alias: qualifier,
9806                property,
9807            },
9808            _,
9809        ) => {
9810            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
9811                push_policy_column(property, out);
9812            }
9813        }
9814        Projection::Function(_, args) => {
9815            for arg in args {
9816                collect_projection_columns_for_table(arg, table, alias, out);
9817            }
9818        }
9819        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
9820    }
9821}
9822
9823fn collect_projection_columns_for_join_side(
9824    projection: &Projection,
9825    left: Option<&JoinTableSide>,
9826    right: Option<&JoinTableSide>,
9827    out: &mut HashMap<String, BTreeSet<String>>,
9828) -> RedDBResult<()> {
9829    match projection {
9830        Projection::Column(column) | Projection::Alias(column, _) => {
9831            if let Some((qualifier, column)) = split_qualified_column(column) {
9832                push_qualified_join_column(qualifier, column, left, right, out);
9833            } else {
9834                push_unqualified_join_column(column, left, right, out);
9835            }
9836        }
9837        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
9838            if table.is_empty() {
9839                push_unqualified_join_column(column, left, right, out);
9840            } else if let Some(side) = [left, right]
9841                .into_iter()
9842                .flatten()
9843                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
9844            {
9845                push_join_column(&side.table, column, out);
9846            }
9847        }
9848        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
9849        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
9850            push_qualified_join_column(alias, property, left, right, out);
9851        }
9852        Projection::Function(_, args) => {
9853            for arg in args {
9854                collect_projection_columns_for_join_side(arg, left, right, out)?;
9855            }
9856        }
9857        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
9858    }
9859    Ok(())
9860}
9861
/// Split a `qualifier.column` name into its two parts.
///
/// Returns `None` for unqualified names, names with an empty qualifier or
/// column part, and names with more than one dot (e.g. `a.b.c`).
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    match column.split_once('.') {
        Some((qualifier, rest))
            if !qualifier.is_empty() && !rest.is_empty() && !rest.contains('.') =>
        {
            Some((qualifier, rest))
        }
        _ => None,
    }
}
9869
9870fn push_qualified_join_column(
9871    qualifier: &str,
9872    column: &str,
9873    left: Option<&JoinTableSide>,
9874    right: Option<&JoinTableSide>,
9875    out: &mut HashMap<String, BTreeSet<String>>,
9876) {
9877    if let Some(side) = [left, right]
9878        .into_iter()
9879        .flatten()
9880        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
9881    {
9882        push_join_column(&side.table, column, out);
9883    }
9884}
9885
9886fn push_unqualified_join_column(
9887    column: &str,
9888    left: Option<&JoinTableSide>,
9889    right: Option<&JoinTableSide>,
9890    out: &mut HashMap<String, BTreeSet<String>>,
9891) {
9892    for side in [left, right].into_iter().flatten() {
9893        push_join_column(&side.table, column, out);
9894    }
9895}
9896
9897fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
9898    if is_policy_column_name(column) {
9899        out.entry(table.to_string())
9900            .or_default()
9901            .insert(column.to_string());
9902    }
9903}
9904
9905fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
9906    if is_policy_column_name(column) {
9907        out.insert(column.to_string());
9908    }
9909}
9910
/// True when `column` can name a real column for column-policy purposes.
/// Empty names, the `*` wildcard, and the planner's `LIT:`/`TYPE:` markers
/// are excluded.
fn is_policy_column_name(column: &str) -> bool {
    match column {
        "" | "*" => false,
        other => !(other.starts_with("LIT:") || other.starts_with("TYPE:")),
    }
}
9917
9918fn runtime_iam_context(
9919    role: crate::auth::Role,
9920    tenant: Option<&str>,
9921) -> crate::auth::policies::EvalContext {
9922    crate::auth::policies::EvalContext {
9923        principal_tenant: tenant.map(|t| t.to_string()),
9924        current_tenant: tenant.map(|t| t.to_string()),
9925        peer_ip: None,
9926        mfa_present: false,
9927        now_ms: crate::auth::now_ms(),
9928        principal_is_admin_role: role == crate::auth::Role::Admin,
9929    }
9930}
9931
9932fn explicit_table_projection_columns(
9933    query: &crate::storage::query::ast::TableQuery,
9934) -> Vec<String> {
9935    use crate::storage::query::ast::{FieldRef, Projection};
9936
9937    let mut columns = Vec::new();
9938    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
9939        match projection {
9940            Projection::Column(column) | Projection::Alias(column, _) => {
9941                push_unique(&mut columns, column)
9942            }
9943            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
9944                push_unique(&mut columns, column)
9945            }
9946            // SELECT * and expression/function projections need the
9947            // executor-wide column-policy context mapped in
9948            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
9949            _ => {}
9950        }
9951    }
9952    columns
9953}
9954
9955fn explicit_graph_projection_properties(
9956    query: &crate::storage::query::ast::GraphQuery,
9957) -> Vec<String> {
9958    use crate::storage::query::ast::{FieldRef, Projection};
9959
9960    let mut columns = Vec::new();
9961    for projection in &query.return_ {
9962        match projection {
9963            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
9964            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
9965                push_unique(&mut columns, property.clone())
9966            }
9967            _ => {}
9968        }
9969    }
9970    columns
9971}
9972
/// Append `column` unless it is already present, preserving first-seen
/// order. A linear scan is fine for the handful of projection columns
/// involved here.
fn push_unique(columns: &mut Vec<String>, column: String) {
    if !columns.contains(&column) {
        columns.push(column);
    }
}
9978
9979fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
9980    use crate::storage::query::ast::PolicyPrincipalRef;
9981    match p {
9982        PolicyPrincipalRef::User(u) => match &u.tenant {
9983            Some(t) => format!("user:{t}/{}", u.username),
9984            None => format!("user:{}", u.username),
9985        },
9986        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
9987    }
9988}
9989
9990/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
9991/// shape used by every audit emit + the simulator response.
9992pub(crate) fn decision_to_strings(
9993    d: &crate::auth::policies::Decision,
9994) -> (String, Option<String>, Option<String>) {
9995    use crate::auth::policies::Decision;
9996    match d {
9997        Decision::Allow {
9998            matched_policy_id,
9999            matched_sid,
10000        } => (
10001            "allow".into(),
10002            Some(matched_policy_id.clone()),
10003            matched_sid.clone(),
10004        ),
10005        Decision::Deny {
10006            matched_policy_id,
10007            matched_sid,
10008        } => (
10009            "deny".into(),
10010            Some(matched_policy_id.clone()),
10011            matched_sid.clone(),
10012        ),
10013        Decision::DefaultDeny => ("default_deny".into(), None, None),
10014        Decision::AdminBypass => ("admin_bypass".into(), None, None),
10015    }
10016}
10017
10018fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
10019    let mut scopes = Vec::new();
10020    collect_relation_scopes(query, &mut scopes);
10021    scopes.sort();
10022    scopes.dedup();
10023    scopes
10024}
10025
10026fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
10027    match query {
10028        QueryExpr::Table(table) => {
10029            if !table.table.is_empty() {
10030                scopes.push(table.table.clone());
10031            }
10032            if let Some(alias) = &table.alias {
10033                scopes.push(alias.clone());
10034            }
10035        }
10036        QueryExpr::Join(join) => {
10037            collect_relation_scopes(&join.left, scopes);
10038            collect_relation_scopes(&join.right, scopes);
10039        }
10040        _ => {}
10041    }
10042}
10043
10044fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
10045    let inner_scopes = relation_scopes_for_query(query);
10046    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
10047}
10048
/// Walk one query level and report whether any expression in it names a
/// relation from `outer_scopes` that is not shadowed by a relation in
/// `inner_scopes` (i.e. a correlated reference to an enclosing query).
fn query_expr_references_outer_scope(
    query: &QueryExpr,
    outer_scopes: &[String],
    inner_scopes: &[String],
) -> bool {
    match query {
        QueryExpr::Table(table) => {
            // Check every clause that can carry an expression: the select
            // list, WHERE (both `where_expr` and `filter` forms), HAVING
            // (both forms), GROUP BY, and ORDER BY.
            table.select_items.iter().any(|item| match item {
                crate::storage::query::ast::SelectItem::Wildcard => false,
                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                }
            }) || table
                .where_expr
                .as_ref()
                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
                || table.filter.as_ref().is_some_and(|filter| {
                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
                })
                || table.having_expr.as_ref().is_some_and(|expr| {
                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                })
                || table.having.as_ref().is_some_and(|filter| {
                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
                })
                || table
                    .group_by_exprs
                    .iter()
                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
                || table.order_by.iter().any(|clause| {
                    clause.expr.as_ref().is_some_and(|expr| {
                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                    })
                })
        }
        QueryExpr::Join(join) => {
            // Joins recurse into both sides, then check the join filter and
            // the returned items.
            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
                || join.filter.as_ref().is_some_and(|filter| {
                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
                })
                || join.return_items.iter().any(|item| match item {
                    crate::storage::query::ast::SelectItem::Wildcard => false,
                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                    }
                })
        }
        // Other query kinds carry no table-qualified expressions to inspect.
        _ => false,
    }
}
10100
10101fn filter_references_outer_scope(
10102    filter: &crate::storage::query::ast::Filter,
10103    outer_scopes: &[String],
10104    inner_scopes: &[String],
10105) -> bool {
10106    use crate::storage::query::ast::Filter;
10107    match filter {
10108        Filter::Compare { field, .. }
10109        | Filter::IsNull(field)
10110        | Filter::IsNotNull(field)
10111        | Filter::In { field, .. }
10112        | Filter::Between { field, .. }
10113        | Filter::Like { field, .. }
10114        | Filter::StartsWith { field, .. }
10115        | Filter::EndsWith { field, .. }
10116        | Filter::Contains { field, .. } => {
10117            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
10118        }
10119        Filter::CompareFields { left, right, .. } => {
10120            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
10121                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
10122        }
10123        Filter::CompareExpr { lhs, rhs, .. } => {
10124            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
10125                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
10126        }
10127        Filter::And(left, right) | Filter::Or(left, right) => {
10128            filter_references_outer_scope(left, outer_scopes, inner_scopes)
10129                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
10130        }
10131        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
10132    }
10133}
10134
/// Recursively scan an expression for a column reference that resolves to
/// `outer_scopes` rather than `inner_scopes` (correlation detection for
/// subquery planning).
fn expr_references_outer_scope(
    expr: &crate::storage::query::ast::Expr,
    outer_scopes: &[String],
    inner_scopes: &[String],
) -> bool {
    use crate::storage::query::ast::Expr;
    match expr {
        // Leaf case: only column references can name an outer relation.
        Expr::Column { field, .. } => {
            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
        }
        Expr::BinaryOp { lhs, rhs, .. } => {
            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
        }
        // Single-operand wrappers all recurse into their inner expression.
        Expr::UnaryOp { operand, .. }
        | Expr::Cast { inner: operand, .. }
        | Expr::IsNull { operand, .. } => {
            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
        }
        Expr::FunctionCall { args, .. } => args
            .iter()
            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
        Expr::Case {
            branches, else_, ..
        } => {
            branches.iter().any(|(cond, value)| {
                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
            }) || else_
                .as_ref()
                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
        }
        Expr::InList { target, values, .. } => {
            expr_references_outer_scope(target, outer_scopes, inner_scopes)
                || values
                    .iter()
                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
        }
        Expr::Between {
            target, low, high, ..
        } => {
            expr_references_outer_scope(target, outer_scopes, inner_scopes)
                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
        }
        // A nested subquery sees the current inner scopes as *its* outer
        // scopes. NOTE(review): the original `outer_scopes` list is dropped
        // here, so a doubly-nested subquery referencing a grandparent scope
        // would not be flagged as correlated — confirm this is intentional.
        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
        // Literals and parameters carry no column references.
        Expr::Literal { .. } | Expr::Parameter { .. } => false,
    }
}
10184
10185fn field_ref_references_outer_scope(
10186    field: &crate::storage::query::ast::FieldRef,
10187    outer_scopes: &[String],
10188    inner_scopes: &[String],
10189) -> bool {
10190    match field {
10191        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
10192            outer_scopes.iter().any(|scope| scope == table)
10193                && !inner_scopes.iter().any(|scope| scope == table)
10194        }
10195        _ => false,
10196    }
10197}
10198
10199fn first_column_values(
10200    result: crate::storage::query::unified::UnifiedResult,
10201) -> RedDBResult<Vec<Value>> {
10202    if result.columns.len() > 1 {
10203        return Err(RedDBError::Query(
10204            "expression subquery must return exactly one column".to_string(),
10205        ));
10206    }
10207    let fallback_column = result
10208        .records
10209        .first()
10210        .and_then(|record| record.column_names().into_iter().next())
10211        .map(|name| name.to_string());
10212    let column = result.columns.first().cloned().or(fallback_column);
10213    let Some(column) = column else {
10214        return Ok(Vec::new());
10215    };
10216    Ok(result
10217        .records
10218        .iter()
10219        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
10220        .collect())
10221}
10222
10223fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
10224    // Bare integer ms.
10225    if let Ok(n) = s.parse::<u128>() {
10226        return Some(n);
10227    }
10228    // Fallback: ISO-8601 like 2030-01-02 03:04:05 — accept the date
10229    // portion only (midnight UTC). Full RFC3339 parsing is a stretch
10230    // goal; the common case is `'2030-01-01'`.
10231    if let Some(date) = s.split_whitespace().next() {
10232        let parts: Vec<&str> = date.split('-').collect();
10233        if parts.len() == 3 {
10234            let (y, m, d) = (parts[0], parts[1], parts[2]);
10235            if let (Ok(y), Ok(m), Ok(d)) = (y.parse::<i64>(), m.parse::<u32>(), d.parse::<u32>()) {
10236                // Days since 1970-01-01 — simple Julian arithmetic
10237                // suitable for years 1970-2100. Good enough for test
10238                // fixtures; precise parsing lands when we wire chrono.
10239                let days_in = days_from_civil(y, m, d);
10240                return Some((days_in as u128) * 86_400_000u128);
10241            }
10242        }
10243    }
10244    None
10245}
10246
/// Days from Unix epoch using H. Hinnant's civil-from-days algorithm.
/// Robust for the entire Gregorian range; used by `parse_timestamp_to_ms`.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    // Shift to a March-based year so the leap day falls at year end.
    let year = if m <= 2 { y - 1 } else { y };
    // 400-year Gregorian era and year-of-era; Euclidean division gives the
    // same floor semantics the hand-rolled `(y - 399) / 400` branch did.
    let era = year.div_euclid(400);
    let yoe = year.rem_euclid(400) as u64; // [0, 399]
    // Day-of-year in the March calendar, then day-of-era.
    let shifted_month = u64::from(if m > 2 { m - 3 } else { m + 9 });
    let doy = (153 * shifted_month + 2) / 5 + u64::from(d) - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    // 719468 = days from 0000-03-01 to 1970-01-01.
    era * 146097 + doe as i64 - 719468
}
10257
10258fn walk_plan_node(
10259    node: &crate::storage::query::planner::CanonicalLogicalNode,
10260    depth: usize,
10261    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
10262) {
10263    use std::sync::Arc;
10264    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
10265    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
10266    rec.set_arc(
10267        Arc::from("source"),
10268        node.source.clone().map(Value::text).unwrap_or(Value::Null),
10269    );
10270    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
10271    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
10272    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
10273    out.push(rec);
10274    for child in &node.children {
10275        walk_plan_node(child, depth + 1, out);
10276    }
10277}