reddb_server/runtime/impl_core.rs

use super::*;
use crate::application::entity::metadata_to_json;
use crate::auth::column_policy_gate::ColumnAccessRequest;
use crate::auth::UserId;
use crate::replication::cdc::ChangeRecord;
use crate::replication::logical::{ApplyMode, LogicalChangeApplier};
use crate::storage::query::ast::TableSource;

thread_local! {
    /// Current connection id for the executing statement. Set by the
    /// per-connection wrapper (stdio/gRPC handlers) before dispatching
    /// into `execute_query`; falls back to `0` for embedded callers.
    static CURRENT_CONN_ID: std::cell::Cell<u64> = const { std::cell::Cell::new(0) };

    /// Authenticated user + role for the executing statement (Phase 2.5.2
    /// RLS enforcement). Set by the transport middleware after validating
    /// credentials (password / cert / oauth); unset means "anonymous" /
    /// "embedded" — RLS policies degrade to the role-agnostic subset.
    ///
    /// `None` skips RLS injection entirely; `Some((username, role))`
    /// passes `role` to `matching_rls_policies(table, Some(role), action)`.
    static CURRENT_AUTH_IDENTITY: std::cell::RefCell<Option<(String, crate::auth::Role)>> =
        const { std::cell::RefCell::new(None) };

    /// MVCC snapshot scoped to the currently-executing statement (Phase
    /// 2.3.2d PG parity). `execute_query` captures it on entry and drops
    /// it on exit; every scan consults it via
    /// `entity_visible_under_current_snapshot` to hide tuples whose xmin
    /// hasn't committed or whose xmax already has.
    ///
    /// `None` means "pre-MVCC semantics" — the read path returns every
    /// tuple regardless of xmin/xmax. All embedded callers that bypass
    /// `execute_query` see this default.
    static CURRENT_SNAPSHOT: std::cell::RefCell<Option<SnapshotContext>> =
        const { std::cell::RefCell::new(None) };

    /// Cheap presence flag for `CURRENT_SNAPSHOT`. Scan hot paths
    /// poll this instead of `borrow()`-ing the RefCell on every
    /// row — the common case (autocommit / no MVCC session) reads
    /// one thread-local `Cell<bool>` and short-circuits, saving
    /// ~10ns × N rows on aggregate_group / select_range scans.
    static HAS_SNAPSHOT: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };

    /// Session-scoped tenant id for the current connection (Phase 2.5.3
    /// multi-tenancy). Populated by `SET TENANT 'id'` or by transport
    /// middleware after resolving tenant from auth claims. Read by the
    /// `CURRENT_TENANT()` scalar function — RLS policies typically
    /// combine it as `USING (tenant_id = CURRENT_TENANT())` to scope
    /// every query to one tenant.
    ///
    /// `None` means "no tenant bound" — `CURRENT_TENANT()` returns
    /// NULL, and RLS policies that gate on it hide every row.
    static CURRENT_TENANT_ID: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local config resolver. SQL expressions materialize the
    /// `red_config` snapshot lazily on the first `$config.*`/`CONFIG()`
    /// access, keeping ordinary statements on the zero-scan path.
    static CURRENT_CONFIG_RESOLVER: std::cell::RefCell<Option<ConfigResolver>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local secret resolver. SQL expressions materialize the
    /// vault KV snapshot lazily on first `$secret.*` access, then use
    /// lock-free map reads for the rest of the statement.
    static CURRENT_SECRET_RESOLVER: std::cell::RefCell<Option<SecretResolver>> =
        const { std::cell::RefCell::new(None) };
}

fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
    match value {
        Value::Text(s) => Ok(s.to_string()),
        Value::Integer(n) => Ok(n.to_string()),
        Value::UnsignedInteger(n) => Ok(n.to_string()),
        Value::Float(n) => Ok(n.to_string()),
        Value::Boolean(b) => Ok(b.to_string()),
        Value::Null => Err(RedDBError::Query(
            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
                .to_string(),
        )),
        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
                .to_string(),
        )),
        _ => Err(RedDBError::Query(format!(
            "SET SECRET does not support value type {:?} yet",
            value.data_type()
        ))),
    }
}

fn system_keyed_collection_contract(
    name: &str,
    model: crate::catalog::CollectionModel,
) -> crate::physical::CollectionContract {
    let now = crate::utils::now_unix_millis() as u128;
    crate::physical::CollectionContract {
        name: name.to_string(),
        declared_model: model,
        schema_mode: crate::catalog::SchemaMode::Dynamic,
        origin: crate::physical::ContractOrigin::Implicit,
        version: 1,
        created_at_unix_ms: now,
        updated_at_unix_ms: now,
        default_ttl_ms: None,
        vector_dimension: None,
        vector_metric: None,
        context_index_fields: Vec::new(),
        declared_columns: Vec::new(),
        table_def: None,
        timestamps_enabled: false,
        context_index_enabled: false,
        metrics_raw_retention_ms: None,
        metrics_rollup_policies: Vec::new(),
        metrics_tenant_identity: None,
        metrics_namespace: None,
        append_only: false,
        subscriptions: Vec::new(),
    }
}

/// Snapshot + manager pair used for read-path visibility checks.
///
/// The manager is needed in addition to the snapshot because `aborted`
/// state mutates after the snapshot is captured — a writer that rolls
/// back after capture must still have its tuples hidden, which the
/// frozen snapshot alone cannot express. Keeping the Arc around is
/// O(pointer) and the RwLock reads on `is_aborted` are cheap (HashSet
/// lookup under a parking_lot read guard).
///
/// `own_xids` (Phase 2.3.2e) lists the xids belonging to the current
/// connection's transaction — the parent xid plus open and released
/// savepoint sub-xids. The visibility rule promotes rows stamped with
/// these xids to "always visible (unless aborted)" so the writer sees
/// its own nested-savepoint writes even though their xids exceed
/// `snapshot.xid`.
#[derive(Clone)]
pub struct SnapshotContext {
    pub snapshot: crate::storage::transaction::snapshot::Snapshot,
    pub manager: Arc<crate::storage::transaction::snapshot::SnapshotManager>,
    pub own_xids: std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
    pub requires_index_fallback: bool,
}

/// Install a connection id on the current thread for the duration of a
/// statement. Transaction state (`RuntimeInner::tx_contexts`) is keyed
/// by this id so different connections can hold independent BEGINs.
///
/// Pub so transports (PG wire, gRPC, HTTP per-request spawners) and
/// tests can emulate per-connection isolation. Call it once when
/// binding the connection's worker thread; pair with
/// `clear_current_connection_id` on teardown.
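///
/// A minimal usage sketch (hypothetical transport loop; the handler
/// names are illustrative, not real APIs):
///
/// ```ignore
/// // Bind the id for this connection's worker thread, dispatch, and
/// // clear on teardown so a pooled thread cannot leak transaction
/// // state into the next connection it services.
/// set_current_connection_id(conn_id);
/// let result = runtime.execute_query(sql);
/// clear_current_connection_id();
/// ```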
pub fn set_current_connection_id(id: u64) {
    CURRENT_CONN_ID.with(|c| c.set(id));
}

/// Reset the thread's connection id back to `0` (autocommit).
pub fn clear_current_connection_id() {
    CURRENT_CONN_ID.with(|c| c.set(0));
}

/// Read the connection id set by `set_current_connection_id`. Returns
/// `0` when no wrapper installed one — auto-commit path.
pub fn current_connection_id() -> u64 {
    CURRENT_CONN_ID.with(|c| c.get())
}

/// Install the authenticated identity for the current thread (Phase 2.5.2
/// RLS enforcement). Transport layers call this right after resolving
/// auth so the query dispatch can fold RLS policies into the filter.
pub fn set_current_auth_identity(username: String, role: crate::auth::Role) {
    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = Some((username, role)));
}

/// Clear the thread-local auth identity. Transports call this after the
/// statement completes so pooled threads don't leak identities across
/// requests.
pub fn clear_current_auth_identity() {
    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = None);
}

/// Read the current-thread auth identity. `None` when no transport
/// installed one (embedded mode / anonymous access).
pub(crate) fn current_auth_identity() -> Option<(String, crate::auth::Role)> {
    CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone())
}

/// Install the session tenant id for the current thread (Phase 2.5.3
/// multi-tenancy). Called by `SET TENANT 'id'` dispatch and by
/// transport middleware that resolves tenant from auth claims (e.g.
/// JWT `tenant` claim, HTTP header, subdomain).
pub fn set_current_tenant(tenant_id: String) {
    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = Some(tenant_id));
}

/// Clear the current-thread tenant — `CURRENT_TENANT()` will then
/// return NULL and any RLS policy gated on it will hide every row.
pub fn clear_current_tenant() {
    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = None);
}

/// Read the current-thread tenant id, applying overrides in priority order:
///   1. `WITHIN TENANT '<id>' …` per-statement override (highest)
///   2. `SET LOCAL TENANT '<id>'` transaction-local override (consulted
///      only when the current connection has an open transaction)
///   3. `SET TENANT '<id>'` session-level thread-local
///   4. `None` (deny-default for RLS).
///
/// The transaction-local layer is read through the runtime; an embedded
/// helper crate that has no `RedDBRuntime` access still gets correct
/// behaviour for layers 1, 3, and 4.
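///
/// An illustrative layering (tenant ids are made up):
///
/// ```ignore
/// // SET TENANT 'acme';              -- layer 3 (session)
/// // BEGIN; SET LOCAL TENANT 'beta'; -- layer 2 (until COMMIT/ROLLBACK)
/// // WITHIN TENANT 'gamma' SELECT …; -- layer 1 wins for that statement
/// assert_eq!(current_tenant().as_deref(), Some("gamma"));
/// ```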
pub fn current_tenant() -> Option<String> {
    let inherited = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
    if let Some(over) = current_scope_override() {
        if over.tenant.is_active() {
            return over.tenant.resolve(inherited);
        }
    }
    if let Some(tx_local) = current_tx_local_tenant() {
        return tx_local;
    }
    inherited
}

thread_local! {
    /// Snapshot of the active connection's `tx_local_tenants` entry for
    /// the current `execute_query` call. Outer `Some(_)` means "a
    /// transaction-local tenant override is active for this call";
    /// inner is the override's value (`Some(s)` overrides to `s`,
    /// `None` overrides to NULL/cleared). Refreshed at the top of every
    /// `execute_query` invocation and cleared by the RAII guard on
    /// return so pooled connections cannot leak the override past the
    /// statement that owns it.
    static TX_LOCAL_TENANT: std::cell::RefCell<Option<Option<String>>> =
        const { std::cell::RefCell::new(None) };
}

fn current_tx_local_tenant() -> Option<Option<String>> {
    TX_LOCAL_TENANT.with(|cell| cell.borrow().clone())
}

/// Recognise `SET LOCAL TENANT '<id>'` / `SET LOCAL TENANT NULL` —
/// returns `Ok(Some(Some(id)))` for an explicit value, `Ok(Some(None))`
/// for an explicit NULL clear, `Ok(None)` when the input is not a
/// `SET LOCAL TENANT` statement at all, and `Err` when the prefix
/// matches but the value is malformed.
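///
/// Illustrative inputs and their parses:
///
/// ```ignore
/// parse_set_local_tenant("SET LOCAL TENANT 'acme';"); // Ok(Some(Some("acme")))
/// parse_set_local_tenant("set local tenant NULL");    // Ok(Some(None))
/// parse_set_local_tenant("SET TENANT 'acme'");        // Ok(None): not SET LOCAL
/// parse_set_local_tenant("SET LOCAL TENANT acme");    // Err: unquoted value
/// ```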
fn parse_set_local_tenant(query: &str) -> RedDBResult<Option<Option<String>>> {
    let mut tokens = query.split_ascii_whitespace();
    let Some(w1) = tokens.next() else {
        return Ok(None);
    };
    if !w1.eq_ignore_ascii_case("SET") {
        return Ok(None);
    }
    let Some(w2) = tokens.next() else {
        return Ok(None);
    };
    if !w2.eq_ignore_ascii_case("LOCAL") {
        return Ok(None);
    }
    let Some(w3) = tokens.next() else {
        return Ok(None);
    };
    if !w3.eq_ignore_ascii_case("TENANT") {
        return Ok(None);
    }
    let rest: String = tokens.collect::<Vec<_>>().join(" ");
    let rest = rest.trim().trim_end_matches(';').trim();
    let value_str = rest.strip_prefix('=').map(|s| s.trim()).unwrap_or(rest);
    if value_str.is_empty() {
        return Err(RedDBError::Query(
            "SET LOCAL TENANT expects a string literal or NULL".to_string(),
        ));
    }
    if value_str.eq_ignore_ascii_case("NULL") {
        return Ok(Some(None));
    }
    if value_str.starts_with('\'') && value_str.ends_with('\'') && value_str.len() >= 2 {
        let inner = &value_str[1..value_str.len() - 1];
        return Ok(Some(Some(inner.to_string())));
    }
    Err(RedDBError::Query(format!(
        "SET LOCAL TENANT expects a string literal or NULL, got `{value_str}`"
    )))
}

pub(crate) struct TxLocalTenantGuard;

impl TxLocalTenantGuard {
    pub fn install(value: Option<Option<String>>) -> Self {
        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = value);
        Self
    }
}

impl Drop for TxLocalTenantGuard {
    fn drop(&mut self) {
        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = None);
    }
}

thread_local! {
    /// Stack of `WITHIN ... <stmt>` overrides active on the current
    /// thread. Every entry corresponds to one in-flight `execute_query`
    /// call that started with a `WITHIN` prefix; the entry is pushed
    /// before dispatch and popped before the call returns. The stack
    /// shape supports nested invocations (e.g. a view body that itself
    /// re-enters execute_query).
    static SCOPE_OVERRIDES: std::cell::RefCell<Vec<crate::runtime::within_clause::ScopeOverride>> =
        const { std::cell::RefCell::new(Vec::new()) };
}

pub(crate) fn push_scope_override(over: crate::runtime::within_clause::ScopeOverride) {
    SCOPE_OVERRIDES.with(|cell| cell.borrow_mut().push(over));
}

pub(crate) fn pop_scope_override() {
    SCOPE_OVERRIDES.with(|cell| {
        cell.borrow_mut().pop();
    });
}

pub(crate) fn current_scope_override() -> Option<crate::runtime::within_clause::ScopeOverride> {
    SCOPE_OVERRIDES.with(|cell| cell.borrow().last().cloned())
}

/// Cheap probe: is any `WITHIN …` scope override active on this
/// thread? The fast-path needs to know without paying for the full
/// `.last().cloned()` allocation — just peek at stack length.
pub(crate) fn has_scope_override_active() -> bool {
    SCOPE_OVERRIDES.with(|cell| !cell.borrow().is_empty())
}

/// RAII guard pairing `push_scope_override` with the matching pop, so
/// the stack stays balanced even when the inner `execute_query` returns
/// early via `?`.
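///
/// A minimal usage sketch (the dispatch call is illustrative):
///
/// ```ignore
/// let _guard = ScopeOverrideGuard::install(parsed_override);
/// // Any `?` early-return below still pops the override on drop.
/// let result = dispatch_inner(runtime, stmt)?;
/// ```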
pub(crate) struct ScopeOverrideGuard;

impl ScopeOverrideGuard {
    pub fn install(over: crate::runtime::within_clause::ScopeOverride) -> Self {
        push_scope_override(over);
        Self
    }
}

impl Drop for ScopeOverrideGuard {
    fn drop(&mut self) {
        pop_scope_override();
    }
}

/// Read the current-thread auth identity, honouring per-statement
/// `WITHIN ... USER '<u>' AS ROLE '<r>'` overrides. The override only
/// supplies projected strings — it never grants additional privilege —
/// so callers that need to make authorisation decisions must read from
/// the underlying `current_auth_identity()` directly.
pub(crate) fn current_user_projected() -> Option<String> {
    let inherited = current_auth_identity().map(|(u, _)| u);
    if let Some(over) = current_scope_override() {
        if over.user.is_active() {
            return over.user.resolve(inherited);
        }
    }
    inherited
}

pub(crate) fn current_role_projected() -> Option<String> {
    let inherited = current_auth_identity().map(|(_, r)| format!("{r:?}").to_lowercase());
    if let Some(over) = current_scope_override() {
        if over.role.is_active() {
            return over.role.resolve(inherited);
        }
    }
    inherited
}

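/// Resolve a `$secret.*` path from the statement-local vault snapshot.
/// Lookup is case-insensitive; a `red.vault/<rest>` path also falls
/// back to the bare `<rest>` key and then to `red.secret.<rest>`.
///
/// A sketch of the fallback order (keys illustrative):
///
/// ```ignore
/// // Snapshot contains {"red.secret.api_key": "…"}:
/// current_secret_value("red.vault/api_key"); // hit via red.secret.* fallback
/// current_secret_value("api_key");           // miss: bare key not present
/// ```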
pub(crate) fn current_secret_value(path: &str) -> Option<String> {
    let key = path.to_ascii_lowercase();
    CURRENT_SECRET_RESOLVER.with(|cell| {
        let mut resolver = cell.borrow_mut();
        let resolver = resolver.as_mut()?;
        if resolver.values.is_none() {
            resolver.values = resolver
                .store
                .as_ref()
                .map(|store| store.vault_kv_snapshot());
        }
        let values = resolver.values.as_ref()?;
        values.get(&key).cloned().or_else(|| {
            key.strip_prefix("red.vault/").and_then(|rest| {
                values
                    .get(rest)
                    .cloned()
                    .or_else(|| values.get(&format!("red.secret.{rest}")).cloned())
            })
        })
    })
}

struct SecretResolver {
    store: Option<Arc<crate::auth::store::AuthStore>>,
    values: Option<HashMap<String, String>>,
}

pub(super) struct SecretStoreGuard {
    previous: Option<SecretResolver>,
}

impl SecretStoreGuard {
    pub(super) fn install(store: Option<Arc<crate::auth::store::AuthStore>>) -> Self {
        let previous = CURRENT_SECRET_RESOLVER.with(|cell| {
            cell.replace(Some(SecretResolver {
                store,
                values: None,
            }))
        });
        Self { previous }
    }
}

impl Drop for SecretStoreGuard {
    fn drop(&mut self) {
        let previous = self.previous.take();
        CURRENT_SECRET_RESOLVER.with(|cell| {
            cell.replace(previous);
        });
    }
}

pub(crate) fn current_config_value(path: &str) -> Option<Value> {
    let key = path.to_ascii_lowercase();
    CURRENT_CONFIG_RESOLVER.with(|cell| {
        let mut resolver = cell.borrow_mut();
        let resolver = resolver.as_mut()?;
        if resolver.values.is_none() {
            resolver.values = Some(latest_config_snapshot(&resolver.db));
        }
        let values = resolver.values.as_ref()?;
        values.get(&key).cloned().or_else(|| {
            key.strip_prefix("red.config/")
                .and_then(|rest| values.get(&format!("red.config.{rest}")).cloned())
        })
    })
}

fn update_current_config_value(path: &str, value: Value) {
    let key = path.to_ascii_lowercase();
    CURRENT_CONFIG_RESOLVER.with(|cell| {
        if let Some(resolver) = cell.borrow_mut().as_mut() {
            if let Some(values) = resolver.values.as_mut() {
                values.insert(key, value);
            }
        }
    });
}

fn update_current_secret_value(path: &str, value: Option<String>) {
    let key = path.to_ascii_lowercase();
    CURRENT_SECRET_RESOLVER.with(|cell| {
        if let Some(resolver) = cell.borrow_mut().as_mut() {
            let Some(values) = resolver.values.as_mut() else {
                return;
            };
            match value {
                Some(value) => {
                    values.insert(key, value);
                }
                None => {
                    values.remove(&key);
                }
            }
        }
    });
}

fn latest_config_snapshot(db: &RedDB) -> HashMap<String, Value> {
    let mut latest: HashMap<String, (u64, Value)> = HashMap::new();

    if let Some(manager) = db.store().get_collection("red_config") {
        manager.for_each_entity(|entity| {
            let Some(row) = entity.data.as_row() else {
                return true;
            };
            let Some(Value::Text(key)) = row.get_field("key") else {
                return true;
            };
            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
            let id = entity.id.raw();
            let key = key.to_ascii_lowercase();
            insert_latest_config_value(&mut latest, key.clone(), id, value.clone());
            if let Some(rest) = key.strip_prefix("red.config.") {
                insert_latest_config_value(&mut latest, format!("red.config/{rest}"), id, value);
            }
            true
        });
    }

    if let Some(manager) = db.store().get_collection("red.config") {
        manager.for_each_entity(|entity| {
            let Some(row) = entity.data.as_row() else {
                return true;
            };
            if matches!(row.get_field("tombstone"), Some(Value::Boolean(true))) {
                return true;
            }
            let Some(Value::Text(key)) = row.get_field("key") else {
                return true;
            };
            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
            insert_latest_config_value(
                &mut latest,
                format!("red.config/{}", key.to_ascii_lowercase()),
                entity.id.raw(),
                value,
            );
            true
        });
    }

    latest
        .into_iter()
        .map(|(key, (_, value))| (key, value))
        .collect()
}

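/// "Highest entity id wins" per key — assuming entity ids increase
/// monotonically with write order, this approximates last-write-wins
/// without needing timestamps:
///
/// ```ignore
/// // insert (k, id=3, "a") then (k, id=7, "b") → snapshot holds "b";
/// // a later replay with id=5 is ignored because 7 > 5.
/// ```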
fn insert_latest_config_value(
    latest: &mut HashMap<String, (u64, Value)>,
    key: String,
    id: u64,
    value: Value,
) {
    match latest.get(&key) {
        Some((prev_id, _)) if *prev_id > id => {}
        _ => {
            latest.insert(key, (id, value));
        }
    }
}

struct ConfigResolver {
    db: Arc<RedDB>,
    values: Option<HashMap<String, Value>>,
}

pub(super) struct ConfigSnapshotGuard {
    previous: Option<ConfigResolver>,
}

impl ConfigSnapshotGuard {
    pub(super) fn install(db: Arc<RedDB>) -> Self {
        let previous = CURRENT_CONFIG_RESOLVER
            .with(|cell| cell.replace(Some(ConfigResolver { db, values: None })));
        Self { previous }
    }
}

impl Drop for ConfigSnapshotGuard {
    fn drop(&mut self) {
        let previous = self.previous.take();
        CURRENT_CONFIG_RESOLVER.with(|cell| {
            cell.replace(previous);
        });
    }
}

/// Install the MVCC snapshot used by the current thread for the duration
/// of one statement. Paired with `clear_current_snapshot()` — callers
/// should prefer the `CurrentSnapshotGuard` RAII wrapper so early returns
/// still clean up.
pub fn set_current_snapshot(ctx: SnapshotContext) {
    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = Some(ctx));
    HAS_SNAPSHOT.with(|c| c.set(true));
}

pub fn clear_current_snapshot() {
    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = None);
    HAS_SNAPSHOT.with(|c| c.set(false));
}

/// Drop-guard that restores the previous snapshot on scope exit. Safe to
/// nest — each statement saves the caller's snapshot and puts it back
/// instead of blindly clearing, so a top-level `execute_query` called
/// from inside another statement dispatch (e.g. vector source subqueries)
/// doesn't strip visibility from the outer scan.
pub(crate) struct CurrentSnapshotGuard {
    previous: Option<SnapshotContext>,
}

impl CurrentSnapshotGuard {
    pub(crate) fn install(ctx: SnapshotContext) -> Self {
        let previous = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
        set_current_snapshot(ctx);
        Self { previous }
    }
}

impl Drop for CurrentSnapshotGuard {
    fn drop(&mut self) {
        let prev = self.previous.take();
        let has = prev.is_some();
        CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = prev);
        HAS_SNAPSHOT.with(|c| c.set(has));
    }
}

/// Is this entity visible under the current thread's MVCC snapshot?
///
/// When no snapshot is installed, only superseded physical versions
/// (`xmax != 0`) are filtered out — embedded callers and operations
/// that intentionally bypass MVCC (VACUUM, snapshot export, admin
/// introspection) take this path.
///
/// When a snapshot is installed the result is
///   `!mgr.is_aborted(xmin) && snapshot.sees(xmin, effective_xmax)`
/// (plus the own-transaction overrides in `visibility_check`), where
/// `effective_xmax` treats an xmax stamped by an aborted deleter as
/// unset, re-granting visibility for tuples whose deleting transaction
/// rolled back.
#[inline]
pub fn entity_visible_under_current_snapshot(
    entity: &crate::storage::unified::entity::UnifiedEntity,
) -> bool {
    // Fast path — one `Cell<bool>` read, no RefCell borrow. Autocommit
    // reads (no active MVCC transaction) still hide superseded physical
    // versions while avoiding a full snapshot-context lookup.
    // This runs on every row of every scan; the slow path only fires
    // inside an explicit transaction.
    if !HAS_SNAPSHOT.with(|c| c.get()) {
        return entity.xmax == 0;
    }
    CURRENT_SNAPSHOT.with(|cell| {
        let guard = cell.borrow();
        let Some(ctx) = guard.as_ref() else {
            return true;
        };
        visibility_check(ctx, entity.xmin, entity.xmax)
    })
}

/// Direct visibility check from raw `(xmin, xmax)` — bypasses the
/// entity borrow for callers that already decomposed the tuple (e.g.
/// pre-materialized scan caches). Matches
/// `entity_visible_under_current_snapshot` when a snapshot is
/// installed; with none installed it returns `true` unconditionally,
/// with no `xmax == 0` check on this path.
#[inline]
pub(crate) fn xids_visible_under_current_snapshot(xmin: u64, xmax: u64) -> bool {
    if !HAS_SNAPSHOT.with(|c| c.get()) {
        return true;
    }
    CURRENT_SNAPSHOT.with(|cell| {
        let guard = cell.borrow();
        let Some(ctx) = guard.as_ref() else {
            return true;
        };
        visibility_check(ctx, xmin, xmax)
    })
}

/// Clone the current thread's snapshot context. Parallel scan paths
/// (`query_all_zoned` with `std::thread::scope`) call this on the main
/// thread *before* spawning workers so the captured `SnapshotContext`
/// can be moved into every worker closure. Worker threads do not
/// inherit thread-locals, so calling `entity_visible_under_current_snapshot`
/// from inside a spawned closure would silently skip the filter.
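///
/// A sketch of the parallel-scan pattern (worker body illustrative):
///
/// ```ignore
/// let snap = capture_current_snapshot(); // on the dispatching thread
/// std::thread::scope(|s| {
///     s.spawn(|| {
///         // Thread-locals don't cross the spawn boundary, so check
///         // against the captured context instead.
///         let visible = entity_visible_with_context(snap.as_ref(), &entity);
///     });
/// });
/// ```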
pub fn capture_current_snapshot() -> Option<SnapshotContext> {
    CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone())
}

/// Whether the active read snapshot may need historical tuple versions
/// that the current secondary indexes cannot prove. Index paths can still
/// recheck visible candidates, but only a heap scan can discover versions
/// whose indexed value was changed or deleted after this snapshot.
pub(crate) fn current_snapshot_requires_index_fallback() -> bool {
    if !HAS_SNAPSHOT.with(|c| c.get()) {
        return false;
    }
    CURRENT_SNAPSHOT.with(|cell| {
        cell.borrow()
            .as_ref()
            .is_some_and(|ctx| ctx.requires_index_fallback)
    })
}

/// Frozen MVCC + identity context for callers that need to reinstall
/// the same view across thread-local boundaries — long-lived cursors,
/// background batchers, anything that detaches from the dispatch path
/// and re-enters later.
///
/// The bundle bakes in the three thread-locals every read path
/// consults: `SnapshotContext` (MVCC visibility), the auth identity
/// (RLS policy gate), and the tenant id (RLS scope). A FETCH that
/// reinstalls the bundle sees exactly the same rows as the DECLARE
/// would have, regardless of writes that landed in between.
///
/// Cheap to clone — `SnapshotContext` carries an `Arc`-backed manager
/// plus a small snapshot, xid set, and flag; identity is a
/// `(String, Role)`; tenant is a `String`. None of these contend with
/// the read path.
#[derive(Clone, Default)]
pub struct SnapshotBundle {
    pub snapshot: Option<SnapshotContext>,
    pub auth: Option<(String, crate::auth::Role)>,
    pub tenant: Option<String>,
}

/// Capture the three read-path thread-locals into a `SnapshotBundle`.
/// Pairs with `with_snapshot_bundle` for re-entry.
pub fn snapshot_bundle() -> SnapshotBundle {
    SnapshotBundle {
        snapshot: capture_current_snapshot(),
        auth: current_auth_identity(),
        tenant: CURRENT_TENANT_ID.with(|cell| cell.borrow().clone()),
    }
}

/// Reinstall a captured `SnapshotBundle` for the duration of `f`.
/// Restores the caller's previous thread-locals on exit (panic-safe via
/// the explicit guard struct so a panic in `f` cannot leak the
/// installed identity into the worker's next request).
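///
/// A cursor-style sketch (the DECLARE/FETCH split is illustrative):
///
/// ```ignore
/// // At DECLARE time, on the dispatching thread:
/// let bundle = snapshot_bundle();
/// // Later, on whichever thread services the FETCH:
/// let rows = with_snapshot_bundle(&bundle, || runtime.execute_query(fetch_sql));
/// ```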
pub fn with_snapshot_bundle<R>(bundle: &SnapshotBundle, f: impl FnOnce() -> R) -> R {
    struct Guard {
        prev_snapshot: Option<SnapshotContext>,
        prev_auth: Option<(String, crate::auth::Role)>,
        prev_tenant: Option<String>,
    }
    impl Drop for Guard {
        fn drop(&mut self) {
            let snap = self.prev_snapshot.take();
            let has = snap.is_some();
            CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = snap);
            HAS_SNAPSHOT.with(|c| c.set(has));
            CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = self.prev_auth.take());
            CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = self.prev_tenant.take());
        }
    }

    let _guard = {
        let prev_snapshot = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
        let prev_auth = CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone());
        let prev_tenant = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());

        match bundle.snapshot.clone() {
            Some(ctx) => set_current_snapshot(ctx),
            None => clear_current_snapshot(),
        }
        CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = bundle.auth.clone());
        CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = bundle.tenant.clone());

        Guard {
            prev_snapshot,
            prev_auth,
            prev_tenant,
        }
    };
    f()
}

/// Apply the same visibility rules used by the thread-local helpers
/// against a caller-provided context. Intended for parallel workers
/// that captured the snapshot with `capture_current_snapshot()`.
#[inline]
pub fn entity_visible_with_context(
    ctx: Option<&SnapshotContext>,
    entity: &crate::storage::unified::entity::UnifiedEntity,
) -> bool {
    match ctx {
        Some(ctx) => visibility_check(ctx, entity.xmin, entity.xmax),
        None => true,
    }
}

fn table_row_index_fields(
    entity: &crate::storage::unified::entity::UnifiedEntity,
) -> Vec<(String, crate::storage::schema::Value)> {
    let crate::storage::EntityData::Row(row) = &entity.data else {
        return Vec::new();
    };
    if let Some(named) = &row.named {
        return named
            .iter()
            .map(|(name, value)| (name.clone(), value.clone()))
            .collect();
    }
    if let Some(schema) = &row.schema {
        return schema
            .iter()
            .zip(row.columns.iter())
            .map(|(name, value)| (name.clone(), value.clone()))
            .collect();
    }
    Vec::new()
}

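/// Worked cases (xids illustrative; assume `snapshot.xid = 10`,
/// `own_xids = {12}`, and xid 5 committed before the snapshot):
///
/// ```ignore
/// // xmin=12                 → own uncommitted write: visible to self.
/// // xmin=5,  xmax=12        → row this tx deleted: hidden from self.
/// // xmin=5,  xmax=7 aborted → xmax treated as unset: visible again.
/// // xmin=9 aborted          → hidden from everyone.
/// ```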
#[inline]
fn visibility_check(ctx: &SnapshotContext, xmin: u64, xmax: u64) -> bool {
    // Writer aborted → tuple never existed from any future reader's view.
    // Checked *before* the own-xids fast path so an aborted own-sub-xid
    // (rolled-back savepoint) stays hidden from the parent.
    if xmin != 0 && ctx.manager.is_aborted(xmin) {
        return false;
    }
    // Deleter aborted → treat xmax as unset; fall back to xmin-only check.
    let effective_xmax = if xmax != 0 && ctx.manager.is_aborted(xmax) {
        0
    } else {
        xmax
    };
    // Phase 2.3.2e: own-tx writes are always visible to the connection
    // that stamped them, even when xmin/xmax exceed `snapshot.xid` (as
    // happens for sub-xids allocated by SAVEPOINT after BEGIN).
    let own_xmin = xmin != 0 && ctx.own_xids.contains(&xmin);
    let own_xmax = effective_xmax != 0 && ctx.own_xids.contains(&effective_xmax);
    if own_xmax {
        // This connection deleted the row via this xid — hide it from self.
        return false;
    }
    if own_xmin {
        return true;
    }
    ctx.snapshot.sees(xmin, effective_xmax)
}

fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
    runtime
        .inner
        .pool
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner())
}

fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
        return;
    }
    scopes.insert(name.to_string());
}

fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
    match query.source.as_ref() {
        Some(crate::storage::query::ast::TableSource::Name(name)) => {
            cache_scope_insert(scopes, name)
        }
        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
            collect_query_expr_result_cache_scopes(scopes, subquery);
        }
        None => cache_scope_insert(scopes, &query.table),
    }
}

fn collect_vector_source_scopes(
    scopes: &mut HashSet<String>,
    source: &crate::storage::query::ast::VectorSource,
) {
    match source {
        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
            cache_scope_insert(scopes, collection);
        }
        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
            collect_query_expr_result_cache_scopes(scopes, subquery);
        }
        crate::storage::query::ast::VectorSource::Literal(_)
        | crate::storage::query::ast::VectorSource::Text(_) => {}
    }
}

fn collect_path_selector_scopes(
    scopes: &mut HashSet<String>,
    selector: &crate::storage::query::ast::NodeSelector,
) {
    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
        cache_scope_insert(scopes, table);
    }
}

fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
    match expr {
        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
        QueryExpr::Join(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.left);
            collect_query_expr_result_cache_scopes(scopes, &query.right);
        }
        QueryExpr::Path(query) => {
            collect_path_selector_scopes(scopes, &query.from);
            collect_path_selector_scopes(scopes, &query.to);
        }
        QueryExpr::Vector(query) => {
            cache_scope_insert(scopes, &query.collection);
            collect_vector_source_scopes(scopes, &query.query_vector);
        }
        QueryExpr::Hybrid(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.structured);
            cache_scope_insert(scopes, &query.vector.collection);
            collect_vector_source_scopes(scopes, &query.vector.query_vector);
        }
        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
        QueryExpr::QueueCommand(query) => match query {
            QueueCommand::Push { queue, .. }
            | QueueCommand::Pop { queue, .. }
            | QueueCommand::Peek { queue, .. }
            | QueueCommand::Len { queue }
            | QueueCommand::Purge { queue }
            | QueueCommand::GroupCreate { queue, .. }
            | QueueCommand::GroupRead { queue, .. }
            | QueueCommand::Pending { queue, .. }
            | QueueCommand::Claim { queue, .. }
            | QueueCommand::Ack { queue, .. }
            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
            QueueCommand::Move {
                source,
                destination,
                ..
            } => {
                cache_scope_insert(scopes, source);
                cache_scope_insert(scopes, destination);
            }
        },
        QueryExpr::EventsBackfill(query) => {
            cache_scope_insert(scopes, &query.collection);
            cache_scope_insert(scopes, &query.target_queue);
        }
        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::TreeCommand(query) => match query {
            TreeCommand::Insert { collection, .. }
            | TreeCommand::Move { collection, .. }
            | TreeCommand::Delete { collection, .. }
            | TreeCommand::Validate { collection, .. }
            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
        },
        QueryExpr::SearchCommand(query) => match query {
            SearchCommand::Similar { collection, .. }
            | SearchCommand::Hybrid { collection, .. }
            | SearchCommand::SpatialRadius { collection, .. }
            | SearchCommand::SpatialBbox { collection, .. }
            | SearchCommand::SpatialNearest { collection, .. } => {
                cache_scope_insert(scopes, collection);
            }
            SearchCommand::Text { collection, .. }
            | SearchCommand::Multimodal { collection, .. }
            | SearchCommand::Index { collection, .. }
            | SearchCommand::Context { collection, .. } => {
                if let Some(collection) = collection.as_deref() {
                    cache_scope_insert(scopes, collection);
                }
            }
        },
        QueryExpr::Ask(query) => {
            if let Some(collection) = query.collection.as_deref() {
                cache_scope_insert(scopes, collection);
            }
        }
        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
        QueryExpr::MaintenanceCommand(cmd) => match cmd {
            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
                if let Some(t) = target {
                    cache_scope_insert(scopes, t);
                }
            }
        },
        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::CreateView(cmd) => {
            cache_scope_insert(scopes, &cmd.name);
            // Invalidating the view should also invalidate its dependencies.
            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
        }
        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::Graph(_)
        | QueryExpr::GraphCommand(_)
        | QueryExpr::ProbabilisticCommand(_)
        | QueryExpr::SetConfig { .. }
        | QueryExpr::ShowConfig { .. }
        | QueryExpr::SetSecret { .. }
        | QueryExpr::DeleteSecret { .. }
        | QueryExpr::ShowSecrets { .. }
        | QueryExpr::SetTenant(_)
        | QueryExpr::ShowTenant
        | QueryExpr::TransactionControl(_)
        | QueryExpr::CreateSchema(_)
        | QueryExpr::DropSchema(_)
        | QueryExpr::CreateSequence(_)
        | QueryExpr::DropSequence(_)
        | QueryExpr::Grant(_)
        | QueryExpr::Revoke(_)
        | QueryExpr::AlterUser(_)
        | QueryExpr::CreateIamPolicy { .. }
        | QueryExpr::DropIamPolicy { .. }
        | QueryExpr::AttachPolicy { .. }
        | QueryExpr::DetachPolicy { .. }
        | QueryExpr::ShowPolicies { .. }
        | QueryExpr::ShowEffectivePermissions { .. }
        | QueryExpr::SimulatePolicy { .. }
        | QueryExpr::CreateMigration(_)
        | QueryExpr::ApplyMigration(_)
        | QueryExpr::RollbackMigration(_)
        | QueryExpr::ExplainMigration(_)
        | QueryExpr::EventsBackfillStatus { .. } => {}
        QueryExpr::KvCommand(cmd) => {
            use crate::storage::query::ast::KvCommand;
            match cmd {
                KvCommand::Put { collection, .. }
                | KvCommand::InvalidateTags { collection, .. }
                | KvCommand::Get { collection, .. }
                | KvCommand::Unseal { collection, .. }
                | KvCommand::Rotate { collection, .. }
                | KvCommand::History { collection, .. }
                | KvCommand::List { collection, .. }
                | KvCommand::Purge { collection, .. }
                | KvCommand::Watch { collection, .. }
                | KvCommand::Delete { collection, .. }
                | KvCommand::Incr { collection, .. }
                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
            }
        }
        QueryExpr::ConfigCommand(cmd) => {
            use crate::storage::query::ast::ConfigCommand;
            match cmd {
                ConfigCommand::Put { collection, .. }
                | ConfigCommand::Get { collection, .. }
                | ConfigCommand::Resolve { collection, .. }
                | ConfigCommand::Rotate { collection, .. }
                | ConfigCommand::Delete { collection, .. }
                | ConfigCommand::History { collection, .. }
                | ConfigCommand::List { collection, .. }
                | ConfigCommand::Watch { collection, .. }
                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
                    cache_scope_insert(scopes, collection)
                }
            }
        }
    }
}

/// Combine matching RLS policies for a table + action into a single
/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
///
/// Returns `None` when RLS is disabled or no policy admits the caller's
/// role — callers use that to short-circuit the mutation (for DELETE /
/// UPDATE we simply skip the operation, which PG expresses as "no rows
/// match the policy + predicate combination").
pub(crate) fn rls_policy_filter(
    runtime: &RedDBRuntime,
    table: &str,
    action: crate::storage::query::ast::PolicyAction,
) -> Option<crate::storage::query::ast::Filter> {
    rls_policy_filter_for_kind(
        runtime,
        table,
        action,
        crate::storage::query::ast::PolicyTargetKind::Table,
    )
}

/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
/// Graph / vector / queue / timeseries scans pass the concrete kind;
/// policies targeting other kinds are ignored. Legacy Table-scoped
/// policies still apply cross-kind — callers register auto-tenancy
/// policies as Table today.
pub(crate) fn rls_policy_filter_for_kind(
    runtime: &RedDBRuntime,
    table: &str,
    action: crate::storage::query::ast::PolicyAction,
    kind: crate::storage::query::ast::PolicyTargetKind,
) -> Option<crate::storage::query::ast::Filter> {
    use crate::storage::query::ast::Filter;

    if !runtime.inner.rls_enabled_tables.read().contains(table) {
        return None;
    }
    let role = current_auth_identity().map(|(_, role)| role);
    let role_str = role.map(|r| r.as_str().to_string());
    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
    if policies.is_empty() {
        return None;
    }
    policies
        .into_iter()
        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
}

/// Returns true when the table has RLS enforcement enabled. Convenience
/// shortcut so DML paths can gate the AND-combine work without reaching
/// into `runtime.inner.rls_enabled_tables` directly.
pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
    runtime.inner.rls_enabled_tables.read().contains(table)
}

/// Per-entity gate used by the graph materialiser for `GraphNode`
/// entities. RLS is checked against the source collection with
/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
/// (for back-compat with auto-tenancy declarations). Cached per
/// collection so big graphs only resolve the policy chain once.
fn node_passes_rls(
    runtime: &RedDBRuntime,
    collection: &str,
    role: Option<&str>,
    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
    entity: &crate::storage::unified::entity::UnifiedEntity,
) -> bool {
    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};

    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
        return true;
    }
    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
        let policies = runtime.matching_rls_policies_for_kind(
            collection,
            role,
            PolicyAction::Select,
            PolicyTargetKind::Nodes,
        );
        if policies.is_empty() {
            None
        } else {
            policies
                .into_iter()
                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
        }
    });
    let Some(filter) = filter else {
        return false;
    };
    crate::runtime::query_exec::evaluate_entity_filter_with_db(
        Some(&runtime.inner.db),
        entity,
        filter,
        collection,
        collection,
    )
}

/// Edge counterpart of `node_passes_rls`. Same caching strategy with
/// `kind = Edges`.
fn edge_passes_rls(
    runtime: &RedDBRuntime,
    collection: &str,
    role: Option<&str>,
    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
    entity: &crate::storage::unified::entity::UnifiedEntity,
) -> bool {
    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};

    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
        return true;
    }
    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
        let policies = runtime.matching_rls_policies_for_kind(
            collection,
            role,
            PolicyAction::Select,
            PolicyTargetKind::Edges,
        );
        if policies.is_empty() {
            None
        } else {
            policies
                .into_iter()
                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
        }
    });
    let Some(filter) = filter else {
        return false;
    };
    crate::runtime::query_exec::evaluate_entity_filter_with_db(
        Some(&runtime.inner.db),
        entity,
        filter,
        collection,
        collection,
    )
}

/// RLS policy injection (Phase 2.5.2 PG parity).
///
/// Fetch every matching policy for the current thread-local role and
/// fold them into the query's filter. Semantics mirror PostgreSQL:
///
/// * Multiple policies on the same table combine with **OR** — a row is
///   visible if *any* policy admits it.
/// * The combined policy predicate is **AND**-ed into the caller's
///   existing `WHERE` clause so explicit predicates continue to trim
///   the policy-allowed set.
/// * No matching policies + RLS enabled = zero rows (PG's
///   default-deny). Callers get `None` and return an empty
///   `UnifiedResult` without ever dispatching the scan.
///
/// This runs only when `RuntimeInner::rls_enabled_tables` already
/// contains the table name — callers gate the hot path upfront to
/// avoid the lock acquisition on tables without RLS.
///
/// Returns `None` when no policy admits the current role; returns
/// `Some(mutated_table)` with policy filters folded in otherwise.
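///
/// For two policies `tenant_id = CURRENT_TENANT()` and
/// `owner = CURRENT_USER()` over `… WHERE amount > 100`, the effective
/// filter becomes (sketch, predicate syntax illustrative):
///
/// ```ignore
/// (amount > 100) AND (tenant_id = CURRENT_TENANT() OR owner = CURRENT_USER())
/// ```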
1214fn inject_rls_filters(
1215    runtime: &RedDBRuntime,
1216    frame: &dyn super::statement_frame::ReadFrame,
1217    mut table: crate::storage::query::ast::TableQuery,
1218) -> Option<crate::storage::query::ast::TableQuery> {
1219    use crate::storage::query::ast::{Filter, PolicyAction};
1220
1221    // `None` role falls through to policies with no `TO role` clause.
1222    let role = frame.identity().map(|(_, role)| role);
1223    let role_str = role.map(|r| r.as_str().to_string());
1224    let policies =
1225        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1226
1227    if policies.is_empty() {
1228        // RLS enabled + no policy match = deny everything. Signal the
1229        // caller to short-circuit with an empty result set.
1230        return None;
1231    }
1232
1233    // Combine policy predicates with OR (PG's permissive default).
1234    let combined = policies
1235        .into_iter()
1236        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1237        .expect("policies non-empty");
1238
1239    // AND into the caller's existing filter.
1240    table.filter = Some(match table.filter.take() {
1241        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1242        None => combined,
1243    });
1244    Some(table)
1245}
1246
1247/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1248/// predicate into the join's outer filter. Walking the merged record
1249/// at the join layer (rather than mutating the per-side scan filter)
1250/// keeps the planner's strategy choice and per-side index selection
1251/// undisturbed — the policy predicate uses the qualified `t.col` form
1252/// that resolves cleanly against the merged record's keys.
1253///
1254/// Returns `None` when any leaf has RLS enabled and no policy admits
1255/// the caller — the join short-circuits to an empty result.
1256fn inject_rls_into_join(
1257    runtime: &RedDBRuntime,
1258    frame: &dyn super::statement_frame::ReadFrame,
1259    mut join: crate::storage::query::ast::JoinQuery,
1260) -> Option<crate::storage::query::ast::JoinQuery> {
1261    use crate::storage::query::ast::Filter;
1262
1263    let mut policy_filters: Vec<Filter> = Vec::new();
1264    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1265        return None;
1266    }
1267    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1268        return None;
1269    }
1270
1271    if policy_filters.is_empty() {
1272        return Some(join);
1273    }
1274
1275    let combined = policy_filters
1276        .into_iter()
1277        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1278        .expect("policy_filters non-empty");
1279
1280    join.filter = Some(match join.filter.take() {
1281        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1282        None => combined,
1283    });
1284
1285    Some(join)
1286}
1287
1288/// For each `Table` leaf reachable through nested joins, append the
1289/// RLS-policy filter (combined with OR across that side's matching
1290/// policies) into `out`. Returns `false` when a side has RLS enabled
1291/// but no policy admits the caller — the join must short-circuit.
1292fn collect_join_side_policy(
1293    runtime: &RedDBRuntime,
1294    frame: &dyn super::statement_frame::ReadFrame,
1295    expr: &crate::storage::query::ast::QueryExpr,
1296    out: &mut Vec<crate::storage::query::ast::Filter>,
1297) -> bool {
1298    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1299    match expr {
1300        QueryExpr::Table(t) => {
1301            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1302                return true;
1303            }
1304            let role = frame.identity().map(|(_, role)| role);
1305            let role_str = role.map(|r| r.as_str().to_string());
1306            let policies =
1307                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1308            if policies.is_empty() {
1309                return false;
1310            }
1311            let combined = policies
1312                .into_iter()
1313                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1314                .expect("policies non-empty");
1315            out.push(combined);
1316            true
1317        }
1318        QueryExpr::Join(inner) => {
1319            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1320                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1321        }
1322        _ => true,
1323    }
1324}
1325
1326/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
1327///
1328/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
1329/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
1330/// materialises all rows. Projections are best-effort — when the query
1331/// lists explicit columns we keep only those; a `SELECT *` keeps every
1332/// wrapper-emitted field verbatim.
1333///
1334/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
1335/// the runtime will pass the compiled filter down instead of post-filtering.
1336fn apply_foreign_table_filters(
1337    records: Vec<crate::storage::query::unified::UnifiedRecord>,
1338    query: &crate::storage::query::ast::TableQuery,
1339) -> crate::storage::query::unified::UnifiedResult {
1340    use crate::storage::query::sql_lowering::{
1341        effective_table_filter, effective_table_projections,
1342    };
1343    use crate::storage::query::unified::UnifiedResult;
1344
1345    let filter = effective_table_filter(query);
1346    let projections = effective_table_projections(query);
1347
1348    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
1349    // match native-collection queries (same operators, same NULL handling).
1350    let mut filtered: Vec<_> = records
1351        .into_iter()
1352        .filter(|record| match &filter {
1353            Some(f) => {
1354                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
1355            }
1356            None => true,
1357        })
1358        .collect();
1359
1360    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
1361    if let Some(offset) = query.offset {
1362        let offset = offset as usize;
1363        if offset >= filtered.len() {
1364            filtered.clear();
1365        } else {
1366            filtered.drain(0..offset);
1367        }
1368    }
1369    if let Some(limit) = query.limit {
1370        filtered.truncate(limit as usize);
1371    }
1372
1373    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
1374    // the wrapper's column set; an explicit list trims to those names.
1375    let columns: Vec<String> = if projections.is_empty() {
1376        filtered
1377            .first()
1378            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
1379            .unwrap_or_default()
1380    } else {
1381        projections
1382            .iter()
1383            .map(super::join_filter::projection_name)
1384            .collect()
1385    };
1386
1387    let mut result = UnifiedResult::empty();
1388    result.columns = columns;
1389    result.records = filtered;
1390    result
1391}
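
// Illustrative sketch: Step 2's drain/truncate arithmetic, mirrored on
// a plain Vec<i32> (standing in for the materialised rows) so the
// OFFSET-before-LIMIT ordering is visible in isolation.
#[cfg(test)]
mod foreign_table_offset_limit_example {
    #[test]
    fn offset_applies_before_limit() {
        let mut filtered = vec![1, 2, 3, 4, 5];
        let offset = 1usize; // OFFSET 1
        if offset >= filtered.len() {
            filtered.clear();
        } else {
            filtered.drain(0..offset);
        }
        filtered.truncate(2); // LIMIT 2
        assert_eq!(filtered, vec![2, 3]);
    }
}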
1392
1393/// Collect every concrete table reference inside a `QueryExpr`.
1394///
1395/// Used by view bookkeeping (dependency tracking for materialised
1396/// invalidation) and any other rewriter that needs to know the base
1397/// tables a query pulls from. Does not descend into projections/filters;
1398/// only the `FROM` side.
1399pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1400    let mut scopes: HashSet<String> = HashSet::new();
1401    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1402    scopes.into_iter().collect()
1403}
1404
1405fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1406    let mut scopes = HashSet::new();
1407    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1408    scopes
1409}
1410
1411const RESULT_CACHE_BACKEND_KEY: &str = "runtime.result_cache.backend";
1412const RESULT_CACHE_DEFAULT_BACKEND: &str = "legacy";
1413const RESULT_CACHE_BLOB_NAMESPACE: &str = "runtime.result_cache";
1414const RESULT_CACHE_TTL_SECS: u64 = 30;
1415const RESULT_CACHE_MAX_ENTRIES: usize = 1000;
1416const RESULT_CACHE_PAYLOAD_MAGIC: &[u8; 8] = b"RDRC0001";
1417
1418#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1419enum RuntimeResultCacheBackend {
1420    Legacy,
1421    BlobCache,
1422    Shadow,
1423}
1424
1425fn trim_result_cache(
1426    map: &mut HashMap<String, RuntimeResultCacheEntry>,
1427    order: &mut std::collections::VecDeque<String>,
1428) {
1429    while map.len() > RESULT_CACHE_MAX_ENTRIES {
1430        if let Some(oldest) = order.pop_front() {
1431            map.remove(&oldest);
1432        } else {
1433            break;
1434        }
1435    }
1436}
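
// Illustrative sketch of the FIFO eviction above, mirrored with u8
// values (a `RuntimeResultCacheEntry` isn't constructible here) and a
// capacity of 2 standing in for `RESULT_CACHE_MAX_ENTRIES`.
#[cfg(test)]
mod result_cache_trim_example {
    use std::collections::{HashMap, VecDeque};

    #[test]
    fn oldest_insert_evicted_first() {
        let mut map: HashMap<String, u8> = HashMap::new();
        let mut order: VecDeque<String> = VecDeque::new();
        for key in ["a", "b", "c"] {
            map.insert(key.to_string(), 0);
            order.push_back(key.to_string());
            while map.len() > 2 {
                if let Some(oldest) = order.pop_front() {
                    map.remove(&oldest);
                } else {
                    break;
                }
            }
        }
        assert!(!map.contains_key("a")); // first in, first evicted
        assert!(map.contains_key("b") && map.contains_key("c"));
    }
}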
1437
1438fn result_cache_fingerprint(result: &RuntimeQueryResult) -> String {
1439    format!(
1440        "{:?}|{}|{}|{}|{}|{:?}",
1441        result.result,
1442        result.query,
1443        result.statement,
1444        result.engine,
1445        result.affected_rows,
1446        result.statement_type
1447    )
1448}
1449
1450fn mode_to_byte(mode: crate::storage::query::modes::QueryMode) -> u8 {
1451    match mode {
1452        crate::storage::query::modes::QueryMode::Sql => 0,
1453        crate::storage::query::modes::QueryMode::Gremlin => 1,
1454        crate::storage::query::modes::QueryMode::Cypher => 2,
1455        crate::storage::query::modes::QueryMode::Sparql => 3,
1456        crate::storage::query::modes::QueryMode::Path => 4,
1457        crate::storage::query::modes::QueryMode::Natural => 5,
1458        crate::storage::query::modes::QueryMode::Unknown => 255,
1459    }
1460}
1461
1462fn mode_from_byte(byte: u8) -> Option<crate::storage::query::modes::QueryMode> {
1463    match byte {
1464        0 => Some(crate::storage::query::modes::QueryMode::Sql),
1465        1 => Some(crate::storage::query::modes::QueryMode::Gremlin),
1466        2 => Some(crate::storage::query::modes::QueryMode::Cypher),
1467        3 => Some(crate::storage::query::modes::QueryMode::Sparql),
1468        4 => Some(crate::storage::query::modes::QueryMode::Path),
1469        5 => Some(crate::storage::query::modes::QueryMode::Natural),
1470        255 => Some(crate::storage::query::modes::QueryMode::Unknown),
1471        _ => None,
1472    }
1473}
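
// Illustrative sketch: the two mappings above are inverses on every
// byte `mode_to_byte` can emit; anything else decodes to `None`.
#[cfg(test)]
mod mode_byte_roundtrip_example {
    use super::*;

    #[test]
    fn roundtrip_every_emitted_byte() {
        for byte in [0u8, 1, 2, 3, 4, 5, 255] {
            assert_eq!(mode_from_byte(byte).map(mode_to_byte), Some(byte));
        }
        assert_eq!(mode_from_byte(42).map(mode_to_byte), None);
    }
}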
1474
1475fn result_cache_static_str(value: &str) -> Option<&'static str> {
1476    match value {
1477        "select" => Some("select"),
1478        "materialized-graph" => Some("materialized-graph"),
1479        "runtime-red-schema" => Some("runtime-red-schema"),
1480        "runtime-fdw" => Some("runtime-fdw"),
1481        "runtime-table-rls" => Some("runtime-table-rls"),
1482        "runtime-table" => Some("runtime-table"),
1483        "runtime-join-rls" => Some("runtime-join-rls"),
1484        "runtime-join" => Some("runtime-join"),
1485        "runtime-vector" => Some("runtime-vector"),
1486        "runtime-hybrid" => Some("runtime-hybrid"),
1487        "runtime-secret" => Some("runtime-secret"),
1488        "runtime-config" => Some("runtime-config"),
1489        "runtime-tenant" => Some("runtime-tenant"),
1490        "runtime-explain" => Some("runtime-explain"),
1491        "runtime-tree" => Some("runtime-tree"),
1492        "runtime-kv" => Some("runtime-kv"),
1493        "runtime-queue" => Some("runtime-queue"),
1494        _ => None,
1495    }
1496}
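
// Note: this interning table is what lets the decode path hand back
// `&'static str` fields; a string outside the table yields `None`, so
// encode refuses to cache it and decode rejects the payload.
#[cfg(test)]
mod static_str_interning_example {
    use super::*;

    #[test]
    fn known_and_unknown_tokens() {
        assert_eq!(result_cache_static_str("select"), Some("select"));
        assert_eq!(result_cache_static_str("runtime-kv"), Some("runtime-kv"));
        assert_eq!(result_cache_static_str("not-a-token"), None);
    }
}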
1497
1498fn write_u32(out: &mut Vec<u8>, value: usize) -> Option<()> {
1499    let value = u32::try_from(value).ok()?;
1500    out.extend_from_slice(&value.to_le_bytes());
1501    Some(())
1502}
1503
1504fn write_string(out: &mut Vec<u8>, value: &str) -> Option<()> {
1505    write_u32(out, value.len())?;
1506    out.extend_from_slice(value.as_bytes());
1507    Some(())
1508}
1509
1510fn write_bytes(out: &mut Vec<u8>, value: &[u8]) -> Option<()> {
1511    write_u32(out, value.len())?;
1512    out.extend_from_slice(value);
1513    Some(())
1514}
1515
1516fn read_u8(input: &mut &[u8]) -> Option<u8> {
1517    let (&value, rest) = input.split_first()?;
1518    *input = rest;
1519    Some(value)
1520}
1521
1522fn read_u32(input: &mut &[u8]) -> Option<usize> {
1523    if input.len() < 4 {
1524        return None;
1525    }
1526    let value = u32::from_le_bytes(input[..4].try_into().ok()?) as usize;
1527    *input = &input[4..];
1528    Some(value)
1529}
1530
1531fn read_u64(input: &mut &[u8]) -> Option<u64> {
1532    if input.len() < 8 {
1533        return None;
1534    }
1535    let value = u64::from_le_bytes(input[..8].try_into().ok()?);
1536    *input = &input[8..];
1537    Some(value)
1538}
1539
1540fn read_string(input: &mut &[u8]) -> Option<String> {
1541    let len = read_u32(input)?;
1542    if input.len() < len {
1543        return None;
1544    }
1545    let value = String::from_utf8(input[..len].to_vec()).ok()?;
1546    *input = &input[len..];
1547    Some(value)
1548}
1549
1550fn read_bytes<'a>(input: &mut &'a [u8]) -> Option<&'a [u8]> {
1551    let len = read_u32(input)?;
1552    if input.len() < len {
1553        return None;
1554    }
1555    let value = &input[..len];
1556    *input = &input[len..];
1557    Some(value)
1558}
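
// Illustrative round-trip for the little-endian, u32-length-prefixed
// helpers above; exercises only private functions from this module.
#[cfg(test)]
mod length_prefixed_codec_example {
    use super::*;

    #[test]
    fn string_and_bytes_roundtrip() {
        let mut buf = Vec::new();
        write_string(&mut buf, "hello").expect("len fits in u32");
        write_bytes(&mut buf, &[1, 2, 3]).expect("len fits in u32");
        let mut cursor: &[u8] = &buf;
        assert_eq!(read_string(&mut cursor).as_deref(), Some("hello"));
        assert_eq!(read_bytes(&mut cursor), Some(&[1u8, 2, 3][..]));
        assert!(cursor.is_empty()); // every byte consumed
    }
}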
1559
1560fn encode_result_cache_payload(entry: &RuntimeResultCacheEntry) -> Option<Vec<u8>> {
1561    let result = &entry.result;
1562    if result.result.pre_serialized_json.is_some()
1563        || result_cache_static_str(result.statement).is_none()
1564        || result_cache_static_str(result.engine).is_none()
1565        || result_cache_static_str(result.statement_type).is_none()
1566        || result.result.records.iter().any(|record| {
1567            !record.nodes.is_empty()
1568                || !record.edges.is_empty()
1569                || !record.paths.is_empty()
1570                || !record.vector_results.is_empty()
1571        })
1572    {
1573        return None;
1574    }
1575
1576    let mut out = Vec::new();
1577    out.extend_from_slice(RESULT_CACHE_PAYLOAD_MAGIC);
1578    write_string(&mut out, &result.query)?;
1579    out.push(mode_to_byte(result.mode));
1580    write_string(&mut out, result.statement)?;
1581    write_string(&mut out, result.engine)?;
1582    out.extend_from_slice(&result.affected_rows.to_le_bytes());
1583    write_string(&mut out, result.statement_type)?;
1584
1585    write_u32(&mut out, result.result.columns.len())?;
1586    for column in &result.result.columns {
1587        write_string(&mut out, column)?;
1588    }
1589    out.extend_from_slice(&result.result.stats.nodes_scanned.to_le_bytes());
1590    out.extend_from_slice(&result.result.stats.edges_scanned.to_le_bytes());
1591    out.extend_from_slice(&result.result.stats.rows_scanned.to_le_bytes());
1592    out.extend_from_slice(&result.result.stats.exec_time_us.to_le_bytes());
1593
1594    write_u32(&mut out, result.result.records.len())?;
1595    for record in &result.result.records {
1596        let fields = record.iter_fields().collect::<Vec<_>>();
1597        write_u32(&mut out, fields.len())?;
1598        for (name, value) in fields {
1599            write_string(&mut out, name)?;
1600            let mut encoded = Vec::new();
1601            crate::storage::schema::value_codec::encode(value, &mut encoded);
1602            write_bytes(&mut out, &encoded)?;
1603        }
1604    }
1605
1606    write_u32(&mut out, entry.scopes.len())?;
1607    for scope in &entry.scopes {
1608        write_string(&mut out, scope)?;
1609    }
1610    Some(out)
1611}
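
// Payload layout, as written by the encoder above and consumed by the
// decoder below (all integers little-endian):
//
//   magic "RDRC0001"
//   query (u32-prefixed string), mode (1 byte)
//   statement, engine (u32-prefixed strings; must intern)
//   affected_rows (u64), statement_type (u32-prefixed string; must intern)
//   columns: u32 count, then u32-prefixed strings
//   stats: 4 x u64 (nodes_scanned, edges_scanned, rows_scanned, exec_time_us)
//   records: u32 count, each { u32 field count, then (name, value_codec bytes) }
//   scopes: u32 count, then u32-prefixed strings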
1612
1613fn decode_result_cache_payload(mut input: &[u8]) -> Option<(RuntimeQueryResult, HashSet<String>)> {
1614    if input.len() < RESULT_CACHE_PAYLOAD_MAGIC.len()
1615        || &input[..RESULT_CACHE_PAYLOAD_MAGIC.len()] != RESULT_CACHE_PAYLOAD_MAGIC
1616    {
1617        return None;
1618    }
1619    input = &input[RESULT_CACHE_PAYLOAD_MAGIC.len()..];
1620
1621    let query = read_string(&mut input)?;
1622    let mode = mode_from_byte(read_u8(&mut input)?)?;
1623    let statement = result_cache_static_str(&read_string(&mut input)?)?;
1624    let engine = result_cache_static_str(&read_string(&mut input)?)?;
1625    let affected_rows = read_u64(&mut input)?;
1626    let statement_type = result_cache_static_str(&read_string(&mut input)?)?;
1627
1628    let mut columns = Vec::new();
1629    for _ in 0..read_u32(&mut input)? {
1630        columns.push(read_string(&mut input)?);
1631    }
1632    let stats = crate::storage::query::unified::QueryStats {
1633        nodes_scanned: read_u64(&mut input)?,
1634        edges_scanned: read_u64(&mut input)?,
1635        rows_scanned: read_u64(&mut input)?,
1636        exec_time_us: read_u64(&mut input)?,
1637    };
1638
1639    let mut records = Vec::new();
1640    for _ in 0..read_u32(&mut input)? {
1641        let mut record = crate::storage::query::unified::UnifiedRecord::new();
1642        for _ in 0..read_u32(&mut input)? {
1643            let name = read_string(&mut input)?;
1644            let bytes = read_bytes(&mut input)?;
1645            let (value, used) = crate::storage::schema::value_codec::decode(bytes).ok()?;
1646            if used != bytes.len() {
1647                return None;
1648            }
1649            record.set_owned(name, value);
1650        }
1651        records.push(record);
1652    }
1653
1654    let mut scopes = HashSet::new();
1655    for _ in 0..read_u32(&mut input)? {
1656        scopes.insert(read_string(&mut input)?);
1657    }
1658    if !input.is_empty() {
1659        return None;
1660    }
1661
1662    Some((
1663        RuntimeQueryResult {
1664            query,
1665            mode,
1666            statement,
1667            engine,
1668            result: crate::storage::query::unified::UnifiedResult {
1669                columns,
1670                records,
1671                stats,
1672                pre_serialized_json: None,
1673            },
1674            affected_rows,
1675            statement_type,
1676        },
1677        scopes,
1678    ))
1679}
1680
/// If `sql` starts with `EXPLAIN` followed by a token other than
/// `ALTER` / `ASK`, return the trimmed inner statement; otherwise
/// `None`.
1690///
1691/// `EXPLAIN ALTER FOR CREATE TABLE ...` is a separate schema-diff
1692/// command handled inside the normal SQL parser, so we leave it
1693/// alone here.
1694fn strip_explain_prefix(sql: &str) -> Option<&str> {
1695    let trimmed = sql.trim_start();
1696    let (head, rest) = trimmed.split_at(
1697        trimmed
1698            .find(|c: char| c.is_whitespace())
1699            .unwrap_or(trimmed.len()),
1700    );
1701    if !head.eq_ignore_ascii_case("EXPLAIN") {
1702        return None;
1703    }
1704    let rest = rest.trim_start();
1705    if rest.is_empty() {
1706        return None;
1707    }
1708    // Peek the next token — if ALTER or ASK, defer to the normal parser.
1709    // `EXPLAIN ASK` is an executable read path: it runs retrieval and
1710    // provider selection, then short-circuits before the LLM call.
1711    let next_head_end = rest.find(|c: char| c.is_whitespace()).unwrap_or(rest.len());
1712    if rest[..next_head_end].eq_ignore_ascii_case("ALTER")
1713        || rest[..next_head_end].eq_ignore_ascii_case("ASK")
1714    {
1715        return None;
1716    }
1717    Some(rest)
1718}
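
// Illustrative sketch of the shapes described above.
#[cfg(test)]
mod strip_explain_prefix_example {
    use super::*;

    #[test]
    fn explain_shapes() {
        assert_eq!(strip_explain_prefix("EXPLAIN SELECT 1"), Some("SELECT 1"));
        // ALTER / ASK defer to the normal parser.
        assert_eq!(strip_explain_prefix("EXPLAIN ALTER FOR CREATE TABLE t"), None);
        assert_eq!(strip_explain_prefix("EXPLAIN ASK 'why?'"), None);
        // Not an EXPLAIN at all.
        assert_eq!(strip_explain_prefix("SELECT 1"), None);
    }
}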
1719
1720/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1721/// CTE-aware parse in `execute_query` without paying for a full
1722/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1723/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1724pub(super) fn has_with_prefix(sql: &str) -> bool {
1725    let trimmed = sql.trim_start();
1726    let head_end = trimmed
1727        .find(|c: char| c.is_whitespace() || c == '(')
1728        .unwrap_or(trimmed.len());
1729    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1730}
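
// Illustrative sketch of the WITH / WITHIN distinction above.
#[cfg(test)]
mod has_with_prefix_example {
    use super::*;

    #[test]
    fn with_vs_within() {
        assert!(has_with_prefix("WITH cte AS (SELECT 1) SELECT * FROM cte"));
        assert!(has_with_prefix("with(")); // '(' also terminates the head token
        assert!(!has_with_prefix("WITHIN TENANT 'acme' SELECT 1"));
    }
}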
1731
1732/// If the query is a plain SELECT whose top-level `TableQuery`
1733/// carries an `AS OF` clause, return a typed spec that the runtime
1734/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1735/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1736/// back to the connection's regular MVCC snapshot. A cheap textual
1737/// prefilter skips the parse entirely when the source doesn't
/// mention `AS OF` in any casing, keeping the autocommit hot path free.
1739fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1740    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1741}
1742
1743/// Same as `peek_top_level_as_of` but also returns the table name
1744/// targeted by the AS OF clause (when the FROM clause names a
1745/// concrete table). `None` for the table slot means scalar SELECT
1746/// or a subquery source — callers treat those as "no enforcement".
1747pub(super) fn peek_top_level_as_of_with_table(
1748    sql: &str,
1749) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
1750    if !sql
1751        .as_bytes()
1752        .windows(5)
1753        .any(|w| w.eq_ignore_ascii_case(b"as of"))
1754    {
1755        return None;
1756    }
1757    let parsed = crate::storage::query::parser::parse(sql).ok()?;
1758    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
1759        return None;
1760    };
1761    let clause = table.as_of?;
1762    let table_name = if table.table.is_empty() || table.table == "any" {
1763        None
1764    } else {
1765        Some(table.table.clone())
1766    };
1767    let spec = match clause {
1768        crate::storage::query::ast::AsOfClause::Commit(h) => {
1769            crate::application::vcs::AsOfSpec::Commit(h)
1770        }
1771        crate::storage::query::ast::AsOfClause::Branch(b) => {
1772            crate::application::vcs::AsOfSpec::Branch(b)
1773        }
1774        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
1775        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
1776            crate::application::vcs::AsOfSpec::TimestampMs(ts)
1777        }
1778        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
1779            crate::application::vcs::AsOfSpec::Snapshot(x)
1780        }
1781    };
1782    Some((spec, table_name))
1783}
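
// Shape sketch (clause syntax is whatever the parser accepts; the
// mapping is what this function returns):
//
//   SELECT ... FROM orders AS OF <branch>  -> (AsOfSpec::Branch(..), Some("orders"))
//   SELECT ... FROM any    AS OF <commit>  -> (AsOfSpec::Commit(..), None)
//
// Joins, DML, and parse failures map to None (regular MVCC snapshot).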
1784
/// Heuristic: does the raw SQL reference a built-in whose output
/// varies by connection, clock, or randomness? Such queries must
/// skip the 30s result cache — see the call site for rationale.
///
/// ASCII case-insensitive substring match. False positives (the
/// token appears in a quoted string) only skip caching, which is
/// the conservative direction.
pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
    // One owned lowercase copy of the statement, then plain
    // `contains()` checks against the token list below.
1789    const VOLATILE_TOKENS: &[&str] = &[
1790        "pg_advisory_lock",
1791        "pg_try_advisory_lock",
1792        "pg_advisory_unlock",
1793        "random()",
1794        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
1795        // omitted for now — they ARE volatile but today's tests rely
1796        // on caching them. Revisit once a tighter volatility story
1797        // lands.
1798    ];
1799    let lowered = sql.to_ascii_lowercase();
1800    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
1801}
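
// Illustrative sketch: matching is case-insensitive and purely textual.
#[cfg(test)]
mod volatile_builtin_example {
    use super::*;

    #[test]
    fn token_matching() {
        assert!(query_has_volatile_builtin("SELECT RANDOM() AS r"));
        assert!(query_has_volatile_builtin("select pg_advisory_lock(1)"));
        // NOW() is deliberately cacheable today (see the token list).
        assert!(!query_has_volatile_builtin("SELECT NOW()"));
    }
}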
1802
1803pub(super) fn query_is_ask_statement(sql: &str) -> bool {
1804    let trimmed = sql.trim_start();
1805    let head_end = trimmed
1806        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
1807        .unwrap_or(trimmed.len());
1808    trimmed[..head_end].eq_ignore_ascii_case("ASK")
1809}
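
// Illustrative sketch: only the head token counts, and longer keywords
// that merely start with ASK don't match.
#[cfg(test)]
mod ask_statement_example {
    use super::*;

    #[test]
    fn head_token_only() {
        assert!(query_is_ask_statement("ASK 'which orders shipped late?'"));
        assert!(query_is_ask_statement("  ask('q')")); // '(' ends the head
        assert!(!query_is_ask_statement("ASKS something"));
    }
}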
1810
1811/// Pick the `(global_mode, collection_mode)` pair for an expression,
1812/// or `None` for variants that opt out of intent-locking entirely
1813/// (admin statements like `SHOW CONFIG`, transaction control, tenant
1814/// toggles).
1815///
1816/// Phase-1 contract:
1817/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
1818/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
1819/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
1820pub(super) fn intent_lock_modes_for(
1821    expr: &QueryExpr,
1822) -> Option<(
1823    crate::storage::transaction::lock::LockMode,
1824    crate::storage::transaction::lock::LockMode,
1825)> {
1826    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
1827
1828    match expr {
1829        // Reads — IS / IS.
1830        QueryExpr::Table(_)
1831        | QueryExpr::Join(_)
1832        | QueryExpr::Vector(_)
1833        | QueryExpr::Hybrid(_)
1834        | QueryExpr::Graph(_)
1835        | QueryExpr::Path(_)
1836        | QueryExpr::Ask(_)
1837        | QueryExpr::SearchCommand(_)
1838        | QueryExpr::GraphCommand(_)
1839        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
1840
        // Writes — IX / IX. Non-tabular mutations (vector insert,
        // graph node insert, queue push, timeseries point insert)
        // don't carry their own dispatch arm here; they ride through
        // the Insert variant, a command variant covered by the
        // read-side arm above, or the catch-all `QueueCommand` arm
        // below (IS). P1.T4 expands only the TableQuery-ish writes;
        // non-tabular kinds inherit when their DML variants land in
        // later phases.
1848        QueryExpr::Insert(_)
1849        | QueryExpr::Update(_)
1850        | QueryExpr::Delete(_)
1851        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
1852            Some((IntentExclusive, IntentExclusive))
1853        }
1854        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
1855
1856        // DDL — IX / X. A DDL against collection `c` blocks all
1857        // other writers + readers on `c` but leaves other collections
1858        // running (because Global stays IX, not X).
1859        QueryExpr::CreateTable(_)
1860        | QueryExpr::CreateCollection(_)
1861        | QueryExpr::CreateVector(_)
1862        | QueryExpr::DropTable(_)
1863        | QueryExpr::DropGraph(_)
1864        | QueryExpr::DropVector(_)
1865        | QueryExpr::DropDocument(_)
1866        | QueryExpr::DropKv(_)
1867        | QueryExpr::DropCollection(_)
1868        | QueryExpr::Truncate(_)
1869        | QueryExpr::AlterTable(_)
1870        | QueryExpr::CreateIndex(_)
1871        | QueryExpr::DropIndex(_)
1872        | QueryExpr::CreateTimeSeries(_)
1873        | QueryExpr::DropTimeSeries(_)
1874        | QueryExpr::CreateQueue(_)
1875        | QueryExpr::AlterQueue(_)
1876        | QueryExpr::DropQueue(_)
1877        | QueryExpr::CreateTree(_)
1878        | QueryExpr::DropTree(_)
1879        | QueryExpr::CreatePolicy(_)
1880        | QueryExpr::DropPolicy(_)
1881        | QueryExpr::CreateView(_)
1882        | QueryExpr::DropView(_)
1883        | QueryExpr::RefreshMaterializedView(_)
1884        | QueryExpr::CreateSchema(_)
1885        | QueryExpr::DropSchema(_)
1886        | QueryExpr::CreateSequence(_)
1887        | QueryExpr::DropSequence(_)
1888        | QueryExpr::CreateServer(_)
1889        | QueryExpr::DropServer(_)
1890        | QueryExpr::CreateForeignTable(_)
1891        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
1892
1893        // Admin / control — skip intent locks. `SET TENANT`,
1894        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
        // `VACUUM`, etc. don't touch collection data the way DML
        // does, and the existing transaction layer already serialises
        // the pieces that matter.
1898        _ => None,
1899    }
1900}
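
// Reading the arms above back as concrete statements:
//
//   SELECT * FROM t      -> Some((IS, IS))
//   INSERT INTO t ...    -> Some((IX, IX))
//   DROP TABLE t         -> Some((IX, X))   (X only on t's collection)
//   BEGIN / SHOW CONFIG  -> None            (no intent locks taken)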
1901
1902/// Best-effort collection inventory for an expression. Used to pick
1903/// `Collection(...)` resources for the intent-lock guard. Overshoots
1904/// are fine (take an extra IS, benign); undershoots leak writes past
1905/// DDL X locks, so err on the side of listing more names.
1906pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
1907    let mut out = Vec::new();
1908    walk_collections(expr, &mut out);
1909    out.sort();
1910    out.dedup();
1911    out
1912}
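
// Sketch: `orders JOIN users` yields ["orders", "users"] (sorted,
// deduplicated); a queue `Move` lists both source and destination so
// the guard can lock each side.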
1913
1914fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
1915    match expr {
1916        QueryExpr::Table(t) => out.push(t.table.clone()),
1917        QueryExpr::Join(j) => {
1918            walk_collections(&j.left, out);
1919            walk_collections(&j.right, out);
1920        }
1921        QueryExpr::Insert(i) => out.push(i.table.clone()),
1922        QueryExpr::Update(u) => out.push(u.table.clone()),
1923        QueryExpr::Delete(d) => out.push(d.table.clone()),
1924        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
1925
1926        // DDL — include the target collection so DDL takes
1927        // `(Collection, X)` and blocks concurrent readers / writers
1928        // on the same collection. Other collections stay live
1929        // because Global is still IX.
1930        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
1931        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
1932        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
1933        QueryExpr::DropTable(q) => out.push(q.name.clone()),
1934        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
1935        QueryExpr::DropVector(q) => out.push(q.name.clone()),
1936        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
1937        QueryExpr::DropKv(q) => out.push(q.name.clone()),
1938        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
1939        QueryExpr::Truncate(q) => out.push(q.name.clone()),
1940        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
1941        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
1942        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
1943        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
1944        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
1945        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
1946        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
1947        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
1948        QueryExpr::QueueCommand(QueueCommand::Move {
1949            source,
1950            destination,
1951            ..
1952        }) => {
1953            out.push(source.clone());
1954            out.push(destination.clone());
1955        }
1956        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
1957        QueryExpr::CreateView(q) => out.push(q.name.clone()),
1958        QueryExpr::DropView(q) => out.push(q.name.clone()),
1959        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
1960
1961        // Vector / Hybrid / Graph / Path / commands reference
1962        // collections through fields whose shape varies; without a
1963        // uniform accessor we fall back to the global lock only —
1964        // benign because every runtime path still holds the global
1965        // mode.
1966        _ => {}
1967    }
1968}
1969
1970impl RedDBRuntime {
1971    pub fn in_memory() -> RedDBResult<Self> {
1972        Self::with_options(RedDBOptions::in_memory())
1973    }
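
    // Smallest embedded boot (sketch):
    //
    //     let rt = RedDBRuntime::in_memory()?;
    //
    // which is `with_options(RedDBOptions::in_memory())` and, through
    // it, `with_pool(.., ConnectionPoolConfig::default())` below.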
1974
1975    /// Handle to the intent-lock manager for tests + introspection.
1976    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
1977    /// rather than touching the manager directly.
1978    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
1979        self.inner.lock_manager.clone()
1980    }
1981
1982    #[inline(never)]
1983    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
1984        Self::with_pool(options, ConnectionPoolConfig::default())
1985    }
1986
1987    pub fn with_pool(
1988        options: RedDBOptions,
1989        pool_config: ConnectionPoolConfig,
1990    ) -> RedDBResult<Self> {
1991        // PLAN.md Phase 9.1 — capture wall-clock before storage
1992        // open so the cold-start phase markers can be backfilled
1993        // once Lifecycle is constructed below. Storage open
1994        // encapsulates auto-restore + WAL replay; we treat the
1995        // whole window as one combined "restore" + "wal_replay"
1996        // phase split at the same boundary because the storage
1997        // layer doesn't yet emit a finer signal.
1998        let boot_open_start_ms = std::time::SystemTime::now()
1999            .duration_since(std::time::UNIX_EPOCH)
2000            .map(|d| d.as_millis() as u64)
2001            .unwrap_or(0);
2002        let db = Arc::new(
2003            RedDB::open_with_options(&options)
2004                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2005        );
2006        let result_blob_cache = crate::storage::cache::BlobCache::open_with_l2(
2007            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2008                options
2009                    .resolved_path("data.rdb")
2010                    .with_extension("result-cache.l2"),
2011            ),
2012        )
2013        .map_err(|err| {
2014            RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}"))
2015        })?;
2016        let storage_ready_ms = std::time::SystemTime::now()
2017            .duration_since(std::time::UNIX_EPOCH)
2018            .map(|d| d.as_millis() as u64)
2019            .unwrap_or(0);
2020
2021        let runtime = Self {
2022            inner: Arc::new(RuntimeInner {
2023                db,
2024                layout: PhysicalLayout::from_options(&options),
2025                indices: IndexCatalog::register_default_vector_graph(
2026                    options.has_capability(crate::api::Capability::Table),
2027                    options.has_capability(crate::api::Capability::Graph),
2028                ),
2029                pool_config,
2030                pool: Mutex::new(PoolState::default()),
2031                started_at_unix_ms: SystemTime::now()
2032                    .duration_since(UNIX_EPOCH)
2033                    .unwrap_or_default()
2034                    .as_millis(),
2035                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2036                index_store: super::index_store::IndexStore::new(),
2037                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2038                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2039                query_cache: parking_lot::RwLock::new(
2040                    crate::storage::query::planner::cache::PlanCache::new(1000),
2041                ),
2042                result_cache: parking_lot::RwLock::new((
2043                    HashMap::new(),
2044                    std::collections::VecDeque::new(),
2045                )),
2046                result_blob_cache,
2047                result_blob_entries: parking_lot::RwLock::new((
2048                    HashMap::new(),
2049                    std::collections::VecDeque::new(),
2050                )),
2051                ask_answer_cache_entries: parking_lot::RwLock::new((
2052                    HashSet::new(),
2053                    std::collections::VecDeque::new(),
2054                )),
2055                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2056                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2057                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2058                rmw_locks: RmwLockTable::new(),
2059                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2060                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2061                ec_worker: crate::ec::worker::EcWorker::new(),
2062                auth_store: parking_lot::RwLock::new(None),
2063                oauth_validator: parking_lot::RwLock::new(None),
2064                views: parking_lot::RwLock::new(HashMap::new()),
2065                materialized_views: parking_lot::RwLock::new(
2066                    crate::storage::cache::result::MaterializedViewCache::new(),
2067                ),
2068                snapshot_manager: Arc::new(
2069                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2070                ),
2071                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2072                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2073                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2074                lock_manager: Arc::new({
2075                    // Sourced from the matrix: Tier B key
2076                    // `concurrency.locking.deadlock_timeout_ms`
2077                    // (default 5000). Env var wins at boot so
2078                    // operators can tune without touching red_config.
2079                    let env = crate::runtime::config_overlay::collect_env_overrides();
2080                    let timeout_ms = env
2081                        .get("concurrency.locking.deadlock_timeout_ms")
2082                        .and_then(|raw| raw.parse::<u64>().ok())
2083                        .unwrap_or_else(|| {
2084                            match crate::runtime::config_matrix::default_for(
2085                                "concurrency.locking.deadlock_timeout_ms",
2086                            ) {
                                Some(crate::serde_json::Value::Number(n)) => n.as_u64().unwrap_or(5000),
2088                                _ => 5000,
2089                            }
2090                        });
2091                    let cfg = crate::storage::transaction::lock::LockConfig {
2092                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2093                        ..Default::default()
2094                    };
2095                    crate::storage::transaction::lock::LockManager::new(cfg)
2096                }),
2097                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2098                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2099                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2100                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2101                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2102                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2103                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2104                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2105                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2106                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2107                    &options,
2108                )),
2109                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2110                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2111                audit_log: {
2112                    // Default audit-log path for the in-memory case
2113                    // sits in the system temp dir; persistent runs
2114                    // place it next to data.rdb.
2115                    let data_path = options
2116                        .data_path
2117                        .clone()
2118                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2119                    Arc::new(crate::runtime::audit_log::AuditLogger::for_data_path(
2120                        &data_path,
2121                    ))
2122                },
2123                lease_lifecycle: std::sync::OnceLock::new(),
2124                replica_apply_metrics: crate::replication::logical::ReplicaApplyMetrics::default(),
2125                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2126                schema_vocabulary: parking_lot::RwLock::new(
2127                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2128                ),
2129                slow_query_logger: {
2130                    // Issue #205 — slow-query sink lives in the same
2131                    // directory the audit log uses, so backup/restore
2132                    // ships them together. Threshold + sample-pct
2133                    // default conservatively (1 s, 100% sampling) so
2134                    // emitted lines are rare and complete. Operators
2135                    // tune via env / config matrix in a follow-up.
2136                    //
2137                    // `data_path` points at the primary `.rdb` *file*
2138                    // (mirrors AuditLogger::for_data_path), so we
2139                    // anchor the slow log at its parent directory.
2140                    let log_dir = options
2141                        .data_path
2142                        .as_ref()
2143                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2144                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2145                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2146                        .ok()
2147                        .and_then(|s| s.parse::<u64>().ok())
2148                        .unwrap_or(1000);
2149                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2150                        .ok()
2151                        .and_then(|s| s.parse::<u8>().ok())
2152                        .unwrap_or(100);
2153                    crate::telemetry::slow_query_logger::SlowQueryLogger::new(
2154                        crate::telemetry::slow_query_logger::SlowQueryOpts {
2155                            log_dir,
2156                            threshold_ms,
2157                            sample_pct,
2158                        },
2159                    )
2160                },
2161                kv_stats: crate::runtime::KvStatsCounters::default(),
2162                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
2163                metrics_tenant_activity_stats:
2164                    crate::runtime::MetricsTenantActivityCounters::default(),
2165                kv_tag_index: crate::runtime::KvTagIndex::default(),
2166            }),
2167        };
2168
2169        // Issue #205 — install the process-wide OperatorEvent sink so
2170        // emit sites buried in storage / replication / signal handlers
2171        // can record without threading an `&AuditLogger` through every
2172        // call stack. First registration wins; subsequent in-memory
2173        // runtimes (test harnesses) fall through to tracing+eprintln.
2174        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2175            &runtime.inner.audit_log,
2176        ));
2177
2178        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2179        // from the wall-clock captured before storage open. The
2180        // entire `RedDB::open_with_options` call covers both
2181        // auto-restore (when configured) and WAL replay. We
2182        // record both phases against the same boundary today;
2183        // a follow-up will split them once the storage layer
2184        // surfaces a finer-grained event.
2185        runtime
2186            .inner
2187            .lifecycle
2188            .set_restore_started_at_ms(boot_open_start_ms);
2189        runtime
2190            .inner
2191            .lifecycle
2192            .set_restore_ready_at_ms(storage_ready_ms);
2193        runtime
2194            .inner
2195            .lifecycle
2196            .set_wal_replay_started_at_ms(boot_open_start_ms);
2197        runtime
2198            .inner
2199            .lifecycle
2200            .set_wal_replay_ready_at_ms(storage_ready_ms);
2201
2202        let restored_cdc_lsn = runtime
2203            .inner
2204            .db
2205            .replication
2206            .as_ref()
2207            .map(|repl| {
2208                repl.logical_wal_spool
2209                    .as_ref()
2210                    .map(|spool| spool.current_lsn())
2211                    .unwrap_or(0)
2212            })
2213            .unwrap_or(0)
2214            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2215        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2216        runtime.rehydrate_snapshot_xid_floor();
2217        runtime.bootstrap_system_keyed_collections()?;
2218        runtime.rehydrate_declared_column_schemas();
2219        runtime.load_probabilistic_state()?;
2220
2221        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2222        // tables declared via `TENANT BY (col)` survive restart. Each
2223        // entry re-registers the auto-policy and flips RLS on again.
2224        runtime.rehydrate_tenant_tables();
2225        if let Some(repl) = &runtime.inner.db.replication {
2226            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2227        }
2228
2229        // Save system info to red_config on boot
2230        {
2231            let sys = SystemInfo::collect();
2232            runtime.inner.db.store().set_config_tree(
2233                "red.system",
2234                &crate::serde_json::json!({
2235                    "pid": sys.pid,
2236                    "cpu_cores": sys.cpu_cores,
2237                    "total_memory_bytes": sys.total_memory_bytes,
2238                    "available_memory_bytes": sys.available_memory_bytes,
2239                    "os": sys.os,
2240                    "arch": sys.arch,
2241                    "hostname": sys.hostname,
2242                    "started_at": SystemTime::now()
2243                        .duration_since(UNIX_EPOCH)
2244                        .unwrap_or_default()
2245                        .as_millis() as u64
2246                }),
2247            );
2248
2249            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2250            let store = runtime.inner.db.store();
2251            if store
2252                .get_collection("red_config")
2253                .map(|m| m.query_all(|_| true).len())
2254                .unwrap_or(0)
2255                <= 10
2256            {
                store.set_config_tree(
                    "red.ai",
                    &crate::json!({
                        "default": crate::json!({
                            "provider": "openai",
                            "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
                        }),
                        "max_embedding_inputs": 256,
                        "max_prompt_batch": 256,
                        "timeout": crate::json!({
                            "connect_secs": 10,
                            "read_secs": 90,
                            "write_secs": 30
                        })
                    }),
                );
2266                store.set_config_tree(
2267                    "red.server",
2268                    &crate::json!({
2269                        "max_scan_limit": 1000,
2270                        "max_body_size": 1048576,
2271                        "read_timeout_ms": 5000,
2272                        "write_timeout_ms": 5000
2273                    }),
2274                );
2275                store.set_config_tree(
2276                    "red.storage",
2277                    &crate::json!({
2278                        "page_size": 4096,
2279                        "page_cache_capacity": 100000,
2280                        "auto_checkpoint_pages": 1000,
2281                        "snapshot_retention": 16,
2282                        "verify_checksums": true,
2283                        "segment": crate::json!({
2284                            "max_entities": 100000,
2285                            "max_bytes": 268435456_u64,
2286                            "compression_level": 6
2287                        }),
2288                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2289                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2290                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2291                    }),
2292                );
2293                store.set_config_tree(
2294                    "red.search",
2295                    &crate::json!({
2296                        "rag": crate::json!({
2297                            "max_chunks_per_source": 10,
2298                            "max_total_chunks": 25,
2299                            "similarity_threshold": 0.8,
2300                            "graph_depth": 2,
2301                            "min_relevance": 0.3
2302                        }),
2303                        "fusion": crate::json!({
2304                            "vector_weight": 0.5,
2305                            "graph_weight": 0.3,
2306                            "table_weight": 0.2,
2307                            "dedup_threshold": 0.85
2308                        })
2309                    }),
2310                );
2311                store.set_config_tree(
2312                    "red.auth",
2313                    &crate::json!({
2314                        "enabled": false,
2315                        "session_ttl_secs": 3600,
2316                        "require_auth": false
2317                    }),
2318                );
2319                store.set_config_tree(
2320                    "red.query",
2321                    &crate::json!({
2322                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2323                        "max_recursion_depth": 1000
2324                    }),
2325                );
2326                store.set_config_tree(
2327                    "red.indexes",
2328                    &crate::json!({
2329                        "auto_select": true,
2330                        "bloom_filter": crate::json!({
2331                            "enabled": true,
2332                            "false_positive_rate": 0.01,
2333                            "prune_on_scan": true
2334                        }),
2335                        "hash": crate::json!({ "enabled": true }),
2336                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2337                        "spatial": crate::json!({ "enabled": true })
2338                    }),
2339                );
2340                store.set_config_tree(
2341                    "red.memtable",
2342                    &crate::json!({
2343                        "enabled": true,
2344                        "max_bytes": 67108864_u64,
2345                        "flush_threshold": 0.75
2346                    }),
2347                );
2348                store.set_config_tree(
2349                    "red.probabilistic",
2350                    &crate::json!({
2351                        "hll_registers": 16384,
2352                        "sketch_default_width": 1000,
2353                        "sketch_default_depth": 5,
2354                        "filter_default_capacity": 100000
2355                    }),
2356                );
2357                store.set_config_tree(
2358                    "red.timeseries",
2359                    &crate::json!({
2360                        "default_chunk_size": 1024,
2361                        "compression": crate::json!({
2362                            "timestamps": "delta_of_delta",
2363                            "values": "gorilla_xor"
2364                        }),
2365                        "default_retention_days": 0
2366                    }),
2367                );
2368                store.set_config_tree(
2369                    "red.queue",
2370                    &crate::json!({
2371                        "default_max_size": 0,
2372                        "default_max_attempts": 3,
2373                        "visibility_timeout_ms": 30000,
2374                        "consumer_idle_timeout_ms": 60000
2375                    }),
2376                );
2377                store.set_config_tree(
2378                    "red.backup",
2379                    &crate::json!({
2380                        "enabled": false,
2381                        "interval_secs": 3600,
2382                        "retention_count": 24,
2383                        "upload": false,
2384                        "backend": "local"
2385                    }),
2386                );
2387                store.set_config_tree(
2388                    "red.wal",
2389                    &crate::json!({
2390                        "archive": crate::json!({
2391                            "enabled": false,
2392                            "retention_hours": 168,
2393                            "prefix": "wal/"
2394                        })
2395                    }),
2396                );
2397                store.set_config_tree(
2398                    "red.cdc",
2399                    &crate::json!({
2400                        "enabled": true,
2401                        "buffer_size": 100000
2402                    }),
2403                );
2404                store.set_config_tree(
2405                    "red.config.secret",
2406                    &crate::json!({
2407                        "auto_encrypt": true,
2408                        "auto_decrypt": true
2409                    }),
2410                );
2411            }
2412
2413            // Perf-parity config matrix: heal the Tier A (critical)
2414            // keys unconditionally on every boot. Idempotent — only
2415            // writes the default when the key is missing. Keeps
2416            // `SHOW CONFIG` showing every guarantee the operator has
2417            // (durability.mode, concurrency.locking.enabled, …) even
2418            // on long-running datadirs that predate the matrix.
2419            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2420
2421            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2422            // `storage.btree.lehman_yao` value from the matrix (env
2423            // > file > red_config > default) and publish it to the
2424            // storage layer's atomic so the B-tree read / split
2425            // paths can branch without re-reading the config on
2426            // every hot-path call.
2427            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2428            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2429            if lehman_yao {
2430                tracing::info!(
2431                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2432                );
2433            }
2434
2435            // Config file overlay — mounted `/etc/reddb/config.json`
2436            // (override path via REDDB_CONFIG_FILE). Writes keys with
2437            // write-if-absent semantics so a later user `SET CONFIG`
2438            // always wins. Missing file = silent no-op.
2439            let overlay_path = crate::runtime::config_overlay::config_file_path();
2440            let _ =
2441                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2442        }
2443
2444        // VCS ("Git for Data") — create the `red_*` metadata
2445        // collections on first boot. Idempotent: `get_or_create_collection`
2446        // is a no-op if the collection already exists.
2447        {
2448            let store = runtime.inner.db.store();
2449            for name in crate::application::vcs_collections::ALL {
2450                let _ = store.get_or_create_collection(*name);
2451            }
2452            // Seed VCS config namespace with sensible defaults on first
2453            // boot, matching the pattern used by red.ai / red.storage.
2454            store.set_config_tree(
2455                crate::application::vcs_collections::CONFIG_NAMESPACE,
2456                &crate::json!({
2457                    "default_branch": "main",
2458                    "author": crate::json!({
2459                        "name": "reddb",
2460                        "email": "reddb@localhost"
2461                    }),
2462                    "protected_branches": crate::json!(["main"]),
2463                    "closure": crate::json!({
2464                        "enabled": true,
2465                        "lazy": true
2466                    }),
2467                    "merge": crate::json!({
2468                        "default_strategy": "auto",
2469                        "fast_forward": true
2470                    })
2471                }),
2472            );
2473        }
2474
2475        // Migrations — create the `red_migrations` / `red_migration_deps`
2476        // system collections on first boot. Idempotent.
2477        {
2478            let store = runtime.inner.db.store();
2479            for name in crate::application::migration_collections::ALL {
2480                let _ = store.get_or_create_collection(*name);
2481            }
2482        }
2483
2484        // Start background maintenance thread (context index refresh +
2485        // session purge). Held by a WEAK reference to `RuntimeInner`
2486        // so dropping the last `RedDBRuntime` handle actually releases
2487        // the underlying Arc<Pager> (and its file lock). Polling at
2488        // 200ms means shutdown latency is bounded; the real 60-second
2489        // work cadence is tracked independently via a `last_work`
2490        // timestamp.
2491        //
2492        // The previous version captured `rt = runtime.clone()` by
2493        // strong reference and ran an unterminated `loop`, which held
2494        // Arc<RuntimeInner> forever — reopening a persistent database
2495        // in the same process failed with "Database is locked" because
2496        // the pager could never drop. See the regression test
2497        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2498        {
2499            let weak = Arc::downgrade(&runtime.inner);
            std::thread::Builder::new()
                .name("reddb-maintenance".into())
                .spawn(move || {
                    let tick = std::time::Duration::from_millis(200);
                    let work_interval = std::time::Duration::from_secs(60);
                    let mut last_work = std::time::Instant::now();
                    loop {
                        std::thread::sleep(tick);
                        let Some(inner) = weak.upgrade() else {
                            // All strong references dropped — the
                            // runtime is gone, exit cleanly.
                            break;
                        };
                        if last_work.elapsed() >= work_interval {
                            let _stats = inner.db.store().context_index().stats();
                            last_work = std::time::Instant::now();
                        }
                    }
                })
                .ok();
        }

        // Start backup scheduler if enabled via red_config
        {
            let store = runtime.inner.db.store();
            let mut backup_enabled = false;
            let mut backup_interval = 3600u64;

            if let Some(manager) = store.get_collection("red_config") {
                manager.for_each_entity(|entity| {
                    if let Some(row) = entity.data.as_row() {
                        let key = row.get_field("key").and_then(|v| match v {
                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
                            _ => None,
                        });
                        let val = row.get_field("value");
                        if key == Some("red.config.backup.enabled") {
                            backup_enabled = match val {
                                Some(crate::storage::schema::Value::Boolean(true)) => true,
                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
                                _ => false,
                            };
                        } else if key == Some("red.config.backup.interval_secs") {
                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
                                backup_interval = *n as u64;
                            }
                        }
                    }
                    true
                });
            }

            if backup_enabled {
                runtime.inner.backup_scheduler.set_interval(backup_interval);
                let rt = runtime.clone();
                runtime
                    .inner
                    .backup_scheduler
                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
            }
        }

        // Load EC registry from red_config and start worker
        {
            runtime
                .inner
                .ec_registry
                .load_from_config_store(runtime.inner.db.store().as_ref());
            if !runtime.inner.ec_registry.async_configs().is_empty() {
                runtime.inner.ec_worker.start(
                    Arc::clone(&runtime.inner.ec_registry),
                    Arc::clone(&runtime.inner.db.store()),
                );
            }
        }

        if let crate::replication::ReplicationRole::Replica { primary_addr } =
            runtime.inner.db.options().replication.role.clone()
        {
            let rt = runtime.clone();
            std::thread::Builder::new()
                .name("reddb-replica".into())
                .spawn(move || rt.run_replica_loop(primary_addr))
                .ok();
        }

        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
        // boot stage above has completed (WAL replay, restore-from-
        // remote, replica-loop spawn). Health probes flip from 503 to
        // 200 here; shutdown begins from this state.
        runtime.inner.lifecycle.mark_ready();

        Ok(runtime)
    }

    fn rehydrate_snapshot_xid_floor(&self) {
        let store = self.inner.db.store();
        for collection in store.list_collections() {
            let Some(manager) = store.get_collection(&collection) else {
                continue;
            };
            for entity in manager.query_all(|_| true) {
                self.inner
                    .snapshot_manager
                    .observe_committed_xid(entity.xmin);
                self.inner
                    .snapshot_manager
                    .observe_committed_xid(entity.xmax);
            }
        }
    }

    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
        let mut changed = false;
        for (name, model) in [
            ("red.config", crate::catalog::CollectionModel::Config),
            ("red.vault", crate::catalog::CollectionModel::Vault),
        ] {
            if self.inner.db.store().get_collection(name).is_none() {
                self.inner.db.store().get_or_create_collection(name);
                changed = true;
            }
            if self.inner.db.collection_contract(name).is_none() {
                self.inner
                    .db
                    .save_collection_contract(system_keyed_collection_contract(name, model))
                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
                changed = true;
            }
        }
        if changed {
            self.inner
                .db
                .persist_metadata()
                .map_err(|err| RedDBError::Internal(err.to_string()))?;
        }
        Ok(())
    }

    pub fn db(&self) -> Arc<RedDB> {
        Arc::clone(&self.inner.db)
    }

    /// Direct access to the runtime's secondary-index store.
    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
    /// wire bulk) that need to push new rows through the per-index
    /// maintenance hook after `store.bulk_insert` returns.
    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
        &self.inner.index_store
    }

    /// Apply a DDL event to the schema-vocabulary reverse index
    /// (issue #120). Called by DDL execution paths after the catalog
    /// mutation has succeeded so the index never holds entries for
    /// half-applied DDL.
    pub(crate) fn schema_vocabulary_apply(
        &self,
        event: crate::runtime::schema_vocabulary::DdlEvent,
    ) {
        self.inner.schema_vocabulary.write().on_ddl(event);
    }

    /// Look up `token` in the schema-vocabulary reverse index. Returns
    /// an owned `Vec<VocabHit>` because the underlying read lock
    /// cannot be borrowed across the call boundary; the slice from
    /// `SchemaVocabulary::lookup` is cloned per hit.
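    ///
    /// # Example
    ///
    /// Illustrative sketch only (not compiled as a doctest); assumes an
    /// embedded runtime handle `rt`:
    ///
    /// ```ignore
    /// // Find every schema object whose vocabulary mentions "email".
    /// for hit in rt.schema_vocabulary_lookup("email") {
    ///     println!("{hit:?}");
    /// }
    /// ```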
    pub fn schema_vocabulary_lookup(
        &self,
        token: &str,
    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
        self.inner.schema_vocabulary.read().lookup(token).to_vec()
    }

    /// Inject an AuthStore into the runtime. Called by server boot
    /// after the vault has been bootstrapped, so that `Value::Secret`
    /// auto-encrypt/decrypt can reach the vault AES key.
    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
        *self.inner.auth_store.write() = Some(store);
    }

    /// Read a vault KV secret from the configured AuthStore, if present.
    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
        self.inner
            .auth_store
            .read()
            .as_ref()
            .and_then(|store| store.vault_kv_get(key))
    }

    /// Write a vault KV secret and fail if the encrypted vault write is
    /// unavailable or cannot be made durable.
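    ///
    /// # Example
    ///
    /// Minimal sketch (not compiled as a doctest); assumes the vault was
    /// bootstrapped via `set_auth_store` and is unsealed:
    ///
    /// ```ignore
    /// rt.vault_kv_try_set("api/token".to_string(), "s3cr3t".to_string())?;
    /// assert_eq!(rt.vault_kv_get("api/token").as_deref(), Some("s3cr3t"));
    /// ```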
    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
        })?;
        store
            .vault_kv_try_set(key, value)
            .map_err(|err| RedDBError::Query(err.to_string()))
    }

    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
    /// wire transports try OAuth JWT validation before falling back to
    /// the local AuthStore lookup. Pass `None` to disable.
    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
        *self.inner.oauth_validator.write() = validator;
    }

    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
    /// Hot path: called per HTTP request when an Authorization header
    /// is present, so we hand back a cheap Arc clone.
    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
        self.inner.oauth_validator.read().clone()
    }

    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
    /// store is wired and a key has been generated. Used by the
    /// `Value::Secret` encrypt/decrypt pipeline.
    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
        let guard = self.inner.auth_store.read();
        guard.as_ref().and_then(|s| s.vault_secret_key())
    }

    /// Resolve a boolean flag from `red_config`. Defaults to `default`
    /// when the key is missing or not coercible. If the same key has
    /// been written multiple times (SET CONFIG appends new rows), the
    /// most recent entity wins. Env-var overrides
    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
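    ///
    /// # Example
    ///
    /// Sketch of the precedence order (key and values illustrative; the
    /// exact env-var name mangling lives in `config_overlay`):
    ///
    /// ```ignore
    /// // 1. An env override for the key wins outright.
    /// // 2. Otherwise the newest `red_config` row for the key wins.
    /// // 3. Otherwise the caller's default is returned.
    /// let enabled = rt.config_bool("red.config.backup.enabled", false);
    /// ```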
    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
        if let Some(raw) = self.inner.env_config_overrides.get(key) {
            if let Some(crate::storage::schema::Value::Boolean(b)) =
                crate::runtime::config_overlay::coerce_env_value(key, raw)
            {
                return b;
            }
        }
        let store = self.inner.db.store();
        let Some(manager) = store.get_collection("red_config") else {
            return default;
        };
        let mut result = default;
        let mut latest_id: u64 = 0;
        manager.for_each_entity(|entity| {
            if let Some(row) = entity.data.as_row() {
                let entry_key = row.get_field("key").and_then(|v| match v {
                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
                    _ => None,
                });
                if entry_key == Some(key) {
                    let id = entity.id.raw();
                    if id >= latest_id {
                        latest_id = id;
                        result = match row.get_field("value") {
                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
                            Some(crate::storage::schema::Value::Text(s)) => {
                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
                            }
                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
                            _ => default,
                        };
                    }
                }
            }
            true
        });
        result
    }

    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
        if let Some(raw) = self.inner.env_config_overrides.get(key) {
            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
                crate::runtime::config_overlay::coerce_env_value(key, raw)
            {
                return n;
            }
        }
        let store = self.inner.db.store();
        let Some(manager) = store.get_collection("red_config") else {
            return default;
        };
        let mut result = default;
        let mut latest_id: u64 = 0;
        manager.for_each_entity(|entity| {
            if let Some(row) = entity.data.as_row() {
                let entry_key = row.get_field("key").and_then(|v| match v {
                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
                    _ => None,
                });
                if entry_key == Some(key) {
                    let id = entity.id.raw();
                    if id >= latest_id {
                        latest_id = id;
                        result = match row.get_field("value") {
                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
                            Some(crate::storage::schema::Value::Text(s)) => {
                                s.parse::<u64>().unwrap_or(default)
                            }
                            _ => default,
                        };
                    }
                }
            }
            true
        });
        result
    }

    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
        if let Some(raw) = self.inner.env_config_overrides.get(key) {
            if let Ok(n) = raw.parse::<f64>() {
                return n;
            }
        }
        let store = self.inner.db.store();
        let Some(manager) = store.get_collection("red_config") else {
            return default;
        };
        let mut result = default;
        let mut latest_id: u64 = 0;
        manager.for_each_entity(|entity| {
            if let Some(row) = entity.data.as_row() {
                let entry_key = row.get_field("key").and_then(|v| match v {
                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
                    _ => None,
                });
                if entry_key == Some(key) {
                    let id = entity.id.raw();
                    if id >= latest_id {
                        latest_id = id;
                        result = match row.get_field("value") {
                            Some(crate::storage::schema::Value::Float(n)) => *n,
                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
                            Some(crate::storage::schema::Value::Text(s)) => {
                                s.parse::<f64>().unwrap_or(default)
                            }
                            _ => default,
                        };
                    }
                }
            }
            true
        });
        result
    }

    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
        if let Some(raw) = self.inner.env_config_overrides.get(key) {
            return raw.clone();
        }
        let store = self.inner.db.store();
        let Some(manager) = store.get_collection("red_config") else {
            return default.to_string();
        };
        let mut result = default.to_string();
        let mut latest_id: u64 = 0;
        manager.for_each_entity(|entity| {
            if let Some(row) = entity.data.as_row() {
                let entry_key = row.get_field("key").and_then(|v| match v {
                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
                    _ => None,
                });
                if entry_key == Some(key) {
                    let id = entity.id.raw();
                    if id >= latest_id {
                        latest_id = id;
                        if let Some(crate::storage::schema::Value::Text(value)) =
                            row.get_field("value")
                        {
                            result = value.to_string();
                        }
                    }
                }
            }
            true
        });
        result
    }

    fn latest_metadata_for(
        &self,
        collection: &str,
        entity_id: u64,
    ) -> Option<crate::serde_json::Value> {
        self.inner
            .db
            .store()
            .get_metadata(collection, EntityId::new(entity_id))
            .map(|metadata| metadata_to_json(&metadata))
    }

    fn persist_replica_lsn(&self, lsn: u64) {
        self.inner.db.store().set_config_tree(
            "red.replication",
            &crate::json!({
                "last_applied_lsn": lsn
            }),
        );
    }

    fn persist_replication_health(
        &self,
        state: &str,
        last_error: &str,
        primary_lsn: Option<u64>,
        oldest_available_lsn: Option<u64>,
    ) {
        self.inner.db.store().set_config_tree(
            "red.replication",
            &crate::json!({
                "state": state,
                "last_error": last_error,
                "last_seen_primary_lsn": primary_lsn.unwrap_or(0),
                "last_seen_oldest_lsn": oldest_available_lsn.unwrap_or(0),
                "updated_at_unix_ms": SystemTime::now()
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or_default()
                    .as_millis() as u64
            }),
        );
    }

    /// Whether `SECRET('...')` literals should be encrypted with the
    /// vault AES key on INSERT. Default `true`.
    pub(crate) fn secret_auto_encrypt(&self) -> bool {
        self.config_bool("red.config.secret.auto_encrypt", true)
    }

    /// Whether `Value::Secret` columns should be decrypted back to
    /// plaintext on SELECT when the vault is unsealed. Default `true`.
    /// Turning this off keeps secrets masked as `***` even while the
    /// vault is open — useful for audit trails or read-only exports.
    pub(crate) fn secret_auto_decrypt(&self) -> bool {
        self.config_bool("red.config.secret.auto_decrypt", true)
    }

    /// Walk every record in `result` and swap `Value::Secret(bytes)`
    /// for the decrypted plaintext when the runtime has the vault
    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
    /// key is missing, the vault is sealed, or auto_decrypt is off,
    /// secrets are left as `Value::Secret` which every formatter
    /// (Display, JSON) already masks as `***`.
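    ///
    /// # Example
    ///
    /// Sketch of the post-query hook (not compiled as a doctest;
    /// result construction elided):
    ///
    /// ```ignore
    /// let mut result = /* RuntimeQueryResult from execute_query */;
    /// rt.apply_secret_decryption(&mut result);
    /// // Plaintext only when the vault key is present AND
    /// // red.config.secret.auto_decrypt is true; otherwise masked.
    /// ```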
    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
        if !self.secret_auto_decrypt() {
            return;
        }
        let Some(key) = self.secret_aes_key() else {
            return;
        };
        for record in result.result.records.iter_mut() {
            for value in record.values_mut() {
                if let Value::Secret(ref bytes) = value {
                    if let Some(plain) =
                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
                    {
                        if let Ok(text) = String::from_utf8(plain) {
                            *value = Value::text(text);
                        }
                    }
                }
            }
        }
    }

    /// Create a `MutationEngine` bound to this runtime.
    ///
    /// The engine is cheap to construct (no allocation) and should be
    /// dropped after `apply` returns. Use this from application-layer
    /// `create_row` / `create_rows_batch` instead of calling
    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
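    ///
    /// # Example
    ///
    /// Shape of a call site (illustrative only; the `apply` argument is
    /// elided, see `crate::runtime::mutation` for the real signature):
    ///
    /// ```ignore
    /// let engine = rt.mutation_engine();
    /// engine.apply(/* prepared mutation */)?;
    /// // engine drops here; construction was allocation-free.
    /// ```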
    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
        crate::runtime::mutation::MutationEngine::new(self)
    }

    /// Public-mutation gate snapshot (PLAN.md W1).
    ///
    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
    /// maintenance, serverless lifecycle) call `check_write` before
    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
    /// instance running as a replica or with `options.read_only =
    /// true`. The replica internal logical-WAL apply path reaches into
    /// the store directly and never calls this method, so legitimate
    /// replica catch-up still works.
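    ///
    /// # Example
    ///
    /// Sketch of a mutating surface's prologue (the `WriteKind`
    /// variant name is illustrative):
    ///
    /// ```ignore
    /// rt.check_write(crate::runtime::write_gate::WriteKind::Sql)?;
    /// // Only reached on a writable primary; dispatch the DML from here.
    /// ```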
    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
        self.inner.write_gate.check(kind)
    }

    /// Read-only handle to the gate, useful for transports that want
    /// to surface the policy in health/status output without taking on
    /// a dependency on the concrete enum.
    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
        &self.inner.write_gate
    }

    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
    /// admin/shutdown, and signal handlers consult this single
    /// state machine.
    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
        &self.inner.lifecycle
    }

    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
        &self.inner.resource_limits
    }

    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
        &self.inner.audit_log
    }

    /// Shared `Arc` to the audit logger — used by collaborators (the
    /// lease lifecycle, future request-context plumbing) that need to
    /// keep the logger alive past the runtime's stack frame.
    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
        Arc::clone(&self.inner.audit_log)
    }

    /// Shared `Arc` to the write gate. Same rationale as
    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
    /// thread) need a clone-cheap handle they can move into a
    /// background thread.
    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
        Arc::clone(&self.inner.write_gate)
    }

    /// Serverless writer-lease state machine. `None` when the operator
    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
        self.inner.lease_lifecycle.get()
    }

    /// Install the lease lifecycle. Idempotent; subsequent calls
    /// return the previously stored value untouched.
    pub fn set_lease_lifecycle(
        &self,
        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
        self.inner.lease_lifecycle.set(lifecycle)
    }

    /// Reject the call when the requested batch size exceeds
    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
    /// shaped so the HTTP layer can map it to 413 Payload Too
    /// Large (PLAN.md Phase 4.1).
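    ///
    /// # Example
    ///
    /// Sketch of a bulk-insert entry point (`rows` is illustrative):
    ///
    /// ```ignore
    /// rt.check_batch_size(rows.len())?; // Err(QuotaExceeded) → HTTP 413
    /// ```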
    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
        if self.inner.resource_limits.batch_size_exceeded(requested) {
            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
            return Err(RedDBError::QuotaExceeded(format!(
                "max_batch_size:{requested}:{max}"
            )));
        }
        Ok(())
    }

    /// Reject the call when the local DB file exceeds
    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
    /// the cost is a single `stat()` syscall, negligible against the
    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
    /// for HTTP 507 Insufficient Storage.
    pub fn check_db_size(&self) -> RedDBResult<()> {
        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
            return Ok(());
        };
        if limit == 0 {
            return Ok(());
        }
        let Some(path) = self.inner.db.path() else {
            return Ok(());
        };
        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
        if current > limit {
            return Err(RedDBError::QuotaExceeded(format!(
                "max_db_size_bytes:{current}:{limit}"
            )));
        }
        Ok(())
    }

    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
    ///
    /// Steps, in order, all idempotent across re-entrant calls:
    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
    ///      observe `Stopped` after the first finishes).
    ///   2. Flush WAL + run final checkpoint via `db.flush_local_only()`
    ///      so every acked write is durable on disk.
    ///   3. If `backup_on_shutdown == true` and a remote backend is
    ///      configured, run a synchronous `trigger_backup()` so the
    ///      remote head reflects the final state.
    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
    ///      return the cached report without re-running anything.
    ///
    /// On any error, the runtime is still marked `Stopped` so the
    /// process can exit; the caller logs the error context but does
    /// not retry the same shutdown — the operator can inspect the
    /// report fields to see which step failed.
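    ///
    /// # Example
    ///
    /// Sketch of a SIGTERM handler (signal plumbing elided):
    ///
    /// ```ignore
    /// let report = rt.graceful_shutdown(/* backup_on_shutdown */ true)?;
    /// tracing::info!(
    ///     flushed = report.flushed_wal,
    ///     ms = report.duration_ms,
    ///     "shutdown complete"
    /// );
    /// ```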
    pub fn graceful_shutdown(
        &self,
        backup_on_shutdown: bool,
    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
        if !self.inner.lifecycle.begin_shutdown() {
            // Someone else already shut down (or is in flight). Return
            // the cached report so the HTTP caller and SIGTERM handler
            // get the same idempotent answer.
            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
        }

        let started_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_millis() as u64)
            .unwrap_or(0);
        let mut report = crate::runtime::lifecycle::ShutdownReport {
            started_at_ms: started_ms,
            ..Default::default()
        };

        // Flush WAL + run any pending checkpoint. Local fsync is
        // unconditional — even a lease-lost replica needs its WAL on
        // disk before exit so a future restore has the latest tail.
        // The remote upload is gated separately so a lost-lease writer
        // doesn't clobber the new holder's state on its way out.
        let flush_res = self.inner.db.flush_local_only();
        report.flushed_wal = flush_res.is_ok();
        report.final_checkpoint = flush_res.is_ok();
        if let Err(err) = &flush_res {
            tracing::error!(
                target: "reddb::lifecycle",
                error = %err,
                "graceful_shutdown: local flush failed"
            );
        } else if let Err(lease_err) =
            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
        {
            tracing::warn!(
                target: "reddb::serverless::lease",
                error = %lease_err,
                "graceful_shutdown: remote upload skipped — lease not held"
            );
        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
            tracing::error!(
                target: "reddb::lifecycle",
                error = %err,
                "graceful_shutdown: remote upload failed"
            );
        }

        // Optional final backup. Skipped silently when no remote
        // backend is configured — `trigger_backup()` returns Err
        // anyway in that case, but logging it as a shutdown failure
        // would be misleading on a standalone (no-backend) runtime.
        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
            // The trigger_backup gate now reads `WriteKind::Backup`,
            // which a replica/read_only instance refuses. That's
            // intentional — replicas don't drive backups; only the
            // primary does. We still want shutdown to flush its WAL
            // even if the backup branch is gated off.
            match self.trigger_backup() {
                Ok(result) => {
                    report.backup_uploaded = result.uploaded;
                }
                Err(err) => {
                    tracing::warn!(
                        target: "reddb::lifecycle",
                        error = %err,
                        "graceful_shutdown: final backup skipped"
                    );
                }
            }
        }

        let completed_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_millis() as u64)
            .unwrap_or(started_ms);
        report.completed_at_ms = completed_ms;
        report.duration_ms = completed_ms.saturating_sub(started_ms);

        self.inner.lifecycle.finish_shutdown(report.clone());
        Ok(report)
    }

    /// Emit a CDC record without invalidating the result cache.
    ///
    /// Used by `MutationEngine::append_batch` which calls
    /// `invalidate_result_cache` once for the whole batch before this
    /// loop, avoiding N write-lock acquisitions.
    pub(crate) fn cdc_emit_no_cache_invalidate(
        &self,
        operation: crate::replication::cdc::ChangeOperation,
        collection: &str,
        entity_id: u64,
        entity_kind: &str,
    ) -> u64 {
        let lsn = self
            .inner
            .cdc
            .emit(operation, collection, entity_id, entity_kind);

        // Append to logical WAL replication buffer (if primary mode)
        if let Some(ref primary) = self.inner.db.replication {
            let store = self.inner.db.store();
            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
                None
            } else {
                store.get(collection, EntityId::new(entity_id))
            };
            let record = ChangeRecord {
                lsn,
                timestamp: SystemTime::now()
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or_default()
                    .as_millis() as u64,
                operation,
                collection: collection.to_string(),
                entity_id,
                entity_kind: entity_kind.to_string(),
                entity_bytes: entity
                    .as_ref()
                    .map(|e| UnifiedStore::serialize_entity(e, store.format_version())),
                metadata: self.latest_metadata_for(collection, entity_id),
            };
            let encoded = record.encode();
            primary.wal_buffer.append(record.lsn, encoded.clone());
            if let Some(spool) = &primary.logical_wal_spool {
                let _ = spool.append(record.lsn, &encoded);
            }
        }
        lsn
    }

    pub(crate) fn cdc_emit_insert_batch_no_cache_invalidate(
        &self,
        collection: &str,
        ids: &[EntityId],
        entity_kind: &str,
    ) -> Vec<u64> {
        if ids.is_empty() {
            return Vec::new();
        }

        // Without logical replication, CDC only needs the in-memory event
        // ring. Reserve all LSNs and push the batch under one mutex instead
        // of taking the ring lock once per inserted row.
        if self.inner.db.replication.is_none() {
            return self.inner.cdc.emit_batch_same_collection(
                crate::replication::cdc::ChangeOperation::Insert,
                collection,
                entity_kind,
                ids.iter().map(|id| id.raw()),
            );
        }

        // Replication needs one logical-WAL record per entity with the
        // serialized entity bytes, so keep the existing per-row path.
        ids.iter()
            .map(|id| {
                self.cdc_emit_no_cache_invalidate(
                    crate::replication::cdc::ChangeOperation::Insert,
                    collection,
                    id.raw(),
                    entity_kind,
                )
            })
            .collect()
    }

    /// Emit a CDC change event and replicate to the WAL buffer.
    pub fn cdc_emit(
        &self,
        operation: crate::replication::cdc::ChangeOperation,
        collection: &str,
        entity_id: u64,
        entity_kind: &str,
    ) -> u64 {
        let lsn = self
            .inner
            .cdc
            .emit(operation, collection, entity_id, entity_kind);
        // Perf: prior to this we called `invalidate_result_cache()`
        // which wipes EVERY cached query, across every table, under
        // a write lock — turning each INSERT into a serialisation
        // point for all readers. Swap to the per-table variant so
        // unrelated query caches survive.
        self.invalidate_result_cache_for_table(collection);

        // Append to logical WAL replication buffer (if primary mode)
        if let Some(ref primary) = self.inner.db.replication {
            let store = self.inner.db.store();
            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
                None
            } else {
                store.get(collection, EntityId::new(entity_id))
            };
            let record = ChangeRecord {
                lsn,
                timestamp: SystemTime::now()
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or_default()
                    .as_millis() as u64,
                operation,
                collection: collection.to_string(),
                entity_id,
                entity_kind: entity_kind.to_string(),
                entity_bytes: entity
                    .as_ref()
                    .map(|entity| UnifiedStore::serialize_entity(entity, store.format_version())),
                metadata: self.latest_metadata_for(collection, entity_id),
            };
            let encoded = record.encode();
            primary.wal_buffer.append(record.lsn, encoded.clone());
            if let Some(spool) = &primary.logical_wal_spool {
                let _ = spool.append(record.lsn, &encoded);
            }
        }
        lsn
    }

    pub(crate) fn cdc_emit_kv(
        &self,
        operation: crate::replication::cdc::ChangeOperation,
        collection: &str,
        key: &str,
        entity_id: u64,
        before: Option<crate::json::Value>,
        after: Option<crate::json::Value>,
    ) -> u64 {
        let lsn = self
            .inner
            .cdc
            .emit_kv(operation, collection, key, entity_id, before, after);
        self.inner.kv_stats.incr_watch_events_emitted();
        self.invalidate_result_cache_for_table(collection);
        lsn
    }

    pub(crate) fn record_kv_watch_event(
        &self,
        operation: crate::replication::cdc::ChangeOperation,
        collection: &str,
        key: &str,
        entity_id: u64,
        before: Option<crate::json::Value>,
        after: Option<crate::json::Value>,
    ) {
        if self.current_xid().is_some() {
            let conn_id = current_connection_id();
            let event = crate::replication::cdc::KvWatchEvent {
                collection: collection.to_string(),
                key: key.to_string(),
                op: operation,
                before,
                after,
                lsn: 0,
                committed_at: 0,
                dropped_event_count: 0,
            };
            self.inner
                .pending_kv_watch_events
                .write()
                .entry(conn_id)
                .or_default()
                .push(event);
            return;
        }

        self.cdc_emit_kv(operation, collection, key, entity_id, before, after);
    }

    pub(crate) fn cdc_emit_prebuilt(
        &self,
        operation: crate::replication::cdc::ChangeOperation,
        collection: &str,
        entity: &UnifiedEntity,
        entity_kind: &str,
        metadata: Option<&crate::storage::Metadata>,
        invalidate_cache: bool,
    ) -> u64 {
        self.cdc_emit_prebuilt_with_columns(
            operation,
            collection,
            entity,
            entity_kind,
            metadata,
            invalidate_cache,
            None,
        )
    }

    /// `cdc_emit_prebuilt` plus the list of column names whose values
    /// changed on this update. Callers that have already computed a
    /// `RowDamageVector` pass it here so downstream CDC consumers can
    /// filter events by touched column without re-diffing.
    /// `changed_columns` is only meaningful for `Update` operations —
    /// insert and delete events ignore it.
    pub(crate) fn cdc_emit_prebuilt_with_columns(
        &self,
        operation: crate::replication::cdc::ChangeOperation,
        collection: &str,
        entity: &UnifiedEntity,
        entity_kind: &str,
        metadata: Option<&crate::storage::Metadata>,
        invalidate_cache: bool,
        changed_columns: Option<Vec<String>>,
    ) -> u64 {
        if invalidate_cache {
            self.invalidate_result_cache();
        }

        let public_id = entity.logical_id().raw();
        let lsn = self.inner.cdc.emit_with_columns(
            operation,
            collection,
            public_id,
            entity_kind,
            changed_columns,
        );

        if let Some(ref primary) = self.inner.db.replication {
            let store = self.inner.db.store();
            let record = ChangeRecord {
                lsn,
                timestamp: SystemTime::now()
                    .duration_since(UNIX_EPOCH)
                    .unwrap_or_default()
                    .as_millis() as u64,
                operation,
                collection: collection.to_string(),
                entity_id: entity.id.raw(),
                entity_kind: entity_kind.to_string(),
                entity_bytes: Some(UnifiedStore::serialize_entity(
                    entity,
                    store.format_version(),
                )),
                metadata: metadata
                    .map(metadata_to_json)
                    .or_else(|| self.latest_metadata_for(collection, entity.id.raw())),
            };
            let encoded = record.encode();
            primary.wal_buffer.append(record.lsn, encoded.clone());
            if let Some(spool) = &primary.logical_wal_spool {
                let _ = spool.append(record.lsn, &encoded);
            }
        }

        lsn
    }

    pub(crate) fn cdc_emit_prebuilt_batch<'a, I>(
        &self,
        operation: crate::replication::cdc::ChangeOperation,
        entity_kind: &str,
        items: I,
        invalidate_cache: bool,
    ) where
        I: IntoIterator<
            Item = (
                &'a str,
                &'a UnifiedEntity,
                Option<&'a crate::storage::Metadata>,
            ),
        >,
    {
        let items: Vec<(&str, &UnifiedEntity, Option<&crate::storage::Metadata>)> =
            items.into_iter().collect();
        if items.is_empty() {
            return;
        }

        if invalidate_cache {
            self.invalidate_result_cache();
        }

        for (collection, entity, metadata) in items {
            self.cdc_emit_prebuilt(operation, collection, entity, entity_kind, metadata, false);
        }
    }

    fn run_replica_loop(&self, primary_addr: String) {
        let endpoint = if primary_addr.starts_with("http") {
            primary_addr
        } else {
            format!("http://{primary_addr}")
        };
        let poll_ms = self.inner.db.options().replication.poll_interval_ms;
        let max_count = self.inner.db.options().replication.max_batch_size;
        let mut since_lsn = self.config_u64("red.replication.last_applied_lsn", 0);

        let runtime = match tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
        {
            Ok(runtime) => runtime,
            Err(_) => return,
        };

        runtime.block_on(async move {
            use crate::grpc::proto::red_db_client::RedDbClient;
            use crate::grpc::proto::JsonPayloadRequest;

            let mut client = loop {
                match RedDbClient::connect(endpoint.clone()).await {
                    Ok(client) => {
                        self.persist_replication_health("connecting", "", None, None);
                        break client;
                    }
                    Err(_) => {
                        self.persist_replication_health(
                            "connecting",
                            "waiting for primary connection",
                            None,
                            None,
                        );
                        std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)))
                    }
                }
            };

            // PLAN.md Phase 11.5 — stateful applier guards LSN
            // monotonicity across pulls. Seed with the persisted
            // `last_applied_lsn` so reboots don't lose the chain
            // pointer.
            let applier = crate::replication::logical::LogicalChangeApplier::new(since_lsn);

            loop {
                let payload = crate::json!({
                    "since_lsn": since_lsn,
                    "max_count": max_count
                });
                let request = tonic::Request::new(JsonPayloadRequest {
                    payload_json: crate::json::to_string(&payload)
                        .unwrap_or_else(|_| "{}".to_string()),
                });

                if let Ok(response) = client.pull_wal_records(request).await {
                    if let Ok(value) =
                        crate::json::from_str::<crate::json::Value>(&response.into_inner().payload)
                    {
                        let current_lsn =
                            value.get("current_lsn").and_then(crate::json::Value::as_u64);
                        let oldest_available_lsn = value
                            .get("oldest_available_lsn")
                            .and_then(crate::json::Value::as_u64);
                        if since_lsn > 0
                            && oldest_available_lsn
                                .map(|oldest| oldest > since_lsn.saturating_add(1))
                                .unwrap_or(false)
                        {
                            self.persist_replication_health(
                                "stalled_gap",
                                "replica is behind the oldest logical WAL available on primary; re-bootstrap required",
                                current_lsn,
                                oldest_available_lsn,
                            );
                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
                            continue;
                        }
                        let mut batch_failed = false;
                        if let Some(records) =
                            value.get("records").and_then(crate::json::Value::as_array)
                        {
                            for record in records {
                                let Some(data_hex) =
                                    record.get("data").and_then(crate::json::Value::as_str)
                                else {
                                    continue;
                                };
                                let Ok(data) = hex::decode(data_hex) else {
                                    self.inner.replica_apply_metrics.record(
                                        crate::replication::logical::ApplyErrorKind::Decode,
                                    );
                                    self.persist_replication_health(
                                        "apply_error",
                                        "failed to decode WAL record hex payload",
                                        current_lsn,
                                        oldest_available_lsn,
                                    );
                                    continue;
                                };
                                let Ok(change) = ChangeRecord::decode(&data) else {
                                    self.inner.replica_apply_metrics.record(
                                        crate::replication::logical::ApplyErrorKind::Decode,
                                    );
                                    self.persist_replication_health(
                                        "apply_error",
                                        "failed to decode logical WAL record",
                                        current_lsn,
                                        oldest_available_lsn,
                                    );
                                    continue;
                                };
                                match applier.apply(
                                    self.inner.db.as_ref(),
                                    &change,
                                    ApplyMode::Replica,
                                ) {
                                    Ok(crate::replication::logical::ApplyOutcome::Applied) => {
                                        self.invalidate_result_cache_for_table(&change.collection);
                                        since_lsn = since_lsn.max(change.lsn);
                                        self.persist_replica_lsn(since_lsn);
                                    }
                                    Ok(_) => {
                                        // Idempotent / Skipped: no advance, no error.
                                    }
                                    Err(err) => {
                                        self.inner.replica_apply_metrics.record(err.kind());
                                        // Issue #205 — emit operator-grade event
                                        // for the two replication-fatal kinds. `Gap`
                                        // / `Apply` / `Decode` already persist via
                                        // `persist_replication_health`; the
                                        // OperatorEvent variants only cover the
                                        // two "stream is broken" / "follower
                                        // diverged" conditions an operator must act
                                        // on out-of-band.
                                        match &err {
                                            crate::replication::logical::LogicalApplyError::Divergence { lsn, expected: _, got: _ } => {
                                                crate::telemetry::operator_event::OperatorEvent::Divergence {
                                                    peer: "primary".to_string(),
                                                    leader_lsn: *lsn,
                                                    follower_lsn: since_lsn,
                                                }
                                                .emit_global();
                                            }
                                            crate::replication::logical::LogicalApplyError::Gap { last, next } => {
                                                crate::telemetry::operator_event::OperatorEvent::ReplicationBroken {
                                                    peer: "primary".to_string(),
                                                    reason: format!("stalled gap last={last} next={next}"),
                                                }
                                                .emit_global();
                                            }
                                            _ => {}
                                        }
                                        let kind = match &err {
                                            crate::replication::logical::LogicalApplyError::Gap { .. } => "stalled_gap",
                                            crate::replication::logical::LogicalApplyError::Divergence { .. } => "divergence",
                                            _ => "apply_error",
                                        };
                                        self.persist_replication_health(
                                            kind,
                                            &format!("replica apply rejected: {err}"),
                                            current_lsn,
                                            oldest_available_lsn,
                                        );
                                        // Stop applying this batch. The
                                        // outer loop will retry on next
                                        // pull, which on a real Gap will
                                        // not magically heal — operator
                                        // must rebootstrap. For
                                        // Divergence, we explicitly do
                                        // not advance; this keeps the
                                        // replica visibly unhealthy
                                        // instead of silently swallowing
                                        // corruption.
                                        batch_failed = true;
                                        break;
                                    }
                                }
                            }
                        }
                        // Don't clobber the just-persisted gap/divergence
                        // state with "healthy" after an aborted batch.
                        if !batch_failed {
                            self.persist_replication_health(
                                "healthy",
                                "",
                                current_lsn,
                                oldest_available_lsn,
                            );
                        }
                    } else {
                        self.persist_replication_health(
                            "apply_error",
                            "failed to parse pull_wal_records response",
                            None,
                            None,
                        );
                    }
                } else {
                    self.persist_replication_health(
                        "connecting",
                        "primary pull_wal_records request failed",
                        None,
                        None,
                    );
                }

                std::thread::sleep(std::time::Duration::from_millis(poll_ms));
            }
        });
    }

    /// Poll CDC events since a given LSN.
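    ///
    /// # Example
    ///
    /// Sketch of a tailing consumer (assumes `ChangeEvent` exposes its
    /// LSN; `handle` is a placeholder):
    ///
    /// ```ignore
    /// let mut cursor = 0;
    /// loop {
    ///     for event in rt.cdc_poll(cursor, 256) {
    ///         cursor = cursor.max(event.lsn);
    ///         handle(event);
    ///     }
    /// }
    /// ```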
    pub fn cdc_poll(
        &self,
        since_lsn: u64,
        max_count: usize,
    ) -> Vec<crate::replication::cdc::ChangeEvent> {
        self.inner.cdc.poll(since_lsn, max_count)
    }

    /// PLAN.md Phase 11.4 — current CDC LSN. Public mutation
    /// surfaces (HTTP query, gRPC entity ops) call this immediately
    /// after a successful write to feed `enforce_commit_policy`.
    pub fn cdc_current_lsn(&self) -> u64 {
        self.inner.cdc.current_lsn()
    }

    pub fn kv_watch_events_since(
        &self,
        collection: &str,
        key: &str,
        since_lsn: u64,
        max_count: usize,
    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
        self.inner
            .cdc
            .poll(since_lsn, max_count)
            .into_iter()
            .filter_map(|event| event.kv)
            .filter(|event| event.collection == collection && event.key == key)
            .collect()
    }

    pub fn kv_watch_events_since_prefix(
        &self,
        collection: &str,
        prefix: &str,
        since_lsn: u64,
        max_count: usize,
    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
        self.inner
            .cdc
            .poll(since_lsn, max_count)
            .into_iter()
            .filter_map(|event| event.kv)
            .filter(|event| event.collection == collection && event.key.starts_with(prefix))
            .collect()
    }

    pub(crate) fn kv_watch_subscribe<'a>(
        &'a self,
        collection: impl Into<String>,
        key: impl Into<String>,
        from_lsn: Option<u64>,
    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
        crate::runtime::kv_watch::KvWatchStream::subscribe(
            &self.inner.cdc,
            &self.inner.kv_stats,
            collection,
            key,
            from_lsn,
            self.kv_watch_idle_timeout_ms(),
        )
    }

    pub(crate) fn kv_watch_subscribe_prefix<'a>(
        &'a self,
        collection: impl Into<String>,
        prefix: impl Into<String>,
        from_lsn: Option<u64>,
    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
        crate::runtime::kv_watch::KvWatchStream::subscribe_prefix(
            &self.inner.cdc,
            &self.inner.kv_stats,
            collection,
            prefix,
            from_lsn,
            self.kv_watch_idle_timeout_ms(),
        )
    }

    pub(crate) fn kv_watch_idle_timeout_ms(&self) -> u64 {
        self.config_u64("red.config.kv.watch.idle_timeout_ms", 60_000)
    }

    /// Get backup scheduler status.
    pub fn backup_status(&self) -> crate::replication::scheduler::BackupStatus {
        self.inner.backup_scheduler.status()
    }

    /// Borrow the runtime's result Blob Cache.
    ///
    /// Wired for the `/admin/blob_cache/sweep` and
    /// `/admin/blob_cache/flush_namespace` HTTP handlers (issue #148
    /// follow-up): both delegate to
    /// `crate::storage::cache::sweeper::BlobCacheSweeper`, which takes a
    /// `&BlobCache`. Also used by `trigger_backup` when
    /// `red.config.backup.include_blob_cache=true` to locate the L2
    /// directory for archival.
    pub fn result_blob_cache(&self) -> &crate::storage::cache::BlobCache {
        &self.inner.result_blob_cache
    }

    /// PLAN.md Phase 11.4 — owned snapshot of every registered
    /// replica's state on this primary. Returns an empty vec on
    /// non-primary instances or when no replicas are registered yet.
    pub fn primary_replica_snapshots(&self) -> Vec<crate::replication::primary::ReplicaState> {
        self.inner
            .db
            .replication
            .as_ref()
            .map(|repl| repl.replica_snapshots())
            .unwrap_or_default()
    }

    /// PLAN.md Phase 11.4 — active commit policy. Reads
    /// `RED_PRIMARY_COMMIT_POLICY` once at runtime construction;
    /// future env reloads will need a reload endpoint. Default is
    /// `Local` — current behavior, no replica blocking.
    pub fn commit_policy(&self) -> crate::replication::CommitPolicy {
        crate::replication::CommitPolicy::from_env()
    }

    /// PLAN.md Phase 11.5 — accessor for replica-side apply error
    /// counters (gap / divergence / apply / decode). The returned
    /// snapshot is consistent across the four counters; the labels
    /// match `reddb_replica_apply_errors_total{kind}`.
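    ///
    /// # Example
    ///
    /// Sketch of a /metrics renderer (output format illustrative):
    ///
    /// ```ignore
    /// for (kind, count) in rt.replica_apply_error_counts() {
    ///     println!("reddb_replica_apply_errors_total{{kind=\"{kind:?}\"}} {count}");
    /// }
    /// ```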
    pub fn replica_apply_error_counts(
        &self,
    ) -> [(crate::replication::logical::ApplyErrorKind, u64); 4] {
        self.inner.replica_apply_metrics.snapshot()
    }

    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
    /// returned; `is_configured()` lets callers short-circuit.
    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
        &self.inner.quota_bucket
    }

    /// PLAN.md Phase 11.4 — observability snapshot of every
    /// replica's durable LSN as known to the commit waiter. Empty
    /// vec on non-primary instances or when no replica has acked.
    pub fn commit_waiter_snapshot(&self) -> Vec<(String, u64)> {
        self.inner
            .db
            .replication
            .as_ref()
            .map(|repl| repl.commit_waiter.snapshot())
            .unwrap_or_default()
    }

    /// PLAN.md Phase 11.4 — `(reached, timed_out, not_required, last_micros)`
    /// counters for /metrics. Always-zero on non-primary instances.
    pub fn commit_waiter_metrics_snapshot(&self) -> (u64, u64, u64, u64) {
        self.inner
            .db
            .replication
            .as_ref()
            .map(|repl| repl.commit_waiter.metrics_snapshot())
            .unwrap_or((0, 0, 0, 0))
    }

    /// PLAN.md Phase 11.4 — block until at least `count` replicas
    /// have durably applied through `target_lsn`, or `timeout`
    /// elapses. Returns the `AwaitOutcome` so the caller can decide
    /// whether to surface a timeout error to the client or continue
    /// (the policy mapping lives in the commit dispatcher).
    ///
    /// Foundation only — the write commit path doesn't yet call
    /// this. Wiring it is a per-surface task gated on the operator
    /// flipping `RED_PRIMARY_COMMIT_POLICY` away from `local`.
3855    pub fn await_replica_acks(
3856        &self,
3857        target_lsn: u64,
3858        count: u32,
3859        timeout: std::time::Duration,
3860    ) -> crate::replication::AwaitOutcome {
3861        match &self.inner.db.replication {
3862            Some(repl) => repl.commit_waiter.await_acks(target_lsn, count, timeout),
3863            None => {
3864                // No replication configured: policy must be `Local`.
3865                // Treat as immediate `NotRequired` so callers don't
3866                // block on a degenerate setup.
3867                crate::replication::AwaitOutcome::NotRequired
3868            }
3869        }
3870    }
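
    // Usage sketch: wait up to five seconds for two replicas to durably
    // apply a hypothetical `lsn`; the outcome variants are the ones this
    // file matches elsewhere.
    //
    //     match rt.await_replica_acks(lsn, 2, std::time::Duration::from_millis(5_000)) {
    //         crate::replication::AwaitOutcome::Reached(n) => { /* n replicas acked */ }
    //         crate::replication::AwaitOutcome::TimedOut { observed, required } => {
    //             // caller's policy mapping decides error vs continue
    //         }
    //         crate::replication::AwaitOutcome::NotRequired => { /* no replication */ }
    //     }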
3871
3872    /// PLAN.md Phase 11.4 — enforce the configured commit policy
3873    /// against `post_lsn` (the LSN of the just-completed write).
3874    /// Returns `Ok(AwaitOutcome)` on every successful enforcement
3875    /// (including `Reached`, and `TimedOut` when fail-on-timeout is
3876    /// off). Returns `Err(ReadOnly)` only when all of these hold:
3877    ///   * the policy is `AckN(n)` with `n > 0`
3878    ///   * the wait timed out
3879    ///   * `RED_COMMIT_FAIL_ON_TIMEOUT=true` is set
3880    ///
3881    /// The HTTP / gRPC / wire surfaces map the error to 504 / wire
3882    /// backoff. Default behaviour (env unset) logs a warning and
3883    /// returns success — matching PLAN.md's "default v1 stays local"
3884    /// semantics while letting the operator opt into hard-blocking.
3885    pub fn enforce_commit_policy(
3886        &self,
3887        post_lsn: u64,
3888    ) -> RedDBResult<crate::replication::AwaitOutcome> {
3889        let n = match self.commit_policy() {
3890            crate::replication::CommitPolicy::AckN(n) if n > 0 => n,
3891            _ => return Ok(crate::replication::AwaitOutcome::NotRequired),
3892        };
3893        let timeout_ms = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
3894            .ok()
3895            .and_then(|v| v.parse::<u64>().ok())
3896            .unwrap_or(5_000);
3897        let outcome =
3898            self.await_replica_acks(post_lsn, n, std::time::Duration::from_millis(timeout_ms));
3899        if let crate::replication::AwaitOutcome::TimedOut { observed, required } = &outcome {
3900            tracing::warn!(
3901                target: "reddb::commit",
3902                post_lsn,
3903                observed = *observed,
3904                required = *required,
3905                timeout_ms,
3906                "ack_n: timed out waiting for replicas"
3907            );
3908            let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
3909                .ok()
3910                .map(|v| {
3911                    let t = v.trim();
3912                    t.eq_ignore_ascii_case("true") || t == "1" || t.eq_ignore_ascii_case("yes")
3913                })
3914                .unwrap_or(false);
3915            if fail {
3916                return Err(RedDBError::ReadOnly(format!(
3917                    "commit policy timed out at lsn {post_lsn}: observed={observed} required={required} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
3918                )));
3919            }
3920        }
3921        Ok(outcome)
3922    }
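
    // Caller sketch: how a write surface would adopt this once the
    // commit path is wired (`post_lsn` is hypothetical here); the Err
    // arm is where the 504 / wire-backoff mapping happens.
    //
    //     match rt.enforce_commit_policy(post_lsn) {
    //         Ok(outcome) => { /* Reached, NotRequired, or warned TimedOut */ }
    //         Err(err) => { /* RedDBError::ReadOnly maps to 504 / backoff */ }
    //     }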
3923
3924    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
3925    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
3926    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
3927    /// when the operator set the env but it doesn't parse, and
3928    /// `("disabled", None)` when no key is configured. The pager
3929    /// hookup is deferred — this accessor surfaces the operator's
3930    /// intent for /admin/status without yet using the key in writes.
3931    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
3932        match crate::crypto::page_encryption::key_from_env() {
3933            Ok(Some(_)) => ("enabled", None),
3934            Ok(None) => ("disabled", None),
3935            Err(err) => ("error", Some(err)),
3936        }
3937    }
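
    // Admin-surface sketch: the pair maps directly onto a status row.
    //
    //     let (state, detail) = rt.encryption_at_rest_status();
    //     // state is "enabled" | "disabled" | "error"; detail carries
    //     // the key-parse error message only in the "error" case.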
3938
3939    /// PLAN.md Phase 11.5 — current replica apply health label
3940    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
3941    /// `stalled_gap`). Read from the persisted `red.replication.state`
3942    /// config key updated by the replica loop. Returns `None` on
3943    /// non-replica instances or when no apply has run yet.
3944    pub fn replica_apply_health(&self) -> Option<String> {
3945        let state = self.config_string("red.replication.state", "");
3946        if state.is_empty() {
3947            None
3948        } else {
3949            Some(state)
3950        }
3951    }
3952
3953    /// Current local LSN paired with the LSN of the most recently
3954    /// archived WAL segment. The difference is the replication /
3955    /// archive lag operators alert on (PLAN.md Phase 5.1). Returns
3956    /// `(0, 0)` when neither replication nor archiving is configured.
3957    pub fn wal_archive_progress(&self) -> (u64, u64) {
3958        let current_lsn = self
3959            .inner
3960            .db
3961            .replication
3962            .as_ref()
3963            .map(|repl| {
3964                repl.logical_wal_spool
3965                    .as_ref()
3966                    .map(|spool| spool.current_lsn())
3967                    .unwrap_or_else(|| repl.wal_buffer.current_lsn())
3968            })
3969            .unwrap_or_else(|| self.inner.cdc.current_lsn());
3970        let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
3971        (current_lsn, last_archived_lsn)
3972    }
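
    // Alerting sketch: the lag operators chart is the saturating
    // difference, since the archive can only trail the local LSN.
    //
    //     let (current_lsn, last_archived_lsn) = rt.wal_archive_progress();
    //     let archive_lag = current_lsn.saturating_sub(last_archived_lsn);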
3973
3974    /// Trigger an immediate backup.
3975    pub fn trigger_backup(&self) -> RedDBResult<crate::replication::scheduler::BackupResult> {
3976        self.check_write(crate::runtime::write_gate::WriteKind::Backup)?;
3977        // Defense in depth — check_write above already rejects when
3978        // the lease is NotHeld, but log + audit the lease angle here
3979        // explicitly so dashboards distinguish "lease lost" from a
3980        // generic read-only refusal.
3981        self.assert_remote_write_allowed("admin/backup")?;
3982        let started = std::time::Instant::now();
3983        let snapshot = self.create_snapshot()?;
3984        let mut uploaded = false;
3985
3986        if let (Some(backend), Some(path)) = (&self.inner.db.remote_backend, self.inner.db.path()) {
3987            let default_snapshot_prefix = self.inner.db.options().default_snapshot_prefix();
3988            let default_wal_prefix = self.inner.db.options().default_wal_archive_prefix();
3989            let default_head_key = self.inner.db.options().default_backup_head_key();
3990            let snapshot_prefix = self.config_string(
3991                "red.config.backup.snapshot_prefix",
3992                &default_snapshot_prefix,
3993            );
3994            let wal_prefix =
3995                self.config_string("red.config.wal.archive.prefix", &default_wal_prefix);
3996            let head_key = self.config_string("red.config.backup.head_key", &default_head_key);
3997            let timeline_id = self.config_string("red.config.timeline.id", "main");
3998            let snapshot_key = crate::storage::wal::archive_snapshot(
3999                backend.as_ref(),
4000                path,
4001                snapshot.snapshot_id,
4002                &snapshot_prefix,
4003            )
4004            .map_err(|err| RedDBError::Internal(err.to_string()))?;
4005            let current_lsn = self
4006                .inner
4007                .db
4008                .replication
4009                .as_ref()
4010                .map(|repl| {
4011                    repl.logical_wal_spool
4012                        .as_ref()
4013                        .map(|spool| spool.current_lsn())
4014                        .unwrap_or_else(|| repl.wal_buffer.current_lsn())
4015                })
4016                .unwrap_or_else(|| self.inner.cdc.current_lsn());
4017            let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
4018            // Hash the local snapshot bytes so the manifest can carry
4019            // the digest for restore-side verification (PLAN.md
4020            // Phase 4). Failure to hash is non-fatal — we still
4021            // publish the manifest, just without a checksum, so a
4022            // future fix can backfill rather than losing the backup.
4023            let snapshot_sha256 =
4024                crate::storage::wal::SnapshotManifest::compute_snapshot_sha256(path)
4025                    .map_err(|err| {
4026                        tracing::warn!(
4027                            target: "reddb::backup",
4028                            error = %err,
4029                            snapshot_id = snapshot.snapshot_id,
4030                            "snapshot hash failed; manifest will lack checksum"
4031                        );
4032                    })
4033                    .ok();
4034            let manifest = crate::storage::wal::SnapshotManifest {
4035                timeline_id: timeline_id.clone(),
4036                snapshot_key: snapshot_key.clone(),
4037                snapshot_id: snapshot.snapshot_id,
4038                snapshot_time: snapshot.created_at_unix_ms as u64,
4039                base_lsn: current_lsn,
4040                schema_version: crate::api::REDDB_FORMAT_VERSION,
4041                format_version: crate::api::REDDB_FORMAT_VERSION,
4042                snapshot_sha256,
4043            };
4044            crate::storage::wal::publish_snapshot_manifest(backend.as_ref(), &manifest)
4045                .map_err(|err| RedDBError::Internal(err.to_string()))?;
4046
4047            // PLAN.md Phase 11.3 — read the head of the WAL hash chain
4048            // so the new segment can link back. `None` means we're
4049            // starting a fresh timeline (after a clean restore or on
4050            // first archive ever); the segment's `prev_hash` will be
4051            // `None` and restore-side validation accepts that only for
4052            // the first segment in `plan.wal_segments`.
4053            let prev_segment_hash = self.config_string("red.config.timeline.last_segment_hash", "");
4054            let prev_hash_arg = if prev_segment_hash.is_empty() {
4055                None
4056            } else {
4057                Some(prev_segment_hash)
4058            };
4059
4060            let archived_lsn = if let Some(primary) = &self.inner.db.replication {
4061                let oldest = primary
4062                    .logical_wal_spool
4063                    .as_ref()
4064                    .and_then(|spool| spool.oldest_lsn().ok().flatten())
4065                    .or_else(|| primary.wal_buffer.oldest_lsn())
4066                    .unwrap_or(last_archived_lsn);
4067                if last_archived_lsn > 0 && last_archived_lsn < oldest.saturating_sub(1) {
4068                    return Err(RedDBError::Internal(format!(
4069                        "logical WAL gap detected: last_archived_lsn={last_archived_lsn}, oldest_available_lsn={oldest}"
4070                    )));
4071                }
4072                let records = if let Some(spool) = &primary.logical_wal_spool {
4073                    spool
4074                        .read_since(last_archived_lsn, usize::MAX)
4075                        .map_err(|err| RedDBError::Internal(err.to_string()))?
4076                } else {
4077                    primary.wal_buffer.read_since(last_archived_lsn, usize::MAX)
4078                };
4079                if let Some(meta) = crate::storage::wal::archive_change_records(
4080                    backend.as_ref(),
4081                    &wal_prefix,
4082                    &records,
4083                    prev_hash_arg,
4084                )
4085                .map_err(|err| RedDBError::Internal(err.to_string()))?
4086                {
4087                    if let Some(spool) = &primary.logical_wal_spool {
4088                        let _ = spool.prune_through(meta.lsn_end);
4089                    }
4090                    // Advance the chain head so the next archive call
4091                    // links to this segment's hash. If the segment has
4092                    // no sha256 (legacy / hashing failed) we leave the
4093                    // head as-is — the next segment then carries the
4094                    // prior chain head, preserving continuity.
4095                    if let Some(sha) = &meta.sha256 {
4096                        self.inner.db.store().set_config_tree(
4097                            "red.config.timeline",
4098                            &crate::json!({ "last_segment_hash": sha }),
4099                        );
4100                    }
4101                    meta.lsn_end
4102                } else {
4103                    last_archived_lsn
4104                }
4105            } else {
4106                last_archived_lsn
4107            };
4108
4109            let head = crate::storage::wal::BackupHead {
4110                timeline_id,
4111                snapshot_key,
4112                snapshot_id: snapshot.snapshot_id,
4113                snapshot_time: snapshot.created_at_unix_ms as u64,
4114                current_lsn,
4115                last_archived_lsn: archived_lsn,
4116                wal_prefix,
4117            };
4118            crate::storage::wal::publish_backup_head(backend.as_ref(), &head_key, &head)
4119                .map_err(|err| RedDBError::Internal(err.to_string()))?;
4120            self.inner.db.store().set_config_tree(
4121                "red.config.timeline",
4122                &crate::json!({
4123                    "last_archived_lsn": archived_lsn,
4124                    "id": head.timeline_id
4125                }),
4126            );
4127
4128            // PLAN.md Phase 2.4 — refresh the unified `MANIFEST.json`
4129            // at the prefix root so external tooling sees a single
4130            // catalog of every snapshot + WAL segment with their
4131            // checksums. Best-effort: a manifest publish failure
4132            // doesn't fail the backup (the per-artifact sidecars
4133            // already give restore-side integrity), but it does log
4134            // so dashboards can flag stale catalogs.
4135            if let Err(err) = crate::storage::wal::publish_unified_manifest_for_prefix(
4136                backend.as_ref(),
4137                &snapshot_prefix,
4138            ) {
4139                tracing::warn!(
4140                    target: "reddb::backup",
4141                    error = %err,
4142                    snapshot_prefix = %snapshot_prefix,
4143                    "unified MANIFEST.json refresh failed; per-artifact sidecars unaffected"
4144                );
4145            }
4146
4147            // PLAN.md Phase 11.4 — when the operator picked a
4148            // commit policy that demands replica durability, block
4149            // until the configured count of replicas has acked the
4150            // archived LSN (or the timeout fires). For backup the
4151            // policy decides the *DR posture* — `local` returns
4152            // immediately, `ack_n` ensures at least N replicas saw
4153            // the new tail before we report success to the
4154            // operator. A `TimedOut` is logged but does NOT fail
4155            // the backup: the local WAL + remote upload are durable
4156            // regardless; the missing acks are reported via
4157            // /metrics and /admin/status so the operator can decide.
4158            match self.commit_policy() {
4159                crate::replication::CommitPolicy::AckN(n) if n > 0 => {
4160                    let timeout = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
4161                        .ok()
4162                        .and_then(|v| v.parse::<u64>().ok())
4163                        .unwrap_or(5_000);
4164                    let outcome = self.await_replica_acks(
4165                        archived_lsn,
4166                        n,
4167                        std::time::Duration::from_millis(timeout),
4168                    );
4169                    match outcome {
4170                        crate::replication::AwaitOutcome::Reached(count) => {
4171                            tracing::debug!(
4172                                target: "reddb::backup",
4173                                archived_lsn,
4174                                n,
4175                                count,
4176                                "ack_n: replicas synced before backup return"
4177                            );
4178                        }
4179                        crate::replication::AwaitOutcome::TimedOut { observed, required } => {
4180                            tracing::warn!(
4181                                target: "reddb::backup",
4182                                archived_lsn,
4183                                observed,
4184                                required,
4185                                timeout_ms = timeout,
4186                                "ack_n: timed out waiting for replicas; backup uploaded but DR posture degraded"
4187                            );
4188                        }
4189                        crate::replication::AwaitOutcome::NotRequired => {}
4190                    }
4191                }
4192                _ => {} // Local / RemoteWal / Quorum: no blocking yet
4193            }
4194
4195            // Issue #148 follow-up — opt-in archive of the L2 Blob Cache
4196            // directory tree. Default off so a standard backup stays
4197            // small; flip via `red.config.backup.include_blob_cache=true`
4198            // when warm-cache restore is required (per
4199            // docs/operations/blob-cache-backup-restore.md §1).
4200            //
4201            // The L2 tree is *derived* state (ADR 0006) — its absence
4202            // never causes data loss; it only affects post-restore
4203            // p99 latency until the cache re-warms. We therefore log
4204            // (not fail) on per-file upload errors so a partial L2
4205            // upload never aborts a healthy snapshot+WAL backup.
4206            if self.config_bool("red.config.backup.include_blob_cache", false) {
4207                let blob_cache_prefix = self.config_string(
4208                    "red.config.backup.blob_cache_prefix",
4209                    &format!("{snapshot_prefix}blob_cache/"),
4210                );
4211                if let Some(l2_path) = self.inner.result_blob_cache.l2_path() {
4212                    match crate::storage::cache::archive_blob_cache_l2(
4213                        backend.as_ref(),
4214                        l2_path,
4215                        &blob_cache_prefix,
4216                    ) {
4217                        Ok(count) => {
4218                            tracing::info!(
4219                                target: "reddb::backup",
4220                                files_uploaded = count,
4221                                blob_cache_prefix = %blob_cache_prefix,
4222                                "include_blob_cache: archived L2 directory"
4223                            );
4224                        }
4225                        Err(err) => {
4226                            tracing::warn!(
4227                                target: "reddb::backup",
4228                                error = %err,
4229                                blob_cache_prefix = %blob_cache_prefix,
4230                                "include_blob_cache: L2 archive failed; backup proceeding (cache is derived state)"
4231                            );
4232                        }
4233                    }
4234                } else {
4235                    tracing::debug!(
4236                        target: "reddb::backup",
4237                        "include_blob_cache=true but no L2 path configured; nothing to archive"
4238                    );
4239                }
4240            }
4241
4242            uploaded = true;
4243        }
4244
4245        Ok(crate::replication::scheduler::BackupResult {
4246            snapshot_id: snapshot.snapshot_id,
4247            uploaded,
4248            duration_ms: started.elapsed().as_millis() as u64,
4249            timestamp: snapshot.created_at_unix_ms as u64,
4250        })
4251    }
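
    // Usage sketch: `uploaded` distinguishes a local-only snapshot from
    // a full remote backup (false whenever no remote backend is wired).
    //
    //     let result = rt.trigger_backup()?;
    //     tracing::info!(
    //         snapshot_id = result.snapshot_id,
    //         uploaded = result.uploaded,
    //         duration_ms = result.duration_ms,
    //         "backup finished"
    //     );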
4252
4253    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
4254        let mut pool = self
4255            .inner
4256            .pool
4257            .lock()
4258            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
4259        if pool.active >= self.inner.pool_config.max_connections {
4260            return Err(RedDBError::Internal(
4261                "connection pool exhausted".to_string(),
4262            ));
4263        }
4264
4265        let id = if let Some(id) = pool.idle.pop() {
4266            id
4267        } else {
4268            let id = pool.next_id;
4269            pool.next_id += 1;
4270            id
4271        };
4272        pool.active += 1;
4273        pool.total_checkouts += 1;
4274        drop(pool);
4275
4276        Ok(RuntimeConnection {
4277            id,
4278            inner: Arc::clone(&self.inner),
4279        })
4280    }
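
    // Checkout sketch: acquisition fails fast once `max_connections`
    // is reached. Assumption: `RuntimeConnection`'s Drop impl returns
    // the id to `pool.idle`, which is what makes ids reusable above.
    //
    //     let conn = rt.acquire()?;
    //     // ... use `conn` ...
    //     drop(conn); // id returns to the idle list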
4281
4282    pub fn checkpoint(&self) -> RedDBResult<()> {
4283        // Local fsync always allowed — losing the lease shouldn't
4284        // prevent us from durably persisting what's already in memory.
4285        // The remote upload is the side-effect that risks clobbering a
4286        // peer's state, so it's behind the lease gate.
4287        self.inner.db.flush_local_only().map_err(|err| {
4288            // Issue #205 — local flush failure is a CheckpointFailed
4289            // operator-grade event. The local-flush path also covers
4290            // the WAL fsync we depend on, so a failure here doubles as
4291            // the WalFsyncFailed signal for the runtime entry point.
4292            let msg = err.to_string();
4293            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
4294                lsn: 0,
4295                error: msg.clone(),
4296            }
4297            .emit_global();
4298            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
4299                path: "<flush_local_only>".to_string(),
4300                error: msg.clone(),
4301            }
4302            .emit_global();
4303            RedDBError::Engine(msg)
4304        })?;
4305        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
4306            tracing::warn!(
4307                target: "reddb::serverless::lease",
4308                error = %err,
4309                "checkpoint: skipping remote upload — lease not held"
4310            );
4311            return Ok(());
4312        }
4313        self.inner
4314            .db
4315            .upload_to_remote_backend()
4316            .map_err(|err| RedDBError::Engine(err.to_string()))
4317    }
4318
4319    /// Guard remote-mutating operations on the writer lease.
4320    /// Returns `Ok(())` when no remote backend is configured (the
4321    /// lease is irrelevant) or the lease state is `NotRequired` /
4322    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
4323    /// `NotHeld`, with an audit-friendly action label so the caller
4324    /// can record the rejection.
4325    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
4326        if self.inner.db.remote_backend.is_none() {
4327            return Ok(());
4328        }
4329        match self.inner.write_gate.lease_state() {
4330            crate::runtime::write_gate::LeaseGateState::NotHeld => {
4331                self.inner.audit_log.record(
4332                    action,
4333                    "system",
4334                    "remote_backend",
4335                    "err: writer lease not held",
4336                    crate::json::Value::Null,
4337                );
4338                Err(RedDBError::ReadOnly(format!(
4339                    "writer lease not held — {action} blocked (serverless fence)"
4340                )))
4341            }
4342            _ => Ok(()),
4343        }
4344    }
4345
4346    pub fn run_maintenance(&self) -> RedDBResult<()> {
4347        self.inner
4348            .db
4349            .run_maintenance()
4350            .map_err(|err| RedDBError::Internal(err.to_string()))
4351    }
4352
4353    pub fn scan_collection(
4354        &self,
4355        collection: &str,
4356        cursor: Option<ScanCursor>,
4357        limit: usize,
4358    ) -> RedDBResult<ScanPage> {
4359        let store = self.inner.db.store();
4360        let manager = store
4361            .get_collection(collection)
4362            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
4363
4364        let mut entities = manager.query_all(|_| true);
4365        entities.sort_by_key(|entity| entity.id.raw());
4366
4367        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
4368        let total = entities.len();
4369        let end = total.min(offset.saturating_add(limit.max(1)));
4370        let items = if offset >= total {
4371            Vec::new()
4372        } else {
4373            entities[offset..end].to_vec()
4374        };
4375        let next = (end < total).then_some(ScanCursor { offset: end });
4376
4377        Ok(ScanPage {
4378            collection: collection.to_string(),
4379            items,
4380            next,
4381            total,
4382        })
4383    }
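
    // Pagination sketch: drain a collection with the offset cursor
    // (collection name is hypothetical).
    //
    //     let mut cursor = None;
    //     loop {
    //         let page = rt.scan_collection("users", cursor, 500)?;
    //         for entity in &page.items { /* ... */ }
    //         match page.next {
    //             Some(next) => cursor = Some(next),
    //             None => break,
    //         }
    //     }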
4384
4385    pub fn catalog(&self) -> CatalogModelSnapshot {
4386        self.inner.db.catalog_model_snapshot()
4387    }
4388
4389    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
4390        self.inner.db.catalog_consistency_report()
4391    }
4392
4393    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
4394        crate::catalog::attention_summary(&self.catalog())
4395    }
4396
4397    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
4398        crate::catalog::collection_attention(&self.catalog())
4399    }
4400
4401    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
4402        crate::catalog::index_attention(&self.catalog())
4403    }
4404
4405    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
4406        crate::catalog::graph_projection_attention(&self.catalog())
4407    }
4408
4409    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
4410        crate::catalog::analytics_job_attention(&self.catalog())
4411    }
4412
4413    pub fn stats(&self) -> RuntimeStats {
4414        let pool = runtime_pool_lock(self);
4415        RuntimeStats {
4416            active_connections: pool.active,
4417            idle_connections: pool.idle.len(),
4418            total_checkouts: pool.total_checkouts,
4419            paged_mode: self.inner.db.is_paged(),
4420            started_at_unix_ms: self.inner.started_at_unix_ms,
4421            store: self.inner.db.stats(),
4422            system: SystemInfo::collect(),
4423            result_blob_cache: self.inner.result_blob_cache.stats(),
4424            kv: self.inner.kv_stats.snapshot(),
4425            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
4426        }
4427    }
4428
4429    pub(crate) fn record_metrics_ingest(
4430        &self,
4431        accepted_samples: u64,
4432        accepted_series: u64,
4433        rejected_samples: u64,
4434        rejected_series: u64,
4435    ) {
4436        self.inner.metrics_ingest_stats.record(
4437            accepted_samples,
4438            accepted_series,
4439            rejected_samples,
4440            rejected_series,
4441        );
4442    }
4443
4444    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
4445        self.inner
4446            .metrics_ingest_stats
4447            .record_cardinality_budget_rejections(rejected_series);
4448    }
4449
4450    pub(crate) fn record_metrics_tenant_activity(
4451        &self,
4452        tenant: &str,
4453        namespace: &str,
4454        operation: &str,
4455    ) {
4456        self.inner
4457            .metrics_tenant_activity_stats
4458            .record(tenant, namespace, operation);
4459    }
4460
4461    pub(crate) fn metrics_tenant_activity_snapshot(
4462        &self,
4463    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
4464        self.inner.metrics_tenant_activity_stats.snapshot()
4465    }
4466
4467    /// Execute a query under a typed scope override without embedding
4468    /// the tenant / user / role values into the SQL string. Use this
4469    /// from transport middleware (HTTP / gRPC / worker loops) where the
4470    /// scope is resolved from auth claims and the SQL is a parameterised
4471    /// template — avoids the string-concat injection risk of building
4472    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4473    /// prepared statements that didn't know about tenancy.
4474    ///
4475    /// Precedence matches the `WITHIN` clause: the passed `scope`
4476    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4477    /// The override is pushed on the thread-local scope stack for the
4478    /// duration of the call and popped on return — pool-shared
4479    /// connections cannot leak it across requests.
4480    pub fn execute_query_with_scope(
4481        &self,
4482        query: &str,
4483        scope: crate::runtime::within_clause::ScopeOverride,
4484    ) -> RedDBResult<RuntimeQueryResult> {
4485        if scope.is_empty() {
4486            return self.execute_query(query);
4487        }
4488        let _scope_guard = ScopeOverrideGuard::install(scope);
4489        self.execute_query(query)
4490    }
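
    // Middleware sketch: scope resolved from auth claims, SQL kept as a
    // clean template. How the `ScopeOverride` is built lives in
    // `runtime::within_clause`; the helper below is hypothetical.
    //
    //     let scope = resolve_scope_from_claims(&claims); // hypothetical
    //     let result = rt.execute_query_with_scope("SELECT * FROM orders", scope)?;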
4491
4492    /// Issue #205 — single lifecycle exit for slow-query logging.
4493    ///
4494    /// `execute_query_inner` does the real work; this wrapper times it
4495    /// and, if elapsed exceeds the configured threshold, hands the
4496    /// `(QueryKind, elapsed_ms, sql, scope)` tuple to the
4497    /// SlowQueryLogger (the SQL is passed raw for now; see the
4498    /// redaction note in the body). Threshold + sample_pct were captured
4499    /// at SlowQueryLogger construction (runtime startup), so the per-call
4500    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4501        let started = std::time::Instant::now();
4502        let result = self.execute_query_inner(query);
4503        let elapsed_ms = started.elapsed().as_millis() as u64;
4504
4505        // Build the EffectiveScope from the same thread-locals that the
4506        // statement-frame build consults — keeps the slow-log row
4507        // consistent with the audit / RLS view of "this statement".
4508        // `ai_scope()` is the canonical builder.
4509        let scope = self.ai_scope();
4510        let kind = match result
4511            .as_ref()
4512            .map(|r| r.statement_type)
4513            .unwrap_or("select")
4514        {
4515            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4516            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4517            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4518            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4519            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4520        };
4521        // SQL redaction: pass the raw query through. The slow-query
4522        // logger writes structured JSON so embedded literals stay
4523        // escape-safe at the JSON boundary (proven by
4524        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4525        // PII redaction (e.g. literal masking) is a follow-up.
4526        self.inner
4527            .slow_query_logger
4528            .record(kind, elapsed_ms, query.to_string(), &scope);
4529
4530        result
4531    }
4532
4533    #[inline(never)]
4534    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4535        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4536        //
4537        // Hoisted above every fixed per-statement cost the normal path
4538        // pays (WITHIN strip, SET LOCAL parse, tx_local_tenants read,
4539        // snapshot guard, tracing span, tx_contexts read) because the
4540        // bench's `select_point` scenario was observed at 28× PostgreSQL's
4541        // latency — the dominant cost wasn't the entity fetch but the
4542        // ceremony before it. Only fires when there's no ambient
4543        // transaction context or WITHIN override, so the snapshot
4544        // install we skip truly is a no-op for this query.
4545        if !has_scope_override_active()
4546            && !query.trim_start().starts_with("WITHIN")
4547            && !query.trim_start().starts_with("within")
4548            && !self
4549                .inner
4550                .tx_contexts
4551                .read()
4552                .contains_key(&current_connection_id())
4553        {
4554            if let Some(result) = self.try_fast_entity_lookup(query) {
4555                return result;
4556            }
4557        }
4558
4559        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4560        // strip the prefix, push a stack-scoped override, recurse on
4561        // the inner statement, pop on return. Stack lives in a
4562        // thread-local but is balanced by the RAII guard, so a
4563        // pool-shared connection cannot leak the override across
4564        // requests and an early `?` return still pops cleanly.
4565        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4566            Ok(Some((scope, inner))) => {
4567                let _scope_guard = ScopeOverrideGuard::install(scope);
4568                // Re-enter the inner path, NOT `execute_query`, so the
4569                // slow-query lifecycle hook records exactly one row per
4570                // top-level statement (the WITHIN-stripped form would
4571                // double-record).
4572                return self.execute_query_inner(inner);
4573            }
4574            Ok(None) => {}
4575            Err(msg) => return Err(RedDBError::Query(msg)),
4576        }
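
        // Shape of the prefix handled above, following the grammar in
        // the comment (tenant required, user / role optional):
        //
        //     WITHIN TENANT 'acme' USER 'ops@acme' AS ROLE 'analyst'
        //         SELECT * FROM orders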
4577
4578        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4579        // inner statement (WITHOUT executing it) and returns the
4580        // CanonicalLogicalNode tree as rows so the caller can see the
4581        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4582        // is a distinct schema-diff command and continues down the
4583        // regular SQL path.
4584        if let Some(inner) = strip_explain_prefix(query) {
4585            return self.explain_as_rows(query, inner);
4586        }
4587
4588        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4589        // override and return. Outside a transaction the statement is
4590        // an error (matches PG semantics: SET LOCAL only takes effect
4591        // within an active transaction).
4592        if let Some(value) = parse_set_local_tenant(query)? {
4593            let conn_id = current_connection_id();
4594            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4595                return Err(RedDBError::Query(
4596                    "SET LOCAL TENANT requires an active transaction".to_string(),
4597                ));
4598            }
4599            self.inner
4600                .tx_local_tenants
4601                .write()
4602                .insert(conn_id, value.clone());
4603            return Ok(RuntimeQueryResult::ok_message(
4604                query.to_string(),
4605                &match &value {
4606                    Some(id) => format!("local tenant set: {id}"),
4607                    None => "local tenant cleared".to_string(),
4608                },
4609                "set_local_tenant",
4610            ));
4611        }
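
        // Transactional shape this enforces (sketch; BEGIN / COMMIT
        // spellings are assumed, the gate only checks `tx_contexts`):
        //
        //     BEGIN;
        //     SET LOCAL TENANT 'acme';  -- ok: an active tx context exists
        //     SELECT * FROM orders;     -- scoped to 'acme'
        //     COMMIT;                   -- override assumed cleared at tx end (PG parity)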
4612
4613        if super::red_schema::is_system_schema_write(query) {
4614            return Err(RedDBError::Query(
4615                super::red_schema::READ_ONLY_ERROR.to_string(),
4616            ));
4617        }
4618
4619        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4620        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4621
4622        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4623        let _frame_guards = frame.install(self);
4624
4625        // Phase 6 logging: enter a span stamped with conn_id / tenant
4626        // / query_len. Every downstream tracing::info!/warn!/error!
4627        // inherits these fields — no need to thread them manually
4628        // through storage/scan layers. Entered AFTER the WITHIN /
4629        // SET LOCAL TENANT resolution above so the span reflects the
4630        // effective scope for this statement.
4631        let _log_span = crate::telemetry::span::query_span(query).entered();
4632
4633        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4634        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4635            return self.execute_query_expr(rewritten);
4636        }
4637
4638        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4639        if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4640            return result;
4641        }
4642
4643        // ── Result cache: return cached result if still fresh (30s TTL) ──
4644        if let Some(result) = frame.read_result_cache(self) {
4645            return Ok(result);
4646        }
4647
4648        let prepared = frame.prepare_statement(self, execution_query)?;
4649        let mode = prepared.mode;
4650        let expr = prepared.expr;
4651
4652        let statement = query_expr_name(&expr);
4653        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4654
4655        let _lock_guard = frame.prepare_dispatch(self, &expr)?;
4656        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4657
4658        let query_result = match expr {
4659            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4660                // Apply MVCC visibility + RLS gate while materialising the
4661                // graph: every node entity is screened against the source
4662                // collection's policy chain (basic and `Nodes`-targeted)
4663                // and dropped when the caller's tenant / role doesn't
4664                // admit it. Edges are pruned automatically because the
4665                // graph builder skips edges whose endpoints aren't in
4666                // `allowed_nodes`.
4667                let (graph, node_properties, edge_properties) =
4668                    self.materialize_graph_with_rls()?;
4669                let result =
4670                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
4671                        &graph,
4672                        &expr,
4673                        node_properties,
4674                        edge_properties,
4675                    )
4676                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4677
4678                Ok(RuntimeQueryResult {
4679                    query: query.to_string(),
4680                    mode,
4681                    statement,
4682                    engine: "materialized-graph",
4683                    result,
4684                    affected_rows: 0,
4685                    statement_type: "select",
4686                })
4687            }
4688            QueryExpr::Table(table) => {
4689                let table = self.resolve_table_expr_subqueries(
4690                    table,
4691                    &frame as &dyn super::statement_frame::ReadFrame,
4692                )?;
4693                if super::red_schema::is_virtual_table(&table.table) {
4694                    return Ok(RuntimeQueryResult {
4695                        query: query.to_string(),
4696                        mode,
4697                        statement,
4698                        engine: "runtime-red-schema",
4699                        result: super::red_schema::red_query(
4700                            self,
4701                            &table.table,
4702                            &table,
4703                            &frame as &dyn super::statement_frame::ReadFrame,
4704                        )?,
4705                        affected_rows: 0,
4706                        statement_type: "select",
4707                    });
4708                }
4709
4710                if let Some(result) = self.execute_probabilistic_select(&table)? {
4711                    return Ok(RuntimeQueryResult {
4712                        query: query.to_string(),
4713                        mode,
4714                        statement,
4715                        engine: "runtime-probabilistic",
4716                        result,
4717                        affected_rows: 0,
4718                        statement_type: "select",
4719                    });
4720                }
4721
4722                // Foreign-table intercept (Phase 3.2.2 PG parity).
4723                //
4724                // When the referenced table matches a `CREATE FOREIGN TABLE`
4725                // registration, short-circuit into the FDW scan. Phase 3.2
4726                // wrappers don't yet support pushdown, so filters/projections
4727                // apply post-scan via `apply_foreign_table_filters` — good
4728                // enough for correctness; perf work lands in 3.2.3.
4729                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4730                    let records = self
4731                        .inner
4732                        .foreign_tables
4733                        .scan(&table.table)
4734                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4735                    let result = apply_foreign_table_filters(records, &table);
4736                    return Ok(RuntimeQueryResult {
4737                        query: query.to_string(),
4738                        mode,
4739                        statement,
4740                        engine: "runtime-fdw",
4741                        result,
4742                        affected_rows: 0,
4743                        statement_type: "select",
4744                    });
4745                }
4746
4747                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4748                //
4749                // When RLS is enabled on this table, fetch every policy
4750                // that applies to the current (role, SELECT) pair and
4751                // fold them into the query's WHERE clause: policies
4752                // OR-combine (any of them admitting the row is enough),
4753                // then AND into the caller's existing filter.
4754                //
4755                // Anonymous callers (no thread-local identity) pass
4756                // `role = None`; policies with a specific `TO role`
4757                // clause skip, but `TO PUBLIC` policies still apply.
4758                //
4759                // When `inject_rls_filters` returns `None` the table has
4760                // RLS enabled but no policy admits the caller's role —
4761                // short-circuit with an empty result set instead of
4762                // synthesising a contradiction filter.
4763                let Some(table_with_rls) = self.authorize_relational_table_select(
4764                    table,
4765                    &frame as &dyn super::statement_frame::ReadFrame,
4766                )?
4767                else {
4768                    let empty = crate::storage::query::unified::UnifiedResult::empty();
4769                    return Ok(RuntimeQueryResult {
4770                        query: query.to_string(),
4771                        mode,
4772                        statement,
4773                        engine: "runtime-table-rls",
4774                        result: empty,
4775                        affected_rows: 0,
4776                        statement_type: "select",
4777                    });
4778                };
4779                Ok(RuntimeQueryResult {
4780                    query: query.to_string(),
4781                    mode,
4782                    statement,
4783                    engine: "runtime-table",
4784                    result: execute_runtime_table_query(
4785                        &self.inner.db,
4786                        &table_with_rls,
4787                        Some(&self.inner.index_store),
4788                    )?,
4789                    affected_rows: 0,
4790                    statement_type: "select",
4791                })
4792            }
4793            QueryExpr::Join(join) => {
4794                // Fold per-table RLS filters into each `QueryExpr::Table`
4795                // leaf of the join tree before executing. Without this
4796                // the join executor scans both tables raw and ignores
4797                // policies — a `WITHIN TENANT 'x'` against a join of
4798                // two tenant-scoped tables would leak cross-tenant rows.
4799                // When any leaf has RLS enabled and zero matching policy,
4800                // short-circuit to an empty join result instead of
4801                // emitting a contradiction filter.
4802                let join_with_rls = match self.authorize_relational_join_select(
4803                    join,
4804                    &frame as &dyn super::statement_frame::ReadFrame,
4805                )? {
4806                    Some(j) => j,
4807                    None => {
4808                        return Ok(RuntimeQueryResult {
4809                            query: query.to_string(),
4810                            mode,
4811                            statement,
4812                            engine: "runtime-join-rls",
4813                            result: crate::storage::query::unified::UnifiedResult::empty(),
4814                            affected_rows: 0,
4815                            statement_type: "select",
4816                        });
4817                    }
4818                };
4819                Ok(RuntimeQueryResult {
4820                    query: query.to_string(),
4821                    mode,
4822                    statement,
4823                    engine: "runtime-join",
4824                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
4825                    affected_rows: 0,
4826                    statement_type: "select",
4827                })
4828            }
4829            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
4830                query: query.to_string(),
4831                mode,
4832                statement,
4833                engine: "runtime-vector",
4834                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
4835                affected_rows: 0,
4836                statement_type: "select",
4837            }),
4838            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
4839                query: query.to_string(),
4840                mode,
4841                statement,
4842                engine: "runtime-hybrid",
4843                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
4844                affected_rows: 0,
4845                statement_type: "select",
4846            }),
4847            // DML execution
4848            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
4849                Err(RedDBError::Query(
4850                    super::red_schema::READ_ONLY_ERROR.to_string(),
4851                ))
4852            }
4853            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
4854                Err(RedDBError::Query(
4855                    super::red_schema::READ_ONLY_ERROR.to_string(),
4856                ))
4857            }
4858            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
4859                Err(RedDBError::Query(
4860                    super::red_schema::READ_ONLY_ERROR.to_string(),
4861                ))
4862            }
4863            QueryExpr::Insert(ref insert) => self
4864                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
4865                    self.execute_insert(query, insert)
4866                }),
4867            QueryExpr::Update(ref update) => self
4868                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
4869                    self.execute_update(query, update)
4870                }),
4871            QueryExpr::Delete(ref delete) => self
4872                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
4873                    self.execute_delete(query, delete)
4874                }),
4875            // DDL execution
4876            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
4877            QueryExpr::CreateCollection(ref create) => {
4878                self.execute_create_collection(query, create)
4879            }
4880            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
4881            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
4882            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
4883            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
4884            QueryExpr::DropDocument(ref drop_document) => {
4885                self.execute_drop_document(query, drop_document)
4886            }
4887            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
4888            QueryExpr::DropCollection(ref drop_collection) => {
4889                self.execute_drop_collection(query, drop_collection)
4890            }
4891            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
4892            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
4893            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
4894            // Graph analytics commands
4895            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
4896            // Search commands
4897            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
4898            // ASK: RAG query with LLM synthesis
4899            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
4900            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
4901            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
4902            QueryExpr::ProbabilisticCommand(ref cmd) => {
4903                self.execute_probabilistic_command(query, cmd)
4904            }
4905            // Time-series DDL
4906            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
4907            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
4908            // Queue DDL and commands
4909            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
4910            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
4911            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
4912            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
4913            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
4914            QueryExpr::EventsBackfill(ref backfill) => {
4915                self.execute_events_backfill(query, backfill)
4916            }
4917            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
4918                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
4919            ))),
4920            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
4921            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
4922            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
4923            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
4924            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
4925            // SET CONFIG key = value
4926            QueryExpr::SetConfig { ref key, ref value } => {
4927                if key.starts_with("red.secret.") {
4928                    return Err(RedDBError::Query(
4929                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
4930                    ));
4931                }
4932                let store = self.inner.db.store();
4933                let json_val = match value {
4934                    Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
4935                    Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
4936                    Value::Float(n) => crate::serde_json::Value::Number(*n),
4937                    Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
4938                    _ => crate::serde_json::Value::String(value.to_string()),
4939                };
4940                store.set_config_tree(key, &json_val);
4941                update_current_config_value(key, value.clone());
4942                // Config changes can flip runtime behavior mid-session
4943                // (auto_decrypt, auto_encrypt, etc.) — invalidate the
4944                // result cache so subsequent reads re-execute against
4945                // the new config.
4946                self.invalidate_result_cache();
4947                Ok(RuntimeQueryResult::ok_message(
4948                    query.to_string(),
4949                    &format!("config set: {key}"),
4950                    "set",
4951                ))
4952            }
4953            // SET SECRET key = value
4954            QueryExpr::SetSecret { ref key, ref value } => {
4955                if key.starts_with("red.config.") {
4956                    return Err(RedDBError::Query(
4957                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
4958                    ));
4959                }
4960                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4961                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
4962                })?;
4963                if matches!(value, Value::Null) {
4964                    auth_store
4965                        .vault_kv_try_delete(key)
4966                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4967                    update_current_secret_value(key, None);
4968                    self.invalidate_result_cache();
4969                    return Ok(RuntimeQueryResult::ok_message(
4970                        query.to_string(),
4971                        &format!("secret deleted: {key}"),
4972                        "delete_secret",
4973                    ));
4974                }
4975                let value = secret_sql_value_to_string(value)?;
4976                auth_store
4977                    .vault_kv_try_set(key.clone(), value.clone())
4978                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4979                update_current_secret_value(key, Some(value));
4980                self.invalidate_result_cache();
4981                Ok(RuntimeQueryResult::ok_message(
4982                    query.to_string(),
4983                    &format!("secret set: {key}"),
4984                    "set_secret",
4985                ))
4986            }
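            // Illustrative usage (SQL sketch; key and value hypothetical):
            //
            //     SET SECRET api.token = 'sk_live_...';
            //     SET SECRET api.token = NULL;  -- NULL deletes, same effect
            //                                   -- as DELETE SECRET api.token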
4987            // DELETE SECRET key
4988            QueryExpr::DeleteSecret { ref key } => {
4989                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4990                    RedDBError::Query(
4991                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
4992                    )
4993                })?;
4994                let deleted = auth_store
4995                    .vault_kv_try_delete(key)
4996                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4997                if deleted {
4998                    update_current_secret_value(key, None);
4999                }
5000                self.invalidate_result_cache();
5001                Ok(RuntimeQueryResult::ok_message(
5002                    query.to_string(),
5003                    &format!("secret deleted: {key}"),
5004                    if deleted {
5005                        "delete_secret"
5006                    } else {
5007                        "delete_secret_not_found"
5008                    },
5009                ))
5010            }
5011            // SHOW SECRET[S] [prefix]
5012            QueryExpr::ShowSecrets { ref prefix } => {
5013                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5014                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
5015                })?;
5016                if !auth_store.is_vault_backed() {
5017                    return Err(RedDBError::Query(
5018                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
5019                    ));
5020                }
5021                let mut keys = auth_store.vault_kv_keys();
5022                keys.sort();
5023                let mut result = UnifiedResult::with_columns(vec![
5024                    "key".into(),
5025                    "value".into(),
5026                    "status".into(),
5027                ]);
5028                for key in keys {
5029                    if let Some(ref pfx) = prefix {
5030                        if !key.starts_with(pfx) {
5031                            continue;
5032                        }
5033                    }
5034                    let mut record = UnifiedRecord::new();
5035                    record.set("key", Value::text(key));
5036                    record.set("value", Value::text("***"));
5037                    record.set("status", Value::text("active"));
5038                    result.push(record);
5039                }
5040                Ok(RuntimeQueryResult {
5041                    query: query.to_string(),
5042                    mode,
5043                    statement: "show_secrets",
5044                    engine: "runtime-secret",
5045                    result,
5046                    affected_rows: 0,
5047                    statement_type: "select",
5048                })
5049            }
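            // Values are never returned in cleartext: every row masks the
            // value column as "***". Example (sketch; hypothetical keys):
            //
            //     SHOW SECRETS 'api.';
            //     -- key        value  status
            //     -- api.token  ***    active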
5050            // SHOW CONFIG [prefix]
5051            QueryExpr::ShowConfig { ref prefix } => {
5052                let store = self.inner.db.store();
5053                let all_collections = store.list_collections();
5054                if !all_collections.iter().any(|c| c == "red_config") {
5055                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5056                    return Ok(RuntimeQueryResult {
5057                        query: query.to_string(),
5058                        mode,
5059                        statement: "show_config",
5060                        engine: "runtime-config",
5061                        result,
5062                        affected_rows: 0,
5063                        statement_type: "select",
5064                    });
5065                }
5066                let manager = store
5067                    .get_collection("red_config")
5068                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
5069                let entities = manager.query_all(|_| true);
5070                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
5071                for entity in entities {
5072                    if let EntityData::Row(ref row) = entity.data {
5073                        if let Some(ref named) = row.named {
5074                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
5075                            let val = named.get("value").cloned().unwrap_or(Value::Null);
5076                            let key_str = match &key_val {
5077                                Value::Text(s) => s.as_ref(),
5078                                _ => continue,
5079                            };
5080                            if let Some(ref pfx) = prefix {
5081                                if !key_str.starts_with(pfx.as_str()) {
5082                                    continue;
5083                                }
5084                            }
5085                            let entity_id = entity.id.raw();
5086                            match latest.get(key_str) {
5087                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
5088                                _ => {
5089                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
5090                                }
5091                            }
5092                        }
5093                    }
5094                }
5095                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5096                for (_, key_val, val) in latest.into_values() {
5097                    let mut record = UnifiedRecord::new();
5098                    record.set("key", key_val);
5099                    record.set("value", val);
5100                    result.push(record);
5101                }
5102                Ok(RuntimeQueryResult {
5103                    query: query.to_string(),
5104                    mode,
5105                    statement: "show_config",
5106                    engine: "runtime-config",
5107                    result,
5108                    affected_rows: 0,
5109                    statement_type: "select",
5110                })
5111            }
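            // Duplicate keys fold to the newest write (highest entity id
            // wins), so repeated SET CONFIG calls surface once. Sketch with
            // a hypothetical key:
            //
            //     SET CONFIG cache.ttl = 30;
            //     SET CONFIG cache.ttl = 60;
            //     SHOW CONFIG 'cache.';  -- one row: cache.ttl → 60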
5112            // Session-local multi-tenancy handle (Phase 2.5.3).
5113            //
5114            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
5115            // the thread-local; SHOW TENANT returns it. Paired with the
5116            // CURRENT_TENANT() scalar for use in RLS policies.
5117            QueryExpr::SetTenant(ref value) => {
5118                match value {
5119                    Some(id) => set_current_tenant(id.clone()),
5120                    None => clear_current_tenant(),
5121                }
5122                Ok(RuntimeQueryResult::ok_message(
5123                    query.to_string(),
5124                    &match value {
5125                        Some(id) => format!("tenant set: {id}"),
5126                        None => "tenant cleared".to_string(),
5127                    },
5128                    "set_tenant",
5129                ))
5130            }
5131            QueryExpr::ShowTenant => {
5132                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
5133                let mut record = UnifiedRecord::new();
5134                record.set(
5135                    "tenant",
5136                    current_tenant().map(Value::text).unwrap_or(Value::Null),
5137                );
5138                result.push(record);
5139                Ok(RuntimeQueryResult {
5140                    query: query.to_string(),
5141                    mode,
5142                    statement: "show_tenant",
5143                    engine: "runtime-tenant",
5144                    result,
5145                    affected_rows: 0,
5146                    statement_type: "select",
5147                })
5148            }
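            // Illustrative session (SQL sketch; the tenant id is
            // hypothetical):
            //
            //     SET TENANT 'acme';   -- CURRENT_TENANT() now yields 'acme'
            //     SHOW TENANT;         -- one row: tenant = acme
            //     SET TENANT NULL;     -- cleared; CURRENT_TENANT() → NULL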
5149            // Transaction control (Phase 2.3 PG parity).
5150            //
5151            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
5152            // the current connection's id. COMMIT/ROLLBACK release it through
5153            // the `SnapshotManager` so future snapshots see the correct set of
5154            // active/aborted transactions.
5155            //
5156            // Tuple stamping (xmin/xmax) and read-path visibility filtering
5157            // landed in Phase 2.3.2; this arm also settles the pending
5158            // tombstone / update / WAL bookkeeping. Without a TxnContext,
5159            // statements stay autocommit (xid=0, visible to every snapshot).
5160            QueryExpr::TransactionControl(ref ctl) => {
5161                use crate::storage::query::ast::TxnControl;
5162                use crate::storage::transaction::snapshot::{TxnContext, Xid};
5163                use crate::storage::transaction::IsolationLevel;
5164
5165                // Transactions are keyed by the thread-local connection id.
5166                // The stdio/gRPC paths supply a real per-connection id; for
5167                // embedded use (roughly one RedDBRuntime per process) we
5168                // fall back to a deterministic placeholder.
5169                let conn_id = current_connection_id();
5170
5171                let (kind, msg) = match ctl {
5172                    TxnControl::Begin => {
5173                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5174                        let xid = mgr.begin();
5175                        let snapshot = mgr.snapshot(xid);
5176                        let ctx = TxnContext {
5177                            xid,
5178                            isolation: IsolationLevel::SnapshotIsolation,
5179                            snapshot,
5180                            savepoints: Vec::new(),
5181                            released_sub_xids: Vec::new(),
5182                        };
5183                        self.inner.tx_contexts.write().insert(conn_id, ctx);
5184                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
5185                    }
5186                    TxnControl::Commit => {
5187                        // SET LOCAL TENANT ends with the transaction.
5188                        self.inner.tx_local_tenants.write().remove(&conn_id);
5189                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5190                        match ctx {
5191                            Some(ctx) => {
5192                                let mut own_xids = std::collections::HashSet::new();
5193                                own_xids.insert(ctx.xid);
5194                                for (_, sub) in &ctx.savepoints {
5195                                    own_xids.insert(*sub);
5196                                }
5197                                for sub in &ctx.released_sub_xids {
5198                                    own_xids.insert(*sub);
5199                                }
5200                                if let Err(err) = self.check_table_row_write_conflicts(
5201                                    conn_id,
5202                                    &ctx.snapshot,
5203                                    &own_xids,
5204                                ) {
5205                                    for (_, sub) in &ctx.savepoints {
5206                                        self.inner.snapshot_manager.rollback(*sub);
5207                                    }
5208                                    for sub in &ctx.released_sub_xids {
5209                                        self.inner.snapshot_manager.rollback(*sub);
5210                                    }
5211                                    self.inner.snapshot_manager.rollback(ctx.xid);
5212                                    self.revive_pending_versioned_updates(conn_id);
5213                                    self.revive_pending_tombstones(conn_id);
5214                                    self.discard_pending_kv_watch_events(conn_id);
5215                                    self.discard_pending_store_wal_actions(conn_id);
5216                                    return Err(err);
5217                                }
5218                                self.restore_pending_write_stamps(conn_id);
5219                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
5220                                    for (_, sub) in &ctx.savepoints {
5221                                        self.inner.snapshot_manager.rollback(*sub);
5222                                    }
5223                                    for sub in &ctx.released_sub_xids {
5224                                        self.inner.snapshot_manager.rollback(*sub);
5225                                    }
5226                                    self.inner.snapshot_manager.rollback(ctx.xid);
5227                                    self.revive_pending_versioned_updates(conn_id);
5228                                    self.revive_pending_tombstones(conn_id);
5229                                    self.discard_pending_kv_watch_events(conn_id);
5230                                    return Err(err);
5231                                }
5232                                // Phase 2.3.2e: commit every open sub-xid
5233                                // so they also become visible. Their
5234                                // work is promoted to the parent txn's
5235                                // result exactly like a RELEASE would
5236                                // have done.
5237                                for (_, sub) in &ctx.savepoints {
5238                                    self.inner.snapshot_manager.commit(*sub);
5239                                }
5240                                for sub in &ctx.released_sub_xids {
5241                                    self.inner.snapshot_manager.commit(*sub);
5242                                }
5243                                self.inner.snapshot_manager.commit(ctx.xid);
5244                                self.finalize_pending_versioned_updates(conn_id);
5245                                self.finalize_pending_tombstones(conn_id);
5246                                self.finalize_pending_kv_watch_events(conn_id);
5247                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
5248                            }
5249                            None => (
5250                                "commit",
5251                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
5252                            ),
5253                        }
5254                    }
5255                    TxnControl::Rollback => {
5256                        self.inner.tx_local_tenants.write().remove(&conn_id);
5257                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5258                        match ctx {
5259                            Some(ctx) => {
5260                                // Phase 2.3.2e: abort every open sub-xid
5261                                // too so their writes stay hidden.
5262                                for (_, sub) in &ctx.savepoints {
5263                                    self.inner.snapshot_manager.rollback(*sub);
5264                                }
5265                                for sub in &ctx.released_sub_xids {
5266                                    self.inner.snapshot_manager.rollback(*sub);
5267                                }
5268                                self.inner.snapshot_manager.rollback(ctx.xid);
5269                                // Phase 2.3.2b: tuples that the txn had
5270                                // xmax-stamped become live again — wipe xmax
5271                                // back to 0 so later snapshots see them.
5272                                self.revive_pending_versioned_updates(conn_id);
5273                                self.revive_pending_tombstones(conn_id);
5274                                self.discard_pending_kv_watch_events(conn_id);
5275                                self.discard_pending_store_wal_actions(conn_id);
5276                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
5277                            }
5278                            None => (
5279                                "rollback",
5280                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
5281                            ),
5282                        }
5283                    }
5284                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
5285                    // SAVEPOINT allocates a fresh xid and pushes it
5286                    // onto the per-txn stack so subsequent writes can
5287                    // be selectively rolled back. RELEASE pops without
5288                    // aborting; ROLLBACK TO aborts the sub-xid (and
5289                    // any nested ones) + revives their tombstones.
5290                    TxnControl::Savepoint(name) => {
5291                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5292                        let mut guard = self.inner.tx_contexts.write();
5293                        match guard.get_mut(&conn_id) {
5294                            Some(ctx) => {
5295                                let sub = mgr.begin();
5296                                ctx.savepoints.push((name.clone(), sub));
5297                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
5298                            }
5299                            None => (
5300                                "savepoint",
5301                                "SAVEPOINT outside transaction — no-op".to_string(),
5302                            ),
5303                        }
5304                    }
5305                    TxnControl::ReleaseSavepoint(name) => {
5306                        let mut guard = self.inner.tx_contexts.write();
5307                        match guard.get_mut(&conn_id) {
5308                            Some(ctx) => {
5309                                let pos = ctx
5310                                    .savepoints
5311                                    .iter()
5312                                    .position(|(n, _)| n == name)
5313                                    .ok_or_else(|| {
5314                                        RedDBError::Internal(format!(
5315                                            "savepoint {name} does not exist"
5316                                        ))
5317                                    })?;
5318                                // RELEASE pops the named savepoint and
5319                                // any nested ones. Their sub-xids move
5320                                // to `released_sub_xids` so they commit
5321                                // (or roll back) alongside the parent
5322                                // xid — PG semantics: released
5323                                // savepoints still contribute their
5324                                // work, but their names are gone.
5325                                let released = ctx.savepoints.len() - pos;
5326                                let popped: Vec<Xid> = ctx
5327                                    .savepoints
5328                                    .split_off(pos)
5329                                    .into_iter()
5330                                    .map(|(_, x)| x)
5331                                    .collect();
5332                                ctx.released_sub_xids.extend(popped);
5333                                (
5334                                    "release_savepoint",
5335                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
5336                                )
5337                            }
5338                            None => (
5339                                "release_savepoint",
5340                                "RELEASE outside transaction — no-op".to_string(),
5341                            ),
5342                        }
5343                    }
5344                    TxnControl::RollbackToSavepoint(name) => {
5345                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5346                        // Splice out the savepoint + nested ones under
5347                        // a narrow lock, then run the snapshot-manager
5348                        // + tombstone side-effects without the tx map
5349                        // held so nothing re-enters.
5350                        let drop_result: Option<(Xid, Vec<Xid>)> = {
5351                            let mut guard = self.inner.tx_contexts.write();
5352                            if let Some(ctx) = guard.get_mut(&conn_id) {
5353                                let pos = ctx
5354                                    .savepoints
5355                                    .iter()
5356                                    .position(|(n, _)| n == name)
5357                                    .ok_or_else(|| {
5358                                        RedDBError::Internal(format!(
5359                                            "savepoint {name} does not exist"
5360                                        ))
5361                                    })?;
5362                                let savepoint_xid = ctx.savepoints[pos].1;
5363                                let aborted: Vec<Xid> = ctx
5364                                    .savepoints
5365                                    .split_off(pos)
5366                                    .into_iter()
5367                                    .map(|(_, x)| x)
5368                                    .collect();
5369                                Some((savepoint_xid, aborted))
5370                            } else {
5371                                None
5372                            }
5373                        };
5374
5375                        match drop_result {
5376                            Some((savepoint_xid, aborted)) => {
5377                                for x in &aborted {
5378                                    mgr.rollback(*x);
5379                                }
5380                                let reverted_updates =
5381                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
5382                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
5383                                (
5384                                    "rollback_to_savepoint",
5385                                    format!(
5386                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
5387                                        aborted.len(),
5388                                    ),
5389                                )
5390                            }
5391                            None => (
5392                                "rollback_to_savepoint",
5393                                "ROLLBACK TO outside transaction — no-op".to_string(),
5394                            ),
5395                        }
5396                    }
5397                };
5398                Ok(RuntimeQueryResult::ok_message(
5399                    query.to_string(),
5400                    &msg,
5401                    kind,
5402                ))
5403            }
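            // Illustrative flow (SQL sketch) tying the arms together:
            //
            //     BEGIN;                       -- new xid + snapshot registered
            //     SAVEPOINT sp1;               -- sub-xid pushed on the stack
            //     -- ... writes attributed to sp1 ...
            //     ROLLBACK TO SAVEPOINT sp1;   -- aborts sp1 and anything nested
            //     COMMIT;                      -- parent xid + released subs commit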
5404            // Schema + Sequence DDL (Phase 1.3 PG parity).
5405            //
5406            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5407            // just registers the name in `red_config` under `schema.{name}`.
5408            // Table lookups still happen by collection name; clients using
5409            // `schema.table` qualified names collapse to collection `schema.table`.
5410            //
5411            // Sequences persist a 64-bit counter + metadata (start, increment)
5412            // in `red_config` under `sequence.{name}.*`. The scalar functions
5413            // `nextval('name')` / `currval('name')` land with the MVCC phase,
5414            // once a proper mutating-function dispatch path exists; for now the
5415            // DDL just establishes the catalog entry so clients don't error.
5416            QueryExpr::CreateSchema(ref q) => {
5417                let store = self.inner.db.store();
5418                let key = format!("schema.{}", q.name);
5419                if store.get_config(&key).is_some() {
5420                    if q.if_not_exists {
5421                        return Ok(RuntimeQueryResult::ok_message(
5422                            query.to_string(),
5423                            &format!("schema {} already exists — skipped", q.name),
5424                            "create_schema",
5425                        ));
5426                    }
5427                    return Err(RedDBError::Internal(format!(
5428                        "schema {} already exists",
5429                        q.name
5430                    )));
5431                }
5432                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5433                Ok(RuntimeQueryResult::ok_message(
5434                    query.to_string(),
5435                    &format!("schema {} created", q.name),
5436                    "create_schema",
5437                ))
5438            }
5439            QueryExpr::DropSchema(ref q) => {
5440                let store = self.inner.db.store();
5441                let key = format!("schema.{}", q.name);
5442                let existed = store.get_config(&key).is_some();
5443                if !existed && !q.if_exists {
5444                    return Err(RedDBError::Internal(format!(
5445                        "schema {} does not exist",
5446                        q.name
5447                    )));
5448                }
5449                // Remove the marker from red_config by setting it to null.
5450                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5451                let suffix = if q.cascade {
5452                    " (CASCADE accepted — tables untouched)"
5453                } else {
5454                    ""
5455                };
5456                Ok(RuntimeQueryResult::ok_message(
5457                    query.to_string(),
5458                    &format!("schema {} dropped{}", q.name, suffix),
5459                    "drop_schema",
5460                ))
5461            }
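            // Illustrative usage (SQL sketch; the schema name is
            // hypothetical):
            //
            //     CREATE SCHEMA IF NOT EXISTS analytics;  -- marker written
            //     DROP SCHEMA analytics;                   -- marker nulled
            //     -- `analytics.events` stays an ordinary collection whose
            //     -- name happens to contain a dot; the marker is bookkeeping.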
5462            QueryExpr::CreateSequence(ref q) => {
5463                let store = self.inner.db.store();
5464                let base = format!("sequence.{}", q.name);
5465                let start_key = format!("{base}.start");
5466                let incr_key = format!("{base}.increment");
5467                let curr_key = format!("{base}.current");
5468                if store.get_config(&start_key).is_some() {
5469                    if q.if_not_exists {
5470                        return Ok(RuntimeQueryResult::ok_message(
5471                            query.to_string(),
5472                            &format!("sequence {} already exists — skipped", q.name),
5473                            "create_sequence",
5474                        ));
5475                    }
5476                    return Err(RedDBError::Internal(format!(
5477                        "sequence {} already exists",
5478                        q.name
5479                    )));
5480                }
5481                // Persist start + increment, and set current so the first
5482                // nextval returns `start`.
5483                let initial_current = q.start - q.increment;
5484                store.set_config_tree(
5485                    &start_key,
5486                    &crate::serde_json::Value::Number(q.start as f64),
5487                );
5488                store.set_config_tree(
5489                    &incr_key,
5490                    &crate::serde_json::Value::Number(q.increment as f64),
5491                );
5492                store.set_config_tree(
5493                    &curr_key,
5494                    &crate::serde_json::Value::Number(initial_current as f64),
5495                );
5496                Ok(RuntimeQueryResult::ok_message(
5497                    query.to_string(),
5498                    &format!(
5499                        "sequence {} created (start={}, increment={})",
5500                        q.name, q.start, q.increment
5501                    ),
5502                    "create_sequence",
5503                ))
5504            }
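            // Worked example: CREATE SEQUENCE s START 10 INCREMENT 5 persists
            // current = 10 - 5 = 5, so the first nextval('s') (once that
            // scalar lands) advances to 10, the declared start, then 15, ...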
5505            QueryExpr::DropSequence(ref q) => {
5506                let store = self.inner.db.store();
5507                let base = format!("sequence.{}", q.name);
5508                let existed = store.get_config(&format!("{base}.start")).is_some();
5509                if !existed && !q.if_exists {
5510                    return Err(RedDBError::Internal(format!(
5511                        "sequence {} does not exist",
5512                        q.name
5513                    )));
5514                }
5515                for k in ["start", "increment", "current"] {
5516                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5517                }
5518                Ok(RuntimeQueryResult::ok_message(
5519                    query.to_string(),
5520                    &format!("sequence {} dropped", q.name),
5521                    "drop_sequence",
5522                ))
5523            }
5524            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5525            //
5526            // The view definition is stored in-memory on RuntimeInner (not
5527            // persisted). SELECTs that reference the view name will substitute
5528            // the stored `QueryExpr` via `resolve_view_reference` during
5529            // planning (same entry point used by table-name resolution).
5530            //
5531            // Materialized views additionally allocate a slot in
5532            // `MaterializedViewCache`; a REFRESH repopulates that slot.
5533            QueryExpr::CreateView(ref q) => {
5534                let mut views = self.inner.views.write();
5535                if views.contains_key(&q.name) && !q.or_replace {
5536                    if q.if_not_exists {
5537                        return Ok(RuntimeQueryResult::ok_message(
5538                            query.to_string(),
5539                            &format!("view {} already exists — skipped", q.name),
5540                            "create_view",
5541                        ));
5542                    }
5543                    return Err(RedDBError::Internal(format!(
5544                        "view {} already exists",
5545                        q.name
5546                    )));
5547                }
5548                views.insert(q.name.clone(), Arc::new(q.clone()));
5549                drop(views);
5550
5551                // Materialized view: register cache slot (data is empty until REFRESH).
5552                if q.materialized {
5553                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5554                    let def = MaterializedViewDef {
5555                        name: q.name.clone(),
5556                        query: format!("<parsed view {}>", q.name),
5557                        dependencies: collect_table_refs(&q.query),
5558                        refresh: RefreshPolicy::Manual,
5559                    };
5560                    self.inner.materialized_views.write().register(def);
5561                }
5562                // Plan cache may have cached a plan that didn't know about this
5563                // view — invalidate so future references pick up the new binding.
5564                // Result cache gets flushed too: OR REPLACE must not serve a
5565                // prior execution of the obsolete body.
5566                self.invalidate_plan_cache();
5567                self.invalidate_result_cache();
5568
5569                Ok(RuntimeQueryResult::ok_message(
5570                    query.to_string(),
5571                    &format!(
5572                        "{}view {} created",
5573                        if q.materialized { "materialized " } else { "" },
5574                        q.name
5575                    ),
5576                    "create_view",
5577                ))
5578            }
5579            QueryExpr::DropView(ref q) => {
5580                let mut views = self.inner.views.write();
5581                let existed = views.remove(&q.name).is_some();
5582                drop(views);
5583                if q.materialized || existed {
5584                    // Try the materialized cache too — silent if absent.
5585                    self.inner.materialized_views.write().remove(&q.name);
5586                }
5587                // Drop any plan / result cache entries that baked the
5588                // view body into their QueryExpr.
5589                self.invalidate_plan_cache();
5590                self.invalidate_result_cache();
5591                if !existed && !q.if_exists {
5592                    return Err(RedDBError::Internal(format!(
5593                        "view {} does not exist",
5594                        q.name
5595                    )));
5596                }
5598                Ok(RuntimeQueryResult::ok_message(
5599                    query.to_string(),
5600                    &format!("view {} dropped", q.name),
5601                    "drop_view",
5602                ))
5603            }
5604            QueryExpr::RefreshMaterializedView(ref q) => {
5605                // Look up the view definition, execute its underlying query,
5606                // and stash the serialized result in the materialized cache.
5607                let view = {
5608                    let views = self.inner.views.read();
5609                    views.get(&q.name).cloned()
5610                };
5611                let view = match view {
5612                    Some(v) => v,
5613                    None => {
5614                        return Err(RedDBError::Internal(format!(
5615                            "view {} does not exist",
5616                            q.name
5617                        )))
5618                    }
5619                };
5620                if !view.materialized {
5621                    return Err(RedDBError::Internal(format!(
5622                        "view {} is not materialized — REFRESH requires \
5623                         CREATE MATERIALIZED VIEW",
5624                        q.name
5625                    )));
5626                }
5627                // Execute the underlying query fresh.
5628                let inner_result = self.execute_query_expr((*view.query).clone())?;
5629                // Cache data = Debug-formatted result (opaque blob; the
5630                // read path returns it verbatim for now).
5631                let serialized = format!("{:?}", inner_result.result);
5632                self.inner
5633                    .materialized_views
5634                    .write()
5635                    .refresh(&q.name, serialized.into_bytes());
5636                Ok(RuntimeQueryResult::ok_message(
5637                    query.to_string(),
5638                    &format!("materialized view {} refreshed", q.name),
5639                    "refresh_materialized_view",
5640                ))
5641            }
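            // Illustrative lifecycle (SQL sketch; the view name is
            // hypothetical):
            //
            //     CREATE MATERIALIZED VIEW mv AS SELECT ...;  -- empty slot
            //     REFRESH MATERIALIZED VIEW mv;  -- runs the query, caches it
            //     DROP VIEW mv;                  -- registry + cache slot freed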
5642            // Row Level Security (Phase 2.5 PG parity).
5643            //
5644            // Policies live in an in-memory registry keyed by (table, name).
5645            // Enforcement (AND-ing the policy's USING clause into every
5646            // query's WHERE for the table) arrives in Phase 2.5.2 via the
5647            // filter compiler; this dispatch only manages the catalog.
5648            QueryExpr::CreatePolicy(ref q) => {
5649                let key = (q.table.clone(), q.name.clone());
5650                self.inner
5651                    .rls_policies
5652                    .write()
5653                    .insert(key, Arc::new(q.clone()));
5654                self.invalidate_plan_cache();
5655                // Issue #120 — surface policy names in the
5656                // schema-vocabulary so AskPipeline (#121) can resolve
5657                // a policy reference back to its table.
5658                self.schema_vocabulary_apply(
5659                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
5660                        collection: q.table.clone(),
5661                        policy: q.name.clone(),
5662                    },
5663                );
5664                Ok(RuntimeQueryResult::ok_message(
5665                    query.to_string(),
5666                    &format!("policy {} on {} created", q.name, q.table),
5667                    "create_policy",
5668                ))
5669            }
5670            QueryExpr::DropPolicy(ref q) => {
5671                let removed = self
5672                    .inner
5673                    .rls_policies
5674                    .write()
5675                    .remove(&(q.table.clone(), q.name.clone()))
5676                    .is_some();
5677                if !removed && !q.if_exists {
5678                    return Err(RedDBError::Internal(format!(
5679                        "policy {} on {} does not exist",
5680                        q.name, q.table
5681                    )));
5682                }
5683                self.invalidate_plan_cache();
5684                // Issue #120 — keep the schema-vocabulary policy
5685                // entry in sync.
5686                self.schema_vocabulary_apply(
5687                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
5688                        collection: q.table.clone(),
5689                        policy: q.name.clone(),
5690                    },
5691                );
5692                Ok(RuntimeQueryResult::ok_message(
5693                    query.to_string(),
5694                    &format!("policy {} on {} dropped", q.name, q.table),
5695                    "drop_policy",
5696                ))
5697            }
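            // Illustrative catalog ops (SQL sketch; table/policy names are
            // hypothetical):
            //
            //     CREATE POLICY tenant_rows ON orders ...;
            //     DROP POLICY IF EXISTS tenant_rows ON orders;
            //
            // Registration alone filters nothing; enforcement is the Phase
            // 2.5.2 filter-compiler step described above.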
5698            // Foreign Data Wrappers (Phase 3.2 PG parity).
5699            //
5700            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
5701            // `ForeignTableRegistry`. The read path consults that registry
5702            // before dispatching a SELECT — when the table name matches a
5703            // registered foreign table, we forward the scan to the wrapper
5704            // and skip the normal collection lookup.
5705            //
5706            // Phase 3.2 is in-memory only; persistence across restarts is a
5707            // 3.2.2 follow-up that mirrors the view registry pattern.
5708            QueryExpr::CreateServer(ref q) => {
5709                use crate::storage::fdw::FdwOptions;
5710                let registry = Arc::clone(&self.inner.foreign_tables);
5711                if registry.server(&q.name).is_some() {
5712                    if q.if_not_exists {
5713                        return Ok(RuntimeQueryResult::ok_message(
5714                            query.to_string(),
5715                            &format!("server {} already exists — skipped", q.name),
5716                            "create_server",
5717                        ));
5718                    }
5719                    return Err(RedDBError::Internal(format!(
5720                        "server {} already exists",
5721                        q.name
5722                    )));
5723                }
5724                let mut opts = FdwOptions::new();
5725                for (k, v) in &q.options {
5726                    opts.values.insert(k.clone(), v.clone());
5727                }
5728                registry
5729                    .create_server(&q.name, &q.wrapper, opts)
5730                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5731                Ok(RuntimeQueryResult::ok_message(
5732                    query.to_string(),
5733                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
5734                    "create_server",
5735                ))
5736            }
5737            QueryExpr::DropServer(ref q) => {
5738                let existed = self.inner.foreign_tables.drop_server(&q.name);
5739                if !existed && !q.if_exists {
5740                    return Err(RedDBError::Internal(format!(
5741                        "server {} does not exist",
5742                        q.name
5743                    )));
5744                }
5745                Ok(RuntimeQueryResult::ok_message(
5746                    query.to_string(),
5747                    &format!(
5748                        "server {} dropped{}",
5749                        q.name,
5750                        if q.cascade { " (cascade)" } else { "" }
5751                    ),
5752                    "drop_server",
5753                ))
5754            }
5755            QueryExpr::CreateForeignTable(ref q) => {
5756                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
5757                let registry = Arc::clone(&self.inner.foreign_tables);
5758                if registry.foreign_table(&q.name).is_some() {
5759                    if q.if_not_exists {
5760                        return Ok(RuntimeQueryResult::ok_message(
5761                            query.to_string(),
5762                            &format!("foreign table {} already exists — skipped", q.name),
5763                            "create_foreign_table",
5764                        ));
5765                    }
5766                    return Err(RedDBError::Internal(format!(
5767                        "foreign table {} already exists",
5768                        q.name
5769                    )));
5770                }
5771                let mut opts = FdwOptions::new();
5772                for (k, v) in &q.options {
5773                    opts.values.insert(k.clone(), v.clone());
5774                }
5775                let columns: Vec<ForeignColumn> = q
5776                    .columns
5777                    .iter()
5778                    .map(|c| ForeignColumn {
5779                        name: c.name.clone(),
5780                        data_type: c.data_type.clone(),
5781                        not_null: c.not_null,
5782                    })
5783                    .collect();
5784                registry
5785                    .create_foreign_table(ForeignTable {
5786                        name: q.name.clone(),
5787                        server_name: q.server.clone(),
5788                        columns,
5789                        options: opts,
5790                    })
5791                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5792                self.invalidate_plan_cache();
5793                Ok(RuntimeQueryResult::ok_message(
5794                    query.to_string(),
5795                    &format!("foreign table {} created (server {})", q.name, q.server),
5796                    "create_foreign_table",
5797                ))
5798            }
5799            QueryExpr::DropForeignTable(ref q) => {
5800                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
5801                if !existed && !q.if_exists {
5802                    return Err(RedDBError::Internal(format!(
5803                        "foreign table {} does not exist",
5804                        q.name
5805                    )));
5806                }
5807                self.invalidate_plan_cache();
5808                Ok(RuntimeQueryResult::ok_message(
5809                    query.to_string(),
5810                    &format!("foreign table {} dropped", q.name),
5811                    "drop_foreign_table",
5812                ))
5813            }
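            // Illustrative setup (SQL sketch; wrapper, server, and table
            // names are hypothetical, options elided):
            //
            //     CREATE SERVER ext_srv FOREIGN DATA WRAPPER csv OPTIONS (...);
            //     CREATE FOREIGN TABLE ext (...) SERVER ext_srv OPTIONS (...);
            //     -- SELECTs on `ext` now route through the wrapper instead
            //     -- of a collection lookup.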
5814            // COPY table FROM 'path' (Phase 1.5 PG parity).
5815            //
5816            // Stream CSV rows through the shared `CsvImporter`. The collection
5817            // is auto-created on first insert (via `insert_auto`-style path);
5818            // VACUUM/ANALYZE afterwards is up to the caller.
5819            QueryExpr::CopyFrom(ref q) => {
5820                use crate::storage::import::{CsvConfig, CsvImporter};
5821                let store = self.inner.db.store();
5822                let cfg = CsvConfig {
5823                    collection: q.table.clone(),
5824                    has_header: q.has_header,
5825                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
5826                    ..CsvConfig::default()
5827                };
5828                let importer = CsvImporter::new(cfg);
5829                let stats = importer
5830                    .import_file(&q.path, store.as_ref())
5831                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
5832                // Tables are written → invalidate cached plans / result cache.
5833                self.note_table_write(&q.table);
5834                Ok(RuntimeQueryResult::ok_message(
5835                    query.to_string(),
5836                    &format!(
5837                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
5838                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
5839                    ),
5840                    "copy_from",
5841                ))
5842            }
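            // Illustrative usage (SQL sketch; the path is hypothetical):
            //
            //     COPY users FROM '/tmp/users.csv';
            //     -- delimiter defaults to ',', the collection is created on
            //     -- first insert, and cached plans/results are invalidated.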
5843            // Maintenance commands (Phase 1.2 PG parity).
5844            //
5845            // - VACUUM [FULL] [table]: refreshes planner stats for the target
5846            //   collection(s) and — when FULL — triggers a full pager persist
5847            //   (flushes dirty pages + fsync). Also invalidates the result cache
5848            //   so subsequent reads re-execute against the freshly compacted
5849            //   storage. RedDB's segment/btree GC runs continuously via the
5850            //   background lifecycle; explicit space reclamation for sealed
5851            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
5852            // - ANALYZE [table]: reruns `analyze_collection` +
5853            //   `persist_table_stats` via `refresh_table_planner_stats` so the
5854            //   planner has fresh histograms, distinct estimates, null counts.
5855            //
5856            // Both commands accept an optional target; omitting the target
5857            // iterates every collection in the store.
5858            QueryExpr::MaintenanceCommand(ref cmd) => {
5859                use crate::storage::query::ast::MaintenanceCommand as Mc;
5860                let store = self.inner.db.store();
5861                let (kind, msg) = match cmd {
5862                    Mc::Analyze { target } => {
5863                        let targets: Vec<String> = match target {
5864                            Some(t) => vec![t.clone()],
5865                            None => store.list_collections(),
5866                        };
5867                        for t in &targets {
5868                            self.refresh_table_planner_stats(t);
5869                        }
5870                        (
5871                            "analyze",
5872                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
5873                        )
5874                    }
5875                    Mc::Vacuum { target, full } => {
5876                        let targets: Vec<String> = match target {
5877                            Some(t) => vec![t.clone()],
5878                            None => store.list_collections(),
5879                        };
5880                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
5881                        let mut vacuum_stats =
5882                            crate::storage::unified::store::MvccVacuumStats::default();
5883                        for t in &targets {
5884                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
5885                                RedDBError::Internal(format!(
5886                                    "VACUUM MVCC history failed for {t}: {e}"
5887                                ))
5888                            })?;
5889                            if stats.reclaimed_versions > 0 {
5890                                self.rebuild_runtime_indexes_for_table(t)?;
5891                            }
5892                            vacuum_stats.add(&stats);
5893                        }
5894                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
5895                        // Stats refresh covers every target (same as ANALYZE).
5896                        for t in &targets {
5897                            self.refresh_table_planner_stats(t);
5898                        }
5899                        // FULL forces a pager persist (dirty-page flush + fsync).
5900                        // Regular VACUUM relies on the background writer / segment
5901                        // lifecycle so the command is non-blocking.
5902                        let persisted = if *full {
5903                            match store.persist() {
5904                                Ok(()) => true,
5905                                Err(e) => {
5906                                    return Err(RedDBError::Internal(format!(
5907                                        "VACUUM FULL persist failed: {e:?}"
5908                                    )));
5909                                }
5910                            }
5911                        } else {
5912                            false
5913                        };
5914                        // Result cache depended on pre-vacuum state.
5915                        self.invalidate_result_cache();
5916                        (
5917                            "vacuum",
5918                            format!(
5919                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
5920                                if *full { " FULL" } else { "" },
5921                                targets.len(),
5922                                vacuum_stats.scanned_versions,
5923                                vacuum_stats.retained_versions,
5924                                vacuum_stats.reclaimed_versions,
5925                                vacuum_stats.retained_history_versions,
5926                                vacuum_stats.reclaimed_history_versions,
5927                                vacuum_stats.retained_tombstones,
5928                                vacuum_stats.reclaimed_tombstones,
5929                                if persisted {
5930                                    " (pages flushed to disk)"
5931                                } else {
5932                                    ""
5933                                }
5934                            ),
5935                        )
5936                    }
5937                };
5938                Ok(RuntimeQueryResult::ok_message(
5939                    query.to_string(),
5940                    &msg,
5941                    kind,
5942                ))
5943            }
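            // Illustrative usage (SQL sketch; the table name is hypothetical):
            //
            //     ANALYZE users;      -- planner stats for one table
            //     VACUUM;             -- MVCC history GC across all tables
            //     VACUUM FULL users;  -- GC + forced pager persist (fsync)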
5944            // GRANT / REVOKE / ALTER USER (RBAC milestone).
5945            //
5946            // These hit the AuthStore directly. The privilege-check
5947            // gate at the top of `execute_query_expr` already decided
5948            // whether the caller may even run the statement; here we
5949            // just translate the AST into AuthStore calls.
5950            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
5951            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
5952            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
5953            QueryExpr::CreateIamPolicy { ref id, ref json } => {
5954                self.execute_create_iam_policy(query, id, json)
5955            }
5956            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
5957            QueryExpr::AttachPolicy {
5958                ref policy_id,
5959                ref principal,
5960            } => self.execute_attach_policy(query, policy_id, principal),
5961            QueryExpr::DetachPolicy {
5962                ref policy_id,
5963                ref principal,
5964            } => self.execute_detach_policy(query, policy_id, principal),
5965            QueryExpr::ShowPolicies { ref filter } => {
5966                self.execute_show_policies(query, filter.as_ref())
5967            }
5968            QueryExpr::ShowEffectivePermissions {
5969                ref user,
5970                ref resource,
5971            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
5972            QueryExpr::SimulatePolicy {
5973                ref user,
5974                ref action,
5975                ref resource,
5976            } => self.execute_simulate_policy(query, user, action, resource),
5977            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
5978            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
5979            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
5980            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
5981        };
5982
5983        // Decrypt Value::Secret columns in-place before caching, so
5984        // cached results match the post-decrypt shape and repeat
5985        // queries skip the per-row AES-GCM pass.
5986        let mut query_result = query_result;
5987        if let Ok(ref mut result) = query_result {
5988            if result.statement_type == "select" {
5989                self.apply_secret_decryption(result);
5990            }
5991        }
5992
5993        // Cache SELECT results for 30s.
5994        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
5995        // Large multi-row results (range scans, filtered scans) are rarely
5996        // repeated with the same literal values, so the cache hit rate is
5997        // near zero while the clone cost (~100 records × ~16 fields) is high.
5998        // Aggregations (1 row) and point lookups (1 row) still benefit.
5999        if let Ok(ref result) = query_result {
6000            frame.write_result_cache(self, result, result_cache_scopes);
6001        }
6002
6003        query_result
6004    }
6005
6006    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
6007    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
6008    /// calls pay zero parse + cache overhead.
6009    ///
6010    /// Applies secret decryption on SELECT results, identical to `execute_query`.
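    ///
    /// Sketch (assumes a pre-parsed `expr: QueryExpr` from the prepared-
    /// statement path and a `runtime` handle in scope):
    ///
    /// ```ignore
    /// let result = runtime.execute_query_expr(expr)?;
    /// println!("{} -> {}", result.statement, result.statement_type);
    /// ```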
6011    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
6012        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
6013        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
6014        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
6015        // whose `tq.table` matches a registered view with the view's
6016        // underlying query. Safe to call even when no views are registered.
6017        let expr = self.rewrite_view_refs(expr);
6018
6019        self.validate_model_operations_before_auth(&expr)?;
6020        // Granular RBAC privilege check. Runs before dispatch so a
6021        // denied caller never reaches storage. Fail-closed: any error
6022        // resolving the action / resource produces PermissionDenied.
6023        if let Err(err) = self.check_query_privilege(&expr) {
6024            return Err(RedDBError::Query(format!("permission denied: {err}")));
6025        }
6026
6027        let statement = query_expr_name(&expr);
6028        let mode = detect_mode(statement);
6029        let query_str = statement;
6030
6031        let result = self.dispatch_expr(expr, query_str, mode)?;
6032        let mut r = result;
6033        if r.statement_type == "select" {
6034            self.apply_secret_decryption(&mut r);
6035        }
6036        Ok(r)
6037    }
6038
6039    pub(super) fn validate_model_operations_before_auth(
6040        &self,
6041        expr: &QueryExpr,
6042    ) -> RedDBResult<()> {
6043        use crate::catalog::CollectionModel;
6044        use crate::runtime::ddl::polymorphic_resolver;
6045        use crate::storage::query::ast::KvCommand;
6046
6047        let system_schema_target = match expr {
6048            QueryExpr::DropTable(q) => Some(q.name.as_str()),
6049            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
6050            QueryExpr::DropVector(q) => Some(q.name.as_str()),
6051            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
6052            QueryExpr::DropKv(q) => Some(q.name.as_str()),
6053            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
6054            QueryExpr::Truncate(q) => Some(q.name.as_str()),
6055            _ => None,
6056        };
6057        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
6058            return Err(RedDBError::Query("system schema is read-only".to_string()));
6059        }
6060
6061        let expected = match expr {
6062            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
6063            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
6064            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
6065            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
6066            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
6067            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
6068            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
6069            QueryExpr::KvCommand(cmd) => {
6070                let (collection, model) = match cmd {
6071                    KvCommand::Put {
6072                        collection, model, ..
6073                    }
6074                    | KvCommand::Get {
6075                        collection, model, ..
6076                    }
6077                    | KvCommand::Incr {
6078                        collection, model, ..
6079                    }
6080                    | KvCommand::Cas {
6081                        collection, model, ..
6082                    }
6083                    | KvCommand::Delete {
6084                        collection, model, ..
6085                    } => (collection.as_str(), *model),
6086                    KvCommand::Rotate { collection, .. }
6087                    | KvCommand::History { collection, .. }
6088                    | KvCommand::List { collection, .. }
6089                    | KvCommand::Purge { collection, .. } => {
6090                        (collection.as_str(), CollectionModel::Vault)
6091                    }
6092                    KvCommand::InvalidateTags { collection, .. } => {
6093                        (collection.as_str(), CollectionModel::Kv)
6094                    }
6095                    KvCommand::Watch {
6096                        collection, model, ..
6097                    } => (collection.as_str(), *model),
6098                    KvCommand::Unseal { collection, .. } => {
6099                        (collection.as_str(), CollectionModel::Vault)
6100                    }
6101                };
6102                Some((collection, model))
6103            }
6104            QueryExpr::ConfigCommand(cmd) => {
6105                self.validate_config_command_before_auth(cmd)?;
6106                None
6107            }
6108            _ => None,
6109        };
6110
6111        let Some((name, expected_model)) = expected else {
6112            return Ok(());
6113        };
6114        let snapshot = self.inner.db.catalog_model_snapshot();
6115        let Some(actual_model) = snapshot
6116            .collections
6117            .iter()
6118            .find(|collection| collection.name == name)
6119            .map(|collection| collection.declared_model.unwrap_or(collection.model))
6120        else {
6121            return Ok(());
6122        };
6123        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
6124    }
6125
6126    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
6127    /// `tq.table` matches a registered view name with the view's stored
6128    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
6129    /// resolves correctly. Pure operation — no side effects.
6130    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
6131        // Fast path: no views registered → return original expression.
6132        if self.inner.views.read().is_empty() {
6133            return expr;
6134        }
6135        self.rewrite_view_refs_inner(expr)
6136    }
6137
6138    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
6139        use crate::storage::query::ast::{Filter, TableSource};
6140        match expr {
6141            QueryExpr::Table(mut tq) => {
6142                // 1. If the TableSource is a subquery, recurse into it so
6143                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
6144                //    The legacy `table` field (set to a synthetic
6145                //    "__subq_NNNN" sentinel) stays as-is so callers that
6146                //    read it keep compiling.
6147                if let Some(TableSource::Subquery(body)) = tq.source.take() {
6148                    tq.source = Some(TableSource::Subquery(Box::new(
6149                        self.rewrite_view_refs_inner(*body),
6150                    )));
6151                    return QueryExpr::Table(tq);
6152                }
6153
6154                // 2. `take()` above cleared any non-subquery source. That is
6155                //    fine: a `TableSource::Name(_)` only mirrors `tq.table`,
6156                //    which stays the real lookup key for the view registry.
6157                let maybe_view = {
6158                    let views = self.inner.views.read();
6159                    views.get(&tq.table).cloned()
6160                };
6161                let Some(view) = maybe_view else {
6162                    return QueryExpr::Table(tq);
6163                };
6164
6165                // Recurse into the view body — views may reference other
6166                // views. The recursion yields the final QueryExpr we need
6167                // to merge the outer's filter / limit / offset into.
6168                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
6169
6170                // Phase 5: when the body is a Table we merge the outer
6171                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
6172                // views filter recursively. Non-table bodies (Search,
6173                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
6174                // with an outer Table query today — return the body
6175                // verbatim; outer predicates are lost. Full projection
6176                // merge lands in Phase 5.2.
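                //
                // Worked example (hypothetical view): with `v` stored as
                // `SELECT * FROM t WHERE a = 1 LIMIT 100 OFFSET 10`, the
                // outer `SELECT * FROM v WHERE b = 2 LIMIT 50 OFFSET 5`
                // merges to `WHERE (a = 1) AND (b = 2) LIMIT 50 OFFSET 15`:
                // filters AND-combine, limits take the min, offsets add.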
6177                match inner_expr {
6178                    QueryExpr::Table(mut inner_tq) => {
6179                        if let Some(outer_filter) = tq.filter.take() {
6180                            inner_tq.filter = Some(match inner_tq.filter.take() {
6181                                Some(existing) => {
6182                                    Filter::And(Box::new(existing), Box::new(outer_filter))
6183                                }
6184                                None => outer_filter,
6185                            });
6186                        }
6187                        if let Some(outer_limit) = tq.limit {
6188                            inner_tq.limit = Some(match inner_tq.limit {
6189                                Some(existing) => existing.min(outer_limit),
6190                                None => outer_limit,
6191                            });
6192                        }
6193                        if let Some(outer_offset) = tq.offset {
6194                            inner_tq.offset = Some(match inner_tq.offset {
6195                                Some(existing) => existing + outer_offset,
6196                                None => outer_offset,
6197                            });
6198                        }
6199                        QueryExpr::Table(inner_tq)
6200                    }
6201                    other => other,
6202                }
6203            }
6204            QueryExpr::Join(mut jq) => {
6205                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
6206                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
6207                QueryExpr::Join(jq)
6208            }
6209            // Other variants don't carry nested QueryExpr that can reference
6210            // a view by table name. Return as-is.
6211            other => other,
6212        }
6213    }
6214
6215    /// Column-level authorization plus RLS injection for one table
6216    /// SELECT; subquery sources recurse. Returns `Ok(None)` when RLS
6217    /// denies every row (the caller substitutes an empty result set).
6218    fn authorize_relational_table_select(
6219        &self,
6220        mut table: TableQuery,
6221        frame: &dyn super::statement_frame::ReadFrame,
6222    ) -> RedDBResult<Option<TableQuery>> {
6223        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6224            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6225            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6226            return Ok(Some(table));
6227        }
6228
6229        self.check_table_column_projection_authz(&table, frame)?;
6230
6231        if self.inner.rls_enabled_tables.read().contains(&table.table) {
6232            return Ok(inject_rls_filters(self, frame, table));
6233        }
6234
6235        Ok(Some(table))
6236    }
6237
6238    fn authorize_relational_join_select(
6239        &self,
6240        mut join: JoinQuery,
6241        frame: &dyn super::statement_frame::ReadFrame,
6242    ) -> RedDBResult<Option<JoinQuery>> {
6243        self.check_join_column_projection_authz(&join, frame)?;
6244        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
6245        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
6246        Ok(inject_rls_into_join(self, frame, join))
6247    }
6248
6249    fn authorize_relational_join_child(
6250        &self,
6251        expr: QueryExpr,
6252        frame: &dyn super::statement_frame::ReadFrame,
6253    ) -> RedDBResult<QueryExpr> {
6254        match expr {
6255            QueryExpr::Table(mut table) => {
6256                if let Some(TableSource::Subquery(inner)) = table.source.take() {
6257                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6258                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6259                }
6260                Ok(QueryExpr::Table(table))
6261            }
6262            QueryExpr::Join(join) => self
6263                .authorize_relational_join_select(join, frame)?
6264                .map(QueryExpr::Join)
6265                .ok_or_else(|| {
6266                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6267                }),
6268            other => Ok(other),
6269        }
6270    }
6271
6272    fn authorize_relational_select_expr(
6273        &self,
6274        expr: QueryExpr,
6275        frame: &dyn super::statement_frame::ReadFrame,
6276    ) -> RedDBResult<QueryExpr> {
6277        match expr {
6278            QueryExpr::Table(table) => self
6279                .authorize_relational_table_select(table, frame)?
6280                .map(QueryExpr::Table)
6281                .ok_or_else(|| {
6282                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6283                }),
6284            QueryExpr::Join(join) => self
6285                .authorize_relational_join_select(join, frame)?
6286                .map(QueryExpr::Join)
6287                .ok_or_else(|| {
6288                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6289                }),
6290            other => Ok(other),
6291        }
6292    }
6293
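    /// Column-level SELECT authorization for one table. Passes
    /// unconditionally for anonymous / embedded callers (no frame
    /// identity) and when no auth store is installed; otherwise every
    /// resolved projection column must be granted to the principal.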
6294    fn check_table_column_projection_authz(
6295        &self,
6296        table: &TableQuery,
6297        frame: &dyn super::statement_frame::ReadFrame,
6298    ) -> RedDBResult<()> {
6299        let Some((username, role)) = frame.identity() else {
6300            return Ok(());
6301        };
6302        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6303            return Ok(());
6304        };
6305
6306        let columns = self.resolved_table_projection_columns(table)?;
6307        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6308        let principal = UserId::from_parts(frame.effective_scope(), username);
6309        let ctx = runtime_iam_context(role, frame.effective_scope());
6310        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6311        if outcome.allowed() {
6312            return Ok(());
6313        }
6314
6315        if let Some(denied) = outcome.first_denied_column() {
6316            return Err(RedDBError::Query(format!(
6317                "permission denied: principal=`{username}` cannot select column `{}`",
6318                denied.resource.name
6319            )));
6320        }
6321        Err(RedDBError::Query(format!(
6322            "permission denied: principal=`{username}` cannot select table `{}`",
6323            table.table
6324        )))
6325    }
6326
6327    fn check_join_column_projection_authz(
6328        &self,
6329        join: &JoinQuery,
6330        frame: &dyn super::statement_frame::ReadFrame,
6331    ) -> RedDBResult<()> {
6332        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6333        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6334        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6335
6336        for (table, columns) in by_table {
6337            let query = TableQuery {
6338                table,
6339                source: None,
6340                alias: None,
6341                select_items: Vec::new(),
6342                columns: columns.into_iter().map(Projection::Column).collect(),
6343                where_expr: None,
6344                filter: None,
6345                group_by_exprs: Vec::new(),
6346                group_by: Vec::new(),
6347                having_expr: None,
6348                having: None,
6349                order_by: Vec::new(),
6350                limit: None,
6351                limit_param: None,
6352                offset: None,
6353                offset_param: None,
6354                expand: None,
6355                as_of: None,
6356            };
6357            self.check_table_column_projection_authz(&query, frame)?;
6358        }
6359        Ok(())
6360    }
6361
6362    fn collect_join_projection_columns(
6363        &self,
6364        join: &JoinQuery,
6365        projections: &[Projection],
6366        out: &mut HashMap<String, BTreeSet<String>>,
6367    ) -> RedDBResult<()> {
6368        let left = table_side_context(join.left.as_ref());
6369        let right = table_side_context(join.right.as_ref());
6370
6371        if projections
6372            .iter()
6373            .any(|projection| matches!(projection, Projection::All))
6374        {
6375            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6376                out.entry(side.table.clone())
6377                    .or_default()
6378                    .extend(self.table_all_projection_columns(&side.table)?);
6379            }
6380            return Ok(());
6381        }
6382
6383        for projection in projections {
6384            collect_projection_columns_for_join_side(
6385                projection,
6386                left.as_ref(),
6387                right.as_ref(),
6388                out,
6389            )?;
6390        }
6391        Ok(())
6392    }
6393
6394    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6395        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6396        if projections
6397            .iter()
6398            .any(|projection| matches!(projection, Projection::All))
6399        {
6400            return self.table_all_projection_columns(&table.table);
6401        }
6402
6403        let mut columns = BTreeSet::new();
6404        for projection in &projections {
6405            collect_projection_columns_for_table(
6406                projection,
6407                &table.table,
6408                table.alias.as_deref(),
6409                &mut columns,
6410            );
6411        }
6412        Ok(columns.into_iter().collect())
6413    }
6414
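    /// Expand `SELECT *` into concrete column names. Prefers the declared
    /// contract columns; for schemaless tables, falls back to sampling one
    /// record's column names (empty when the table has no rows).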
6415    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
6416        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
6417            let columns: Vec<String> = contract
6418                .declared_columns
6419                .iter()
6420                .map(|column| column.name.clone())
6421                .collect();
6422            if !columns.is_empty() {
6423                return Ok(columns);
6424            }
6425        }
6426
6427        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
6428        Ok(records
6429            .first()
6430            .map(|record| {
6431                record
6432                    .column_names()
6433                    .into_iter()
6434                    .map(|column| column.to_string())
6435                    .collect()
6436            })
6437            .unwrap_or_default())
6438    }
6439
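    /// Pre-execute uncorrelated expression subqueries (scalar and IN-list)
    /// inside one table query and splice the results back as literals.
    /// Clears the pre-lowered `filter` / `having` forms so the rewritten
    /// expressions stay authoritative.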
6440    fn resolve_table_expr_subqueries(
6441        &self,
6442        mut table: TableQuery,
6443        frame: &dyn super::statement_frame::ReadFrame,
6444    ) -> RedDBResult<TableQuery> {
6445        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6446            let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
6447            table.source = Some(TableSource::Subquery(Box::new(inner)));
6448        }
6449
6450        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
6451        for item in &mut table.select_items {
6452            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
6453                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6454            }
6455        }
6456        if let Some(where_expr) = table.where_expr.take() {
6457            table.where_expr =
6458                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
6459            table.filter = None;
6460        }
6461        if let Some(having_expr) = table.having_expr.take() {
6462            table.having_expr =
6463                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
6464            table.having = None;
6465        }
6466        for expr in &mut table.group_by_exprs {
6467            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6468        }
6469        for clause in &mut table.order_by {
6470            if let Some(expr) = clause.expr.take() {
6471                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
6472            }
6473        }
6474        Ok(table)
6475    }
6476
6477    fn resolve_select_expr_subqueries(
6478        &self,
6479        expr: QueryExpr,
6480        frame: &dyn super::statement_frame::ReadFrame,
6481    ) -> RedDBResult<QueryExpr> {
6482        match expr {
6483            QueryExpr::Table(table) => self
6484                .resolve_table_expr_subqueries(table, frame)
6485                .map(QueryExpr::Table),
6486            QueryExpr::Join(mut join) => {
6487                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
6488                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
6489                Ok(QueryExpr::Join(join))
6490            }
6491            other => Ok(other),
6492        }
6493    }
6494
6495    fn resolve_expr_subqueries(
6496        &self,
6497        expr: crate::storage::query::ast::Expr,
6498        outer_scopes: &[String],
6499        frame: &dyn super::statement_frame::ReadFrame,
6500    ) -> RedDBResult<crate::storage::query::ast::Expr> {
6501        use crate::storage::query::ast::Expr;
6502
6503        match expr {
6504            Expr::Subquery { query, span } => {
6505                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
6506                if values.len() > 1 {
6507                    return Err(RedDBError::Query(
6508                        "scalar subquery returned more than one row".to_string(),
6509                    ));
6510                }
6511                Ok(Expr::Literal {
6512                    value: values.into_iter().next().unwrap_or(Value::Null),
6513                    span,
6514                })
6515            }
6516            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
6517                op,
6518                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
6519                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
6520                span,
6521            }),
6522            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
6523                op,
6524                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6525                span,
6526            }),
6527            Expr::Cast {
6528                inner,
6529                target,
6530                span,
6531            } => Ok(Expr::Cast {
6532                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
6533                target,
6534                span,
6535            }),
6536            Expr::FunctionCall { name, args, span } => {
6537                let args = args
6538                    .into_iter()
6539                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
6540                    .collect::<RedDBResult<Vec<_>>>()?;
6541                Ok(Expr::FunctionCall { name, args, span })
6542            }
6543            Expr::Case {
6544                branches,
6545                else_,
6546                span,
6547            } => {
6548                let branches = branches
6549                    .into_iter()
6550                    .map(|(cond, value)| {
6551                        Ok((
6552                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
6553                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
6554                        ))
6555                    })
6556                    .collect::<RedDBResult<Vec<_>>>()?;
6557                let else_ = else_
6558                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
6559                    .transpose()?
6560                    .map(Box::new);
6561                Ok(Expr::Case {
6562                    branches,
6563                    else_,
6564                    span,
6565                })
6566            }
6567            Expr::IsNull {
6568                operand,
6569                negated,
6570                span,
6571            } => Ok(Expr::IsNull {
6572                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6573                negated,
6574                span,
6575            }),
6576            Expr::InList {
6577                target,
6578                values,
6579                negated,
6580                span,
6581            } => {
6582                let target =
6583                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
6584                let mut resolved = Vec::new();
6585                for value in values {
6586                    if let Expr::Subquery { query, .. } = value {
6587                        resolved.extend(
6588                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
6589                                .into_iter()
6590                                .map(Expr::lit),
6591                        );
6592                    } else {
6593                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
6594                    }
6595                }
6596                Ok(Expr::InList {
6597                    target,
6598                    values: resolved,
6599                    negated,
6600                    span,
6601                })
6602            }
6603            Expr::Between {
6604                target,
6605                low,
6606                high,
6607                negated,
6608                span,
6609            } => Ok(Expr::Between {
6610                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
6611                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
6612                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
6613                negated,
6614                span,
6615            }),
6616            other => Ok(other),
6617        }
6618    }
6619
6620    fn execute_expr_subquery_values(
6621        &self,
6622        subquery: crate::storage::query::ast::ExprSubquery,
6623        outer_scopes: &[String],
6624        frame: &dyn super::statement_frame::ReadFrame,
6625    ) -> RedDBResult<Vec<Value>> {
6626        let query = *subquery.query;
6627        if query_references_outer_scope(&query, outer_scopes) {
6628            return Err(RedDBError::Query(
6629                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
6630            ));
6631        }
6632        let query = self.rewrite_view_refs(query);
6633        let query = self.resolve_select_expr_subqueries(query, frame)?;
6634        let query = self.authorize_relational_select_expr(query, frame)?;
6635        let result = match query {
6636            QueryExpr::Table(table) => {
6637                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
6638            }
6639            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
6640            other => {
6641                return Err(RedDBError::Query(format!(
6642                    "expression subquery must be a SELECT query, got {}",
6643                    query_expr_name(&other)
6644                )))
6645            }
6646        };
6647        first_column_values(result)
6648    }
6649
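    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
    /// Shared by `execute_query` (after parse/cache) and `execute_query_expr`
    /// (direct call from the prepared-statement handler).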
6650    fn dispatch_expr(
6651        &self,
6652        expr: QueryExpr,
6653        query_str: &str,
6654        mode: QueryMode,
6655    ) -> RedDBResult<RuntimeQueryResult> {
6656        let statement = query_expr_name(&expr);
6657        match expr {
6658            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
6659                // Graph queries are not cacheable as prepared statements.
6660                Err(RedDBError::Query(
6661                    "graph queries cannot be used as prepared statements".to_string(),
6662                ))
6663            }
6664            QueryExpr::Table(table) => {
6665                let scope = self.ai_scope();
6666                let table = self.resolve_table_expr_subqueries(
6667                    table,
6668                    &scope as &dyn super::statement_frame::ReadFrame,
6669                )?;
6670                if super::red_schema::is_virtual_table(&table.table) {
6671                    return Ok(RuntimeQueryResult {
6672                        query: query_str.to_string(),
6673                        mode,
6674                        statement,
6675                        engine: "runtime-red-schema",
6676                        result: super::red_schema::red_query(
6677                            self,
6678                            &table.table,
6679                            &table,
6680                            &scope as &dyn super::statement_frame::ReadFrame,
6681                        )?,
6682                        affected_rows: 0,
6683                        statement_type: "select",
6684                    });
6685                }
6686                let Some(table_with_rls) = self.authorize_relational_table_select(
6687                    table,
6688                    &scope as &dyn super::statement_frame::ReadFrame,
6689                )?
6690                else {
6691                    return Ok(RuntimeQueryResult {
6692                        query: query_str.to_string(),
6693                        mode,
6694                        statement,
6695                        engine: "runtime-table-rls",
6696                        result: crate::storage::query::unified::UnifiedResult::empty(),
6697                        affected_rows: 0,
6698                        statement_type: "select",
6699                    });
6700                };
6701                Ok(RuntimeQueryResult {
6702                    query: query_str.to_string(),
6703                    mode,
6704                    statement,
6705                    engine: "runtime-table",
6706                    result: execute_runtime_table_query(
6707                        &self.inner.db,
6708                        &table_with_rls,
6709                        Some(&self.inner.index_store),
6710                    )?,
6711                    affected_rows: 0,
6712                    statement_type: "select",
6713                })
6714            }
6715            QueryExpr::Join(join) => {
6716                let scope = self.ai_scope();
6717                let Some(join_with_rls) = self.authorize_relational_join_select(
6718                    join,
6719                    &scope as &dyn super::statement_frame::ReadFrame,
6720                )?
6721                else {
6722                    return Ok(RuntimeQueryResult {
6723                        query: query_str.to_string(),
6724                        mode,
6725                        statement,
6726                        engine: "runtime-join-rls",
6727                        result: crate::storage::query::unified::UnifiedResult::empty(),
6728                        affected_rows: 0,
6729                        statement_type: "select",
6730                    });
6731                };
6732                Ok(RuntimeQueryResult {
6733                    query: query_str.to_string(),
6734                    mode,
6735                    statement,
6736                    engine: "runtime-join",
6737                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
6738                    affected_rows: 0,
6739                    statement_type: "select",
6740                })
6741            }
6742            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
6743                query: query_str.to_string(),
6744                mode,
6745                statement,
6746                engine: "runtime-vector",
6747                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
6748                affected_rows: 0,
6749                statement_type: "select",
6750            }),
6751            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
6752                query: query_str.to_string(),
6753                mode,
6754                statement,
6755                engine: "runtime-hybrid",
6756                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
6757                affected_rows: 0,
6758                statement_type: "select",
6759            }),
6760            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
6761                Err(RedDBError::Query(
6762                    super::red_schema::READ_ONLY_ERROR.to_string(),
6763                ))
6764            }
6765            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
6766                Err(RedDBError::Query(
6767                    super::red_schema::READ_ONLY_ERROR.to_string(),
6768                ))
6769            }
6770            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
6771                Err(RedDBError::Query(
6772                    super::red_schema::READ_ONLY_ERROR.to_string(),
6773                ))
6774            }
6775            QueryExpr::Insert(ref insert) => self
6776                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
6777                    self.execute_insert(query_str, insert)
6778                }),
6779            QueryExpr::Update(ref update) => self
6780                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
6781                    self.execute_update(query_str, update)
6782                }),
6783            QueryExpr::Delete(ref delete) => self
6784                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
6785                    self.execute_delete(query_str, delete)
6786                }),
6787            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
6788            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
6789            _ => Err(RedDBError::Query(format!(
6790                "prepared-statement execution does not support {statement} statements"
6791            ))),
6792        }
6793    }
6794
6795    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
6796    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
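    ///
    /// A sketch of the accepted shape (illustrative table and id):
    ///
    /// ```ignore
    /// let hit = runtime.try_fast_entity_lookup("SELECT * FROM users WHERE _entity_id = 42");
    /// assert!(hit.is_some()); // table name has no spaces, id parses as u64
    /// ```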
6797    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
6798        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
6799        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
6800        let q = query.trim();
6801        if !q.starts_with("SELECT") && !q.starts_with("select") {
6802            return None;
6803        }
6804
6805        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
6806        let where_pos = q
6807            .find("WHERE _entity_id")
6808            .or_else(|| q.find("where _entity_id"))?;
6809        let after_field = q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id" (16 bytes)
6810        let after_eq = after_field.strip_prefix('=')?.trim_start();
6811
6812        // Parse the entity ID number
6813        let id_str = after_eq.trim();
6814        let entity_id: u64 = id_str.parse().ok()?;
6815
6816        // Extract table name: between "FROM " and " WHERE"
6817        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
6818        let table = q[from_pos..where_pos].trim();
6819        if table.is_empty()
6820            || (table.contains(' ') && !table.contains(" AS ") && !table.contains(" as "))
6821        {
6822            return None; // complex query, fall through
6823        }
6824        let table_name = table.split_whitespace().next()?;
6825
6826        // Direct entity lookup — skips SQL parse, plan cache, result
6827        // cache, view rewriter, RLS gate. Safe because the gating in
6828        // `execute_query` guarantees no scope override / no
6829        // transaction context is active. MVCC visibility is still
6830        // honoured against the current snapshot.
6831        let store = self.inner.db.store();
6832        let entity = store
6833            .get(
6834                table_name,
6835                crate::storage::unified::EntityId::new(entity_id),
6836            )
6837            .filter(entity_visible_under_current_snapshot);
6838
6839        let count = if entity.is_some() { 1u64 } else { 0 };
6840
6841        // Materialize a record so downstream consumers that walk
6842        // `result.records` (embedded runtime API, decrypt pass, CLI)
6843        // see the row. Previously only `pre_serialized_json` was
6844        // filled, which caused those consumers to see zero rows and
6845        // skewed benchmarks.
6846        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
6847            .as_ref()
6848            .and_then(|e| runtime_table_record_from_entity(e.clone()))
6849            .into_iter()
6850            .collect();
6851
6852        let json = match entity {
6853            Some(ref e) => execute_runtime_serialize_single_entity(e),
6854            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
6855                .to_string(),
6856        };
6857
6858        Some(Ok(RuntimeQueryResult {
6859            query: query.to_string(),
6860            mode: crate::storage::query::modes::QueryMode::Sql,
6861            statement: "select",
6862            engine: "fast-entity-lookup",
6863            result: crate::storage::query::unified::UnifiedResult {
6864                columns: Vec::new(),
6865                records,
6866                stats: crate::storage::query::unified::QueryStats {
6867                    rows_scanned: count,
6868                    ..Default::default()
6869                },
6870                pre_serialized_json: Some(json),
6871            },
6872            affected_rows: 0,
6873            statement_type: "select",
6874        }))
6875    }
6876
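    /// Resolve the configured result-cache backend. Recognized values for
    /// the `RESULT_CACHE_BACKEND_KEY` entry are `"blob_cache"` and
    /// `"shadow"`; anything else selects the legacy in-process cache.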
6877    fn result_cache_backend(&self) -> RuntimeResultCacheBackend {
6878        match self
6879            .config_string(RESULT_CACHE_BACKEND_KEY, RESULT_CACHE_DEFAULT_BACKEND)
6880            .as_str()
6881        {
6882            "blob_cache" => RuntimeResultCacheBackend::BlobCache,
6883            "shadow" => RuntimeResultCacheBackend::Shadow,
6884            _ => RuntimeResultCacheBackend::Legacy,
6885        }
6886    }
6887
6888    pub(super) fn get_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
6889        match self.result_cache_backend() {
6890            RuntimeResultCacheBackend::Legacy => self.get_legacy_result_cache_entry(key),
6891            RuntimeResultCacheBackend::BlobCache => self.get_blob_result_cache_entry(key),
6892            RuntimeResultCacheBackend::Shadow => {
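                // Shadow mode serves from legacy but also probes the blob
                // backend, counting and logging divergences so operators
                // can vet `blob_cache` before cutting over.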
6893                let legacy = self.get_legacy_result_cache_entry(key);
6894                let blob = self.get_blob_result_cache_entry(key);
6895                if let (Some(ref legacy), Some(ref blob)) = (&legacy, &blob) {
6896                    if result_cache_fingerprint(legacy) != result_cache_fingerprint(blob) {
6897                        self.inner
6898                            .result_cache_shadow_divergences
6899                            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
6900                        tracing::warn!(
6901                            key,
6902                            metric = crate::runtime::METRIC_CACHE_SHADOW_DIVERGENCE_TOTAL,
6903                            "result cache shadow backend diverged from legacy"
6904                        );
6905                    }
6906                }
6907                legacy
6908            }
6909        }
6910    }
6911
6912    fn get_legacy_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
6913        let cache = self.inner.result_cache.read();
6914        cache.0.get(key).and_then(|entry| {
6915            if entry.cached_at.elapsed().as_secs() < RESULT_CACHE_TTL_SECS {
6916                Some(entry.result.clone())
6917            } else {
6918                None
6919            }
6920        })
6921    }
6922
6923    fn get_blob_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
6924        let hit = self
6925            .inner
6926            .result_blob_cache
6927            .get(RESULT_CACHE_BLOB_NAMESPACE, key)?;
6928        {
6929            let cache = self.inner.result_blob_entries.read();
6930            if let Some(entry) = cache.0.get(key) {
6931                return Some(entry.result.clone());
6932            }
6933        }
6934
6935        let (result, scopes) = decode_result_cache_payload(hit.value())?;
6936        let mut cache = self.inner.result_blob_entries.write();
6937        let (ref mut map, ref mut order) = *cache;
6938        if !map.contains_key(key) {
6939            order.push_back(key.to_string());
6940        }
6941        map.insert(
6942            key.to_string(),
6943            RuntimeResultCacheEntry {
6944                result: result.clone(),
6945                cached_at: std::time::Instant::now(),
6946                scopes,
6947            },
6948        );
6949        trim_result_cache(map, order);
6950        Some(result)
6951    }
6952
6953    pub(super) fn put_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
6954        match self.result_cache_backend() {
6955            RuntimeResultCacheBackend::Legacy => self.put_legacy_result_cache_entry(key, entry),
6956            RuntimeResultCacheBackend::BlobCache => self.put_blob_result_cache_entry(key, entry),
6957            RuntimeResultCacheBackend::Shadow => {
6958                self.put_legacy_result_cache_entry(key, entry.clone());
6959                self.put_blob_result_cache_entry(key, entry);
6960            }
6961        }
6962    }
6963
6964    fn put_legacy_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
6965        let mut cache = self.inner.result_cache.write();
6966        let (ref mut map, ref mut order) = *cache;
6967        if !map.contains_key(key) {
6968            order.push_back(key.to_string());
6969        }
6970        map.insert(key.to_string(), entry);
6971        trim_result_cache(map, order);
6972    }
6973
6974    fn put_blob_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
6975        let policy = crate::storage::cache::BlobCachePolicy::default()
6976            .ttl_ms(RESULT_CACHE_TTL_SECS * 1000)
6977            .priority(200);
6978        let dependencies = entry.scopes.iter().cloned().collect::<Vec<_>>();
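        // If structured encoding fails, fall back to storing a bare
        // fingerprint. That payload can't be decoded on read, but it keeps
        // the blob key (TTL + dependency tracking) alive; hits are then
        // served from the local `result_blob_entries` map populated below.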
6979        let bytes = encode_result_cache_payload(&entry)
6980            .unwrap_or_else(|| result_cache_fingerprint(&entry.result).into_bytes());
6981        let put = crate::storage::cache::BlobCachePut::new(bytes)
6982            .with_dependencies(dependencies)
6983            .with_policy(policy);
6984        if self
6985            .inner
6986            .result_blob_cache
6987            .put(RESULT_CACHE_BLOB_NAMESPACE, key, put)
6988            .is_err()
6989        {
6990            return;
6991        }
6992
6993        let mut cache = self.inner.result_blob_entries.write();
6994        let (ref mut map, ref mut order) = *cache;
6995        if !map.contains_key(key) {
6996            order.push_back(key.to_string());
6997        }
6998        map.insert(key.to_string(), entry);
6999        trim_result_cache(map, order);
7000    }
7001
7002    pub fn result_cache_shadow_divergences(&self) -> u64 {
7003        self.inner
7004            .result_cache_shadow_divergences
7005            .load(std::sync::atomic::Ordering::Relaxed)
7006    }
7007
7008    /// Invalidate the result cache (call after any write operation).
7009    /// Full clear — use for DDL (DROP TABLE, schema changes) or when table is unknown.
7010    pub fn invalidate_result_cache(&self) {
7011        let mut cache = self.inner.result_cache.write();
7012        cache.0.clear();
7013        cache.1.clear();
7014        let mut blob_entries = self.inner.result_blob_entries.write();
7015        blob_entries.0.clear();
7016        blob_entries.1.clear();
7017        self.inner
7018            .result_blob_cache
7019            .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
7020        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
7021        ask_entries.0.clear();
7022        ask_entries.1.clear();
7023        self.inner
7024            .result_blob_cache
7025            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
7026    }
7027
7028    /// Invalidate only result cache entries that declared a dependency on `table`.
7029    /// Cheaper than a full clear: unrelated tables keep their cached results.
7030    pub(crate) fn invalidate_result_cache_for_table(&self, table: &str) {
7031        // Hot path: probe both backends under read locks before taking any
7032        // write lock. The blob backend is node-local, like the legacy cache.
7033        let legacy_has_match = {
7034            let cache = self.inner.result_cache.read();
7035            let (ref map, _) = *cache;
7036            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
7037        };
7038        let blob_has_match = {
7039            let cache = self.inner.result_blob_entries.read();
7040            let (ref map, _) = *cache;
7041            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
7042        };
7043        if legacy_has_match {
7044            let mut cache = self.inner.result_cache.write();
7045            let (ref mut map, ref mut order) = *cache;
7046            map.retain(|_, entry| !entry.scopes.contains(table));
7047            order.retain(|key| map.contains_key(key));
7048        }
7049
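        // With the blob backend active, clear the whole namespace rather
        // than pruning per-table: the read path consults the blob cache
        // before the node-local map, so a local-only prune could let a
        // stale blob payload be decoded and re-admitted.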
7050        if matches!(
7051            self.result_cache_backend(),
7052            RuntimeResultCacheBackend::BlobCache | RuntimeResultCacheBackend::Shadow
7053        ) {
7054            let mut blob_entries = self.inner.result_blob_entries.write();
7055            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
7056            blob_map.clear();
7057            blob_order.clear();
7058            self.inner
7059                .result_blob_cache
7060                .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
7061        } else if blob_has_match {
7062            let mut blob_entries = self.inner.result_blob_entries.write();
7063            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
7064            blob_map.retain(|_, entry| !entry.scopes.contains(table));
7065            blob_order.retain(|key| blob_map.contains_key(key));
7066        }
7067        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
7068        ask_entries.0.clear();
7069        ask_entries.1.clear();
7070        self.inner
7071            .result_blob_cache
7072            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
7073    }
7074
7075    pub(crate) fn invalidate_plan_cache(&self) {
7076        self.inner.query_cache.write().clear();
7077        self.inner
7078            .ddl_epoch
7079            .fetch_add(1, std::sync::atomic::Ordering::Release);
7080    }
7081
7082    /// Read the monotonic DDL epoch counter. Bumped by every
7083    /// `invalidate_plan_cache` call so prepared-statement holders can
7084    /// detect schema drift between PREPARE and EXECUTE.
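    ///
    /// # Example
    ///
    /// A minimal sketch of drift detection around a prepared statement:
    ///
    /// ```ignore
    /// let epoch_at_prepare = runtime.ddl_epoch();
    /// // ... concurrent DDL may bump the epoch ...
    /// if runtime.ddl_epoch() != epoch_at_prepare {
    ///     // schema drift: re-plan before EXECUTE
    /// }
    /// ```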
7085    pub fn ddl_epoch(&self) -> u64 {
7086        self.inner
7087            .ddl_epoch
7088            .load(std::sync::atomic::Ordering::Acquire)
7089    }
7090
7091    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
7092        let store = self.inner.db.store();
7093        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7094        self.invalidate_plan_cache();
7095    }
7096
7097    /// Replay `tenant_tables.*.column` keys from red_config at boot so
7098    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
7099    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
7100    /// collection, picks the keys matching the tenant-marker shape,
7101    /// and calls `register_tenant_table` for each.
7102    ///
7103    /// Safe no-op when `red_config` doesn't exist (first boot on a
7104    /// fresh datadir).
7105    pub(crate) fn rehydrate_tenant_tables(&self) {
7106        let store = self.inner.db.store();
7107        let Some(manager) = store.get_collection("red_config") else {
7108            return;
7109        };
7110        // Replay in insertion order (SegmentManager iteration). Multiple
7111        // toggles on the same table leave several rows behind — the
7112        // last one processed wins because each register/unregister
7113        // call overwrites the in-memory state.
7114        for entity in manager.query_all(|_| true) {
7115            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
7116                continue;
7117            };
7118            let Some(named) = &row.named else { continue };
7119            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
7120                continue;
7121            };
7122            // Shape: tenant_tables.{table}.column
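            // e.g. key = "tenant_tables.orders.column", value = Text("org_id")
            // (illustrative) replays as `register_tenant_table("orders", "org_id")`.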
7123            let Some(rest) = key.strip_prefix("tenant_tables.") else {
7124                continue;
7125            };
7126            let Some((table, suffix)) = rest.rsplit_once('.') else {
7127                // Issue #205 — a `tenant_tables.*` row that doesn't
7128                // split cleanly is a schema-shape regression: the
7129                // metadata writer must always emit the `.column`
7130                // suffix, so reaching this branch means an upgrade
7131                // with incompatible state or external tampering.
7132                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7133                    collection: "red_config".to_string(),
7134                    detail: format!("malformed tenant_tables key: {key}"),
7135                }
7136                .emit_global();
7137                continue;
7138            };
7139            if suffix != "column" {
7140                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7141                    collection: "red_config".to_string(),
7142                    detail: format!("unexpected tenant_tables suffix: {key}"),
7143                }
7144                .emit_global();
7145                continue;
7146            }
7147            match named.get("value") {
7148                Some(crate::storage::schema::Value::Text(column)) => {
7149                    self.register_tenant_table(table, column);
7150                }
7151                // Null / missing value = DISABLE TENANCY marker.
7152                Some(crate::storage::schema::Value::Null) | None => {
7153                    self.unregister_tenant_table(table);
7154                }
7155                _ => {}
7156            }
7157        }
7158    }
7159
7160    pub(crate) fn rehydrate_declared_column_schemas(&self) {
7161        let store = self.inner.db.store();
7162        for contract in self.inner.db.collection_contracts() {
7163            let columns: Vec<String> = contract
7164                .declared_columns
7165                .iter()
7166                .map(|column| column.name.clone())
7167                .collect();
7168            let Some(manager) = store.get_collection(&contract.name) else {
7169                continue;
7170            };
7171            manager.set_column_schema_if_empty(columns);
7172        }
7173    }
7174
7175    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
7176    /// in-memory column mapping, the implicit RLS policy, and enables
7177    /// row-level security on the table. Idempotent — re-registering
7178    /// the same `(table, column)` replaces the prior auto-policy.
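    ///
    /// # Example
    ///
    /// A sketch of the effect of `CREATE TABLE orders (...) TENANT BY
    /// (org_id)`; the table and column names are illustrative.
    ///
    /// ```ignore
    /// runtime.register_tenant_table("orders", "org_id");
    /// assert_eq!(runtime.tenant_column("orders").as_deref(), Some("org_id"));
    /// ```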
7179    pub fn register_tenant_table(&self, table: &str, column: &str) {
7180        use crate::storage::query::ast::{
7181            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
7182        };
7183        self.inner
7184            .tenant_tables
7185            .write()
7186            .insert(table.to_string(), column.to_string());
7187
7188        // Build the policy: col = CURRENT_TENANT()
7189        // Uses CompareExpr so the comparison happens at runtime against
7190        // the thread-local tenant value read by the CURRENT_TENANT
7191        // scalar. Spans are synthetic — there's no source location for
7192        // an auto-generated policy.
7193        let lhs = Expr::Column {
7194            field: FieldRef::TableColumn {
7195                table: table.to_string(),
7196                column: column.to_string(),
7197            },
7198            span: Span::synthetic(),
7199        };
7200        let rhs = Expr::FunctionCall {
7201            name: "CURRENT_TENANT".to_string(),
7202            args: Vec::new(),
7203            span: Span::synthetic(),
7204        };
7205        let policy_filter = Filter::CompareExpr {
7206            lhs,
7207            op: CompareOp::Eq,
7208            rhs,
7209        };
7210
7211        let policy = CreatePolicyQuery {
7212            name: "__tenant_iso".to_string(),
7213            table: table.to_string(),
7214            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
7215            role: None,   // None = every role
7216            using: Box::new(policy_filter),
7217            // Auto-tenancy defaults to Table targets. Collections of
7218            // other kinds (graph / vector / queue / timeseries) that
7219            // opt in via `ALTER ... ENABLE TENANCY` should use the
7220            // matching kind — but for now we keep the auto-policy
7221            // kind-agnostic so the evaluator can apply it to any
7222            // entity living in the collection.
7223            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
7224        };
7225
7226        // Replace any prior auto-policy for this table (column rename).
7227        self.inner.rls_policies.write().insert(
7228            (table.to_string(), "__tenant_iso".to_string()),
7229            Arc::new(policy),
7230        );
7231        self.inner
7232            .rls_enabled_tables
7233            .write()
7234            .insert(table.to_string());
7235
7236        // Auto-build a hash index on the tenant column. Every read/write
7237        // against a tenant-scoped table carries an implicit
7238        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
7239        // index on that column is on the hot path of every query. Without
7240        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
7241        self.ensure_tenant_index(table, column);
7242    }
7243
7244    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
7245    /// Skipped when:
7246    ///   * the column is dotted (nested path — flat secondary indices
7247    ///     don't cover those today; RLS still works via the policy)
7248    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
7249    ///   * the user already registered an index whose first column matches
7250    ///     (avoids redundant duplicates of a user-defined composite)
7251    fn ensure_tenant_index(&self, table: &str, column: &str) {
7252        if column.contains('.') {
7253            return;
7254        }
7255        let index_name = format!("__tenant_idx_{table}");
7256        let registry = self.inner.index_store.list_indices(table);
7257        if registry.iter().any(|idx| idx.name == index_name) {
7258            return;
7259        }
7260        if registry
7261            .iter()
7262            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
7263        {
7264            return;
7265        }
7266
7267        let store = self.inner.db.store();
7268        let Some(manager) = store.get_collection(table) else {
7269            return;
7270        };
7271        let entities = manager.query_all(|_| true);
7272        let entity_fields: Vec<(
7273            crate::storage::unified::EntityId,
7274            Vec<(String, crate::storage::schema::Value)>,
7275        )> = entities
7276            .iter()
7277            .map(|e| {
7278                let fields = match &e.data {
7279                    crate::storage::EntityData::Row(row) => {
7280                        if let Some(ref named) = row.named {
7281                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
7282                        } else if let Some(ref schema) = row.schema {
7283                            schema
7284                                .iter()
7285                                .zip(row.columns.iter())
7286                                .map(|(k, v)| (k.clone(), v.clone()))
7287                                .collect()
7288                        } else {
7289                            Vec::new()
7290                        }
7291                    }
7292                    crate::storage::EntityData::Node(node) => node
7293                        .properties
7294                        .iter()
7295                        .map(|(k, v)| (k.clone(), v.clone()))
7296                        .collect(),
7297                    _ => Vec::new(),
7298                };
7299                (e.id, fields)
7300            })
7301            .collect();
7302
7303        let columns = vec![column.to_string()];
7304        if self
7305            .inner
7306            .index_store
7307            .create_index(
7308                &index_name,
7309                table,
7310                &columns,
7311                super::index_store::IndexMethodKind::Hash,
7312                false,
7313                &entity_fields,
7314            )
7315            .is_err()
7316        {
7317            return;
7318        }
7319        self.inner
7320            .index_store
7321            .register(super::index_store::RegisteredIndex {
7322                name: index_name,
7323                collection: table.to_string(),
7324                columns,
7325                method: super::index_store::IndexMethodKind::Hash,
7326                unique: false,
7327            });
7328        self.invalidate_plan_cache();
7329    }
7330
7331    /// Drop the auto-generated tenant index, if one exists. Called from
7332    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
7333    fn drop_tenant_index(&self, table: &str) {
7334        let index_name = format!("__tenant_idx_{table}");
7335        self.inner.index_store.drop_index(&index_name, table);
7336    }
7337
7338    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
7339    /// Used by the INSERT auto-fill path to know which column to
7340    /// populate with `current_tenant()` when the user didn't name it.
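    ///
    /// ```ignore
    /// // Sketch of that call site; `row` and the tenant-value helper are
    /// // illustrative, not this module's API.
    /// if let Some(col) = runtime.tenant_column("orders") {
    ///     row.entry(col).or_insert_with(current_tenant_value);
    /// }
    /// ```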
7341    pub fn tenant_column(&self, table: &str) -> Option<String> {
7342        self.inner.tenant_tables.read().get(table).cloned()
7343    }
7344
7345    /// Remove a table's tenant registration (Phase 2.5.4). Called by
7346    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
7347    /// but leaves any user-installed explicit policies intact.
7348    pub fn unregister_tenant_table(&self, table: &str) {
7349        self.inner.tenant_tables.write().remove(table);
7350        self.inner
7351            .rls_policies
7352            .write()
7353            .remove(&(table.to_string(), "__tenant_iso".to_string()));
7354        self.drop_tenant_index(table);
7355        // Only clear RLS enablement if no other policies remain.
7356        let has_other_policies = self
7357            .inner
7358            .rls_policies
7359            .read()
7360            .keys()
7361            .any(|(t, _)| t == table);
7362        if !has_other_policies {
7363            self.inner.rls_enabled_tables.write().remove(table);
7364        }
7365    }
7366
7367    /// Record that the running transaction has marked `id` in `collection`
7368    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
7369    /// xid that was written into `xmax` — either the parent txn xid or
7370    /// the innermost savepoint sub-xid. Savepoint rollback filters by
7371    /// this xid to revive only its own tombstones.
7372    pub(crate) fn record_pending_tombstone(
7373        &self,
7374        conn_id: u64,
7375        collection: &str,
7376        id: crate::storage::unified::entity::EntityId,
7377        stamper_xid: crate::storage::transaction::snapshot::Xid,
7378        previous_xmax: crate::storage::transaction::snapshot::Xid,
7379    ) {
7380        self.inner
7381            .pending_tombstones
7382            .write()
7383            .entry(conn_id)
7384            .or_default()
7385            .push((collection.to_string(), id, stamper_xid, previous_xmax));
7386    }
7387
7388    pub(crate) fn record_pending_versioned_update(
7389        &self,
7390        conn_id: u64,
7391        collection: &str,
7392        old_id: crate::storage::unified::entity::EntityId,
7393        new_id: crate::storage::unified::entity::EntityId,
7394        stamper_xid: crate::storage::transaction::snapshot::Xid,
7395        previous_xmax: crate::storage::transaction::snapshot::Xid,
7396    ) {
7397        self.inner
7398            .pending_versioned_updates
7399            .write()
7400            .entry(conn_id)
7401            .or_default()
7402            .push((
7403                collection.to_string(),
7404                old_id,
7405                new_id,
7406                stamper_xid,
7407                previous_xmax,
7408            ));
7409    }
7410
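    /// Run `f` with store-WAL emission deferred when the connection holds
    /// an open transaction: the captured actions are journaled per
    /// connection so COMMIT can append them durably via
    /// `flush_pending_store_wal_actions` and ROLLBACK can drop them via
    /// `discard_pending_store_wal_actions`. Autocommit callers run `f`
    /// directly with no capture.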
7411    fn with_deferred_store_wal_if_transaction<T>(
7412        &self,
7413        f: impl FnOnce() -> RedDBResult<T>,
7414    ) -> RedDBResult<T> {
7415        let conn_id = current_connection_id();
7416        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
7417            return f();
7418        }
7419
7420        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
7421        let result = f();
7422        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
7423        match result {
7424            Ok(value) => {
7425                self.record_pending_store_wal_actions(conn_id, captured);
7426                Ok(value)
7427            }
7428            Err(err) => Err(err),
7429        }
7430    }
7431
7432    fn with_deferred_store_wal_for_dml<T>(
7433        &self,
7434        capture_autocommit_events: bool,
7435        f: impl FnOnce() -> RedDBResult<T>,
7436    ) -> RedDBResult<T> {
7437        let conn_id = current_connection_id();
7438        if self.inner.tx_contexts.read().contains_key(&conn_id) {
7439            return self.with_deferred_store_wal_if_transaction(f);
7440        }
7441        if !capture_autocommit_events {
7442            return f();
7443        }
7444
7445        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
7446        let result = f();
7447        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
7448        self.inner
7449            .db
7450            .store()
7451            .append_deferred_store_wal_actions(captured)
7452            .map_err(|err| RedDBError::Internal(err.to_string()))?;
7453        result
7454    }
7455
7456    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
7457        !query.suppress_events
7458            && self.collection_has_event_subscriptions_for_operation(
7459                &query.table,
7460                crate::catalog::SubscriptionOperation::Insert,
7461            )
7462    }
7463
7464    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
7465        !query.suppress_events
7466            && self.collection_has_event_subscriptions_for_operation(
7467                &query.table,
7468                crate::catalog::SubscriptionOperation::Update,
7469            )
7470    }
7471
7472    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
7473        !query.suppress_events
7474            && self.collection_has_event_subscriptions_for_operation(
7475                &query.table,
7476                crate::catalog::SubscriptionOperation::Delete,
7477            )
7478    }
7479
7480    fn collection_has_event_subscriptions_for_operation(
7481        &self,
7482        collection: &str,
7483        operation: crate::catalog::SubscriptionOperation,
7484    ) -> bool {
7485        let Some(contract) = self.db().collection_contract_arc(collection) else {
7486            return false;
7487        };
7488        contract.subscriptions.iter().any(|subscription| {
7489            subscription.enabled
7490                && (subscription.ops_filter.is_empty()
7491                    || subscription.ops_filter.contains(&operation))
7492        })
7493    }
7494
7495    fn record_pending_store_wal_actions(
7496        &self,
7497        conn_id: u64,
7498        actions: crate::storage::unified::DeferredStoreWalActions,
7499    ) {
7500        if actions.is_empty() {
7501            return;
7502        }
7503        let mut guard = self.inner.pending_store_wal_actions.write();
7504        guard.entry(conn_id).or_default().extend(actions);
7505    }
7506
7507    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
7508        let Some(actions) = self
7509            .inner
7510            .pending_store_wal_actions
7511            .write()
7512            .remove(&conn_id)
7513        else {
7514            return Ok(());
7515        };
7516        self.inner
7517            .db
7518            .store()
7519            .append_deferred_store_wal_actions(actions)
7520            .map_err(|err| RedDBError::Internal(err.to_string()))
7521    }
7522
7523    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
7524        self.inner
7525            .pending_store_wal_actions
7526            .write()
7527            .remove(&conn_id);
7528    }
7529
7530    fn xid_conflicts_with_snapshot(
7531        &self,
7532        xid: crate::storage::transaction::snapshot::Xid,
7533        snapshot: &crate::storage::transaction::snapshot::Snapshot,
7534        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
7535    ) -> bool {
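        // A stamper xid conflicts when it is (1) a real xid, (2) not one
        // of our own, (3) not aborted, (4) no longer active (i.e. it
        // committed), and (5) invisible to our snapshot: it either began
        // after the snapshot's high-water mark or was still in flight
        // when the snapshot was taken.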
7536        xid != 0
7537            && !own_xids.contains(&xid)
7538            && !self.inner.snapshot_manager.is_aborted(xid)
7539            && !self.inner.snapshot_manager.is_active(xid)
7540            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
7541    }
7542
7543    fn conflict_error(
7544        collection: &str,
7545        logical_id: crate::storage::unified::entity::EntityId,
7546        xid: crate::storage::transaction::snapshot::Xid,
7547    ) -> RedDBError {
7548        RedDBError::Query(format!(
7549            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
7550            logical_id.raw()
7551        ))
7552    }
7553
7554    fn check_logical_row_conflict(
7555        &self,
7556        collection: &str,
7557        logical_id: crate::storage::unified::entity::EntityId,
7558        excluded_ids: &[crate::storage::unified::entity::EntityId],
7559        snapshot: &crate::storage::transaction::snapshot::Snapshot,
7560        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
7561    ) -> RedDBResult<()> {
7562        let store = self.inner.db.store();
7563        let Some(manager) = store.get_collection(collection) else {
7564            return Ok(());
7565        };
7566
7567        for candidate in manager.query_all(|_| true) {
7568            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
7569                continue;
7570            }
7571            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
7572                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
7573            }
7574            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
7575                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
7576            }
7577        }
7578        Ok(())
7579    }
7580
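    /// Commit-time write-conflict validation (first-committer-wins style):
    /// replay this connection's pending versioned updates and tombstones
    /// against the commit snapshot and raise a serialization conflict if a
    /// concurrently-committed transaction stamped the same logical row.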
7581    pub(crate) fn check_table_row_write_conflicts(
7582        &self,
7583        conn_id: u64,
7584        snapshot: &crate::storage::transaction::snapshot::Snapshot,
7585        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
7586    ) -> RedDBResult<()> {
7587        let versioned_updates = self
7588            .inner
7589            .pending_versioned_updates
7590            .read()
7591            .get(&conn_id)
7592            .cloned()
7593            .unwrap_or_default();
7594        let tombstones = self
7595            .inner
7596            .pending_tombstones
7597            .read()
7598            .get(&conn_id)
7599            .cloned()
7600            .unwrap_or_default();
7601
7602        let store = self.inner.db.store();
7603        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
7604            let Some(manager) = store.get_collection(&collection) else {
7605                continue;
7606            };
7607            let Some(old) = manager.get(old_id) else {
7608                continue;
7609            };
7610            let logical_id = old.logical_id();
7611            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
7612                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
7613            }
7614            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
7615                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
7616            }
7617            self.check_logical_row_conflict(
7618                &collection,
7619                logical_id,
7620                &[old_id, new_id],
7621                snapshot,
7622                own_xids,
7623            )?;
7624        }
7625
7626        for (collection, id, xid, previous_xmax) in tombstones {
7627            let Some(manager) = store.get_collection(&collection) else {
7628                continue;
7629            };
7630            let Some(entity) = manager.get(id) else {
7631                continue;
7632            };
7633            let logical_id = entity.logical_id();
7634            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
7635                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
7636            }
7637            if entity.xmax != xid
7638                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
7639            {
7640                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
7641            }
7642            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
7643        }
7644
7645        Ok(())
7646    }
7647
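    /// Re-apply the recorded `xmax` stamps to every entity referenced by
    /// this connection's pending versioned updates and tombstones. Each
    /// journal entry already carries the stamper xid it should wear, so
    /// this is a mechanical write-back through the collection managers.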
7648    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
7649        let versioned_updates = self
7650            .inner
7651            .pending_versioned_updates
7652            .read()
7653            .get(&conn_id)
7654            .cloned()
7655            .unwrap_or_default();
7656        let tombstones = self
7657            .inner
7658            .pending_tombstones
7659            .read()
7660            .get(&conn_id)
7661            .cloned()
7662            .unwrap_or_default();
7663
7664        let store = self.inner.db.store();
7665        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
7666            if let Some(manager) = store.get_collection(&collection) {
7667                if let Some(mut entity) = manager.get(old_id) {
7668                    entity.set_xmax(xid);
7669                    let _ = manager.update(entity);
7670                }
7671            }
7672        }
7673        for (collection, id, xid, _previous_xmax) in tombstones {
7674            if let Some(manager) = store.get_collection(&collection) {
7675                if let Some(mut entity) = manager.get(id) {
7676                    entity.set_xmax(xid);
7677                    let _ = manager.update(entity);
7678                }
7679            }
7680        }
7681    }
7682
7683    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
7684        self.inner
7685            .pending_versioned_updates
7686            .write()
7687            .remove(&conn_id);
7688    }
7689
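    /// ROLLBACK path for versioned UPDATEs: restore the old tuple's
    /// `xmax` (only when this transaction was the stamper) and physically
    /// delete the never-committed new row version.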
7690    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
7691        let Some(pending) = self
7692            .inner
7693            .pending_versioned_updates
7694            .write()
7695            .remove(&conn_id)
7696        else {
7697            return;
7698        };
7699
7700        let store = self.inner.db.store();
7701        for (collection, old_id, new_id, xid, previous_xmax) in pending {
7702            if let Some(manager) = store.get_collection(&collection) {
7703                if let Some(mut old) = manager.get(old_id) {
7704                    if old.xmax == xid {
7705                        old.set_xmax(previous_xmax);
7706                        let _ = manager.update(old);
7707                    }
7708                }
7709            }
7710            let _ = store.delete_batch(&collection, &[new_id]);
7711        }
7712    }
7713
7714    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
7715        let mut guard = self.inner.pending_versioned_updates.write();
7716        let Some(pending) = guard.get_mut(&conn_id) else {
7717            return 0;
7718        };
7719
7720        let store = self.inner.db.store();
7721        let mut reverted = 0usize;
7722        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
7723            if *xid < stamper_xid {
7724                return true;
7725            }
7726            if let Some(manager) = store.get_collection(collection) {
7727                if let Some(mut old) = manager.get(*old_id) {
7728                    if old.xmax == *xid {
7729                        old.set_xmax(*previous_xmax);
7730                        let _ = manager.update(old);
7731                    }
7732                }
7733            }
7734            let _ = store.delete_batch(collection, &[*new_id]);
7735            reverted += 1;
7736            false
7737        });
7738        if pending.is_empty() {
7739            guard.remove(&conn_id);
7740        }
7741        reverted
7742    }
7743
7744    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
7745    /// delete marker; commit only drops the rollback journal and emits
7746    /// side effects. Physical reclamation is left for VACUUM so old
7747    /// snapshots can still resolve the pre-delete row version.
7748    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
7749        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
7750            return;
7751        };
7752        if pending.is_empty() {
7753            return;
7754        }
7755
7756        let store = self.inner.db.store();
7757        for (collection, id, _xid, _previous_xmax) in pending {
7758            store.context_index().remove_entity(id);
7759            self.cdc_emit(
7760                crate::replication::cdc::ChangeOperation::Delete,
7761                &collection,
7762                id.raw(),
7763                "entity",
7764            );
7765        }
7766    }
7767
7768    /// Revive tombstones on ROLLBACK — restore each tuple's pre-stamp
7769    /// `xmax` so it becomes visible again. Best-effort: a row
7770    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
7771    /// never reclaims tuples whose xmax is still referenced by any
7772    /// active snapshot, so this case is only reachable via external
7773    /// storage corruption.
7774    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
7775        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
7776            return;
7777        };
7778
7779        let store = self.inner.db.store();
7780        for (collection, id, xid, previous_xmax) in pending {
7781            let Some(manager) = store.get_collection(&collection) else {
7782                continue;
7783            };
7784            if let Some(mut entity) = manager.get(id) {
7785                if entity.xmax == xid {
7786                    entity.set_xmax(previous_xmax);
7787                    let _ = manager.update(entity);
7788                }
7789            }
7790        }
7791    }
7792
7793    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
7794        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
7795            return;
7796        };
7797        for event in pending {
7798            self.cdc_emit_kv(
7799                event.op,
7800                &event.collection,
7801                &event.key,
7802                0,
7803                event.before,
7804                event.after,
7805            );
7806        }
7807    }
7808
7809    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
7810        self.inner.pending_kv_watch_events.write().remove(&conn_id);
7811    }
7812
7813    /// Materialise the entire graph store while applying MVCC visibility
7814    /// AND per-collection RLS to each candidate node and edge. Mirrors
7815    /// `materialize_graph` but routes every entity through the same
7816    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
7817    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
7818    /// edges). Returns the filtered `GraphStore` plus the
7819    /// `node_id → properties` map the executor needs for `RETURN n.*`
7820    /// projections.
7821    fn materialize_graph_with_rls(
7822        &self,
7823    ) -> RedDBResult<(
7824        crate::storage::engine::GraphStore,
7825        std::collections::HashMap<
7826            String,
7827            std::collections::HashMap<String, crate::storage::schema::Value>,
7828        >,
7829        crate::storage::query::unified::EdgeProperties,
7830    )> {
7831        use crate::storage::engine::GraphStore;
7832        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
7833        use crate::storage::unified::entity::{EntityData, EntityKind};
7834        use std::collections::{HashMap, HashSet};
7835
7836        let store = self.inner.db.store();
7837        let snap_ctx = capture_current_snapshot();
7838        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
7839
7840        let graph = GraphStore::new();
7841        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
7842            HashMap::new();
7843        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
7844        let mut allowed_nodes: HashSet<String> = HashSet::new();
7845
7846        // Per-collection cached compiled filters — Nodes-kind for
7847        // first pass, Edges-kind for the second. None entries mean
7848        // "RLS enabled, zero matching policy → deny all of this kind".
7849        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
7850            HashMap::new();
7851        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
7852            HashMap::new();
7853
7854        let collections = store.list_collections();
7855
7856        // First pass — gather nodes.
7857        for collection in &collections {
7858            let Some(manager) = store.get_collection(collection) else {
7859                continue;
7860            };
7861            let entities = manager.query_all(|_| true);
7862            for entity in entities {
7863                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
7864                    continue;
7865                }
7866                let EntityKind::GraphNode(ref node) = entity.kind else {
7867                    continue;
7868                };
7869                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
7870                    continue;
7871                }
7872                let id_str = entity.id.raw().to_string();
7873                graph
7874                    .add_node_with_label(
7875                        &id_str,
7876                        &node.label,
7877                        &super::graph_node_label(&node.node_type),
7878                    )
7879                    .map_err(|err| RedDBError::Query(err.to_string()))?;
7880                allowed_nodes.insert(id_str.clone());
7881                if let EntityData::Node(node_data) = &entity.data {
7882                    node_properties.insert(id_str, node_data.properties.clone());
7883                }
7884            }
7885        }
7886
7887        // Second pass — gather edges. An edge appears only when both
7888        // endpoint nodes survived the RLS pass AND the edge itself
7889        // passes its own RLS gate.
7890        for collection in &collections {
7891            let Some(manager) = store.get_collection(collection) else {
7892                continue;
7893            };
7894            let entities = manager.query_all(|_| true);
7895            for entity in entities {
7896                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
7897                    continue;
7898                }
7899                let EntityKind::GraphEdge(ref edge) = entity.kind else {
7900                    continue;
7901                };
7902                if !allowed_nodes.contains(&edge.from_node)
7903                    || !allowed_nodes.contains(&edge.to_node)
7904                {
7905                    continue;
7906                }
7907                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
7908                    continue;
7909                }
7910                let weight = match &entity.data {
7911                    EntityData::Edge(e) => e.weight,
7912                    _ => edge.weight as f32 / 1000.0,
7913                };
7914                let edge_label = super::graph_edge_label(&edge.label);
7915                graph
7916                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
7917                    .map_err(|err| RedDBError::Query(err.to_string()))?;
7918                if let EntityData::Edge(edge_data) = &entity.data {
7919                    edge_properties.insert(
7920                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
7921                        edge_data.properties.clone(),
7922                    );
7923                }
7924            }
7925        }
7926
7927        // Suppress unused-import warnings for PolicyAction /
7928        // PolicyTargetKind: their real uses live in the per-kind
7929        // helpers declared at the bottom of this file, not in this fn.
7930        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
7931
7932        Ok((graph, node_properties, edge_properties))
7933    }
7934
7935    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
7936    /// freshly-inserted entity when the current connection holds an
7937    /// open transaction. Used by graph / vector / queue / timeseries
7938    /// write paths that go through the DevX builder API (`db.node(...)
7939    /// .save()` and friends) — those live in the storage crate and
7940    /// can't reach `current_xid()` without crossing layers, so the
7941    /// application layer calls this helper right after `save()` to
7942    /// finalise the MVCC stamp.
7943    ///
7944    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
7945    /// write, so the non-transactional hot path stays untouched.
7946    ///
7947    /// Best-effort: if the collection or entity disappears between
7948    /// the save and the stamp (concurrent DROP), we silently skip.
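    ///
    /// ```ignore
    /// // Sketch: `db.node(...).save()` is the DevX chain named above;
    /// // `.set(...)`, `runtime`, and the literal names are illustrative.
    /// let id = db.node("users").set("name", "ada").save()?;
    /// runtime.stamp_xmin_if_in_txn("users", id);
    /// ```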
7949    pub(crate) fn stamp_xmin_if_in_txn(
7950        &self,
7951        collection: &str,
7952        id: crate::storage::unified::entity::EntityId,
7953    ) {
7954        let Some(xid) = self.current_xid() else {
7955            return;
7956        };
7957        let store = self.inner.db.store();
7958        let Some(manager) = store.get_collection(collection) else {
7959            return;
7960        };
7961        if let Some(mut entity) = manager.get(id) {
7962            entity.set_xmin(xid);
7963            let _ = manager.update(entity);
7964        }
7965    }
7966
7967    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
7968    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
7969    /// pending entries with `xid < stamper_xid` stay queued because
7970    /// they belong to the enclosing scope — they'll either flush on
7971    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
7972    ///
7973    /// Returns the number of tuples whose `xmax` was restored to its pre-stamp value.
7974    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
7975        let mut guard = self.inner.pending_tombstones.write();
7976        let Some(pending) = guard.get_mut(&conn_id) else {
7977            return 0;
7978        };
7979
7980        let store = self.inner.db.store();
7981        let mut revived = 0usize;
7982        pending.retain(|(collection, id, xid, previous_xmax)| {
7983            if *xid < stamper_xid {
7984                // Stamped before the savepoint — keep in queue.
7985                return true;
7986            }
7987            if let Some(manager) = store.get_collection(collection) {
7988                if let Some(mut entity) = manager.get(*id) {
7989                    if entity.xmax == *xid {
7990                        entity.set_xmax(*previous_xmax);
7991                        let _ = manager.update(entity);
7992                        revived += 1;
7993                    }
7994                }
7995            }
7996            false
7997        });
7998        if pending.is_empty() {
7999            guard.remove(&conn_id);
8000        }
8001        revived
8002    }
8003
8004    /// Return the snapshot the current connection should use for visibility
8005    /// checks (Phase 2.3 PG parity).
8006    ///
8007    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
8008    ///   the snapshot stored in its `TxnContext`.
8009    /// * Otherwise (autocommit), capture a fresh snapshot bounded by the
8010    ///   next-xid high-water mark: committed xids stay visible, in-flight
8011    ///   xids stay hidden, and pre-MVCC rows remain always visible.
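    ///
    /// ```ignore
    /// // Sketch of the visibility gate (tuple is illustrative): a committed
    /// // writer sits below the high-water mark and outside `in_progress`.
    /// let snap = runtime.current_snapshot();
    /// let visible = tuple.xmin <= snap.xid && !snap.in_progress.contains(&tuple.xmin);
    /// ```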
8012    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
8013        let conn_id = current_connection_id();
8014        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
8015            return ctx.snapshot;
8016        }
8017        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
8018        // every already-committed xid (which is strictly less) passes the
8019        // `xmin <= snap.xid` gate, while concurrently-active xids land in
8020        // the `in_progress` set and stay hidden until they commit. Using
8021        // xid=0 would incorrectly hide every MVCC-stamped tuple.
8022        let high_water = self.inner.snapshot_manager.peek_next_xid();
8023        self.inner.snapshot_manager.snapshot(high_water)
8024    }
8025
8026    /// Xid of the current connection's active transaction, or `None` when
8027    /// running outside a BEGIN/COMMIT block. Write paths call this to
8028    /// decide whether to stamp `xmin`/`xmax` on tuples.
8029    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
8030    /// sub-xid so new writes can be selectively rolled back. Otherwise
8031    /// the parent txn's xid is returned, matching pre-savepoint
8032    /// behaviour. Callers that need the enclosing *transaction* xid
8033    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
8034    /// directly.
8035    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
8036        let conn_id = current_connection_id();
8037        self.inner
8038            .tx_contexts
8039            .read()
8040            .get(&conn_id)
8041            .map(|ctx| ctx.writer_xid())
8042    }
8043
8044    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
8045    /// the oldest-active xid when reclaiming dead tuples.
8046    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
8047        Arc::clone(&self.inner.snapshot_manager)
8048    }
8049
8050    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
8051        let manager = &self.inner.snapshot_manager;
8052        let next_xid = manager.peek_next_xid();
8053        let mut cutoff = next_xid;
8054        if let Some(oldest_active) = manager.oldest_active_xid() {
8055            cutoff = cutoff.min(oldest_active);
8056        }
8057        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
8058            cutoff = cutoff.min(oldest_pinned);
8059        }
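        // Worked example: next_xid = 100, oldest_active = 70, nothing
        // pinned, retention_xids = 50 → cutoff = min(100, 70, 100 - 50) = 50.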
8060        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
8061        if retention_xids > 0 {
8062            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
8063        }
8064        cutoff
8065    }
8066
8067    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
8068        let registered = self.inner.index_store.list_indices(table);
8069        if registered.is_empty() {
8070            return Ok(());
8071        }
8072        let store = self.inner.db.store();
8073        let Some(manager) = store.get_collection(table) else {
8074            return Ok(());
8075        };
8076        let entity_fields = manager
8077            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
8078            .into_iter()
8079            .map(|entity| (entity.id, table_row_index_fields(&entity)))
8080            .collect::<Vec<_>>();
8081
8082        for index in registered {
8083            self.inner.index_store.drop_index(&index.name, table);
8084            self.inner
8085                .index_store
8086                .create_index(
8087                    &index.name,
8088                    table,
8089                    &index.columns,
8090                    index.method,
8091                    index.unique,
8092                    &entity_fields,
8093                )
8094                .map_err(RedDBError::Internal)?;
8095            self.inner.index_store.register(index);
8096        }
8097        self.invalidate_plan_cache();
8098        Ok(())
8099    }
8100
8101    /// Own-tx xids (parent + open/released savepoints) for the current
8102    /// connection. Transports + tests that build a `SnapshotContext`
8103    /// manually (outside the `execute_query` scope) need this set so
8104    /// the writer's own uncommitted tuples remain visible to that writer.
8105    pub fn current_txn_own_xids(
8106        &self,
8107    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
8108        let mut set = std::collections::HashSet::new();
8109        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
8110            set.insert(ctx.xid);
8111            for (_, sub) in &ctx.savepoints {
8112                set.insert(*sub);
8113            }
8114            for sub in &ctx.released_sub_xids {
8115                set.insert(*sub);
8116            }
8117        }
8118        set
8119    }
8120
8121    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
8122    ///
8123    /// Callers use this to check whether a table name is a registered
8124    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
8125    /// scan it (`registry.scan(name)`). The read-path rewriter consults
8126    /// this before dispatching into native-collection lookup.
8127    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
8128        Arc::clone(&self.inner.foreign_tables)
8129    }
8130
8131    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
8132    pub fn is_rls_enabled(&self, table: &str) -> bool {
8133        self.inner.rls_enabled_tables.read().contains(table)
8134    }
8135
8136    /// Collect the USING predicates that apply to this `(table, role, action)`.
8137    ///
8138    /// Returned filters should be OR-combined (a row passes RLS when *any*
8139    /// matching policy accepts it) and then AND-ed into the query's WHERE.
8140    /// When the table has RLS disabled this returns an empty Vec — callers
8141    /// can fast-path back to the unfiltered read.
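    ///
    /// ```ignore
    /// // Sketch of the OR-then-AND folding; `Filter::or` / `Filter::and`
    /// // combinators are assumptions, not necessarily this AST's API.
    /// let ps = rt.matching_rls_policies("orders", Some("analyst"), PolicyAction::Select);
    /// if let Some(rls) = ps.into_iter().reduce(Filter::or) {
    ///     predicate = Filter::and(predicate, rls);
    /// }
    /// ```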
8142    pub fn matching_rls_policies(
8143        &self,
8144        table: &str,
8145        role: Option<&str>,
8146        action: crate::storage::query::ast::PolicyAction,
8147    ) -> Vec<crate::storage::query::ast::Filter> {
8148        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
8149        // callers that don't name a kind only see Table-scoped
8150        // policies, which is what the SELECT / UPDATE / DELETE
8151        // executors expect.
8152        self.matching_rls_policies_for_kind(
8153            table,
8154            role,
8155            action,
8156            crate::storage::query::ast::PolicyTargetKind::Table,
8157        )
8158    }
8159
8160    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
8161    ///
8162    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
8163    /// `Vectors`, queue consumers request `Messages`, and timeseries
8164    /// range scans request `Points`. Policies tagged with a
8165    /// different kind are skipped so a graph-scoped policy doesn't
8166    /// accidentally gate a table SELECT on the same collection.
8167    pub fn matching_rls_policies_for_kind(
8168        &self,
8169        table: &str,
8170        role: Option<&str>,
8171        action: crate::storage::query::ast::PolicyAction,
8172        kind: crate::storage::query::ast::PolicyTargetKind,
8173    ) -> Vec<crate::storage::query::ast::Filter> {
8174        if !self.is_rls_enabled(table) {
8175            return Vec::new();
8176        }
8177        let policies = self.inner.rls_policies.read();
8178        policies
8179            .iter()
8180            .filter_map(|((t, _), p)| {
8181                if t != table {
8182                    return None;
8183                }
8184                // Kind gate — a policy applies when its target kind
8185                // matches the requested kind, with one exception:
8186                // Table-kind policies apply to *every* kind. Auto-
8187                // tenancy stamps its policy as Table while graph /
8188                // vector / queue / timeseries scans pass their
8189                // concrete kind, so the Table escape hatch keeps
8190                // those scans tenant-gated (and stays backwards
8191                // compatible with pre-kind policies).
8192                if p.target_kind != kind
8193                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
8194                {
8195                    return None;
8196                }
8197                // Action gate — `None` means "ALL" actions.
8198                if let Some(a) = p.action {
8199                    if a != action {
8200                        return None;
8201                    }
8202                }
8203                // Role gate — `None` means "any role".
8204                if let Some(p_role) = p.role.as_deref() {
8205                    match role {
8206                        Some(r) if r == p_role => {}
8207                        _ => return None,
8208                    }
8209                }
8210                Some((*p.using).clone())
8211            })
8212            .collect()
8213    }
8214
8215    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
8216        let store = self.inner.db.store();
8217        if let Some(stats) =
8218            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
8219        {
8220            crate::storage::query::planner::stats_catalog::persist_table_stats(
8221                store.as_ref(),
8222                &stats,
8223            );
8224        } else {
8225            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
8226        }
8227        self.invalidate_plan_cache();
8228    }
8229
8230    pub(crate) fn note_table_write(&self, table: &str) {
8231        // Skip the write lock when the table is already marked
8232        // dirty. With single-row UPDATEs in a loop this used to
8233        // grab the planner_dirty_tables write lock N times even
8234        // though the first call already flipped the flag.
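        // The remaining check-then-insert race is benign: two writers can
        // both miss and both insert, but `HashSet::insert` is idempotent.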
8235        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
8236        if !already_dirty {
8237            self.inner
8238                .planner_dirty_tables
8239                .write()
8240                .insert(table.to_string());
8241        }
8242        self.invalidate_result_cache_for_table(table);
8243    }
8244
8245    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
8246    /// `RuntimeQueryResult` so callers over the SQL interface see the
8247    /// plan tree in the same shape a SELECT produces.
8248    ///
8249    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
8250    /// Nodes are walked depth-first; `depth` counts from 0 at the
8251    /// root so a text renderer can indent without re-walking.
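    ///
    /// Illustrative shape only (op names vary by plan; `CteScan` rows are
    /// the prepended WITH markers described in the body):
    ///
    /// ```text
    /// op      | source | est_rows | est_cost | depth
    /// CteScan | recent | 0        | 0.0      | 0
    /// ```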
8252    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
8253        let explain = self.explain_query(inner_sql)?;
8254
8255        let columns = vec![
8256            "op".to_string(),
8257            "source".to_string(),
8258            "est_rows".to_string(),
8259            "est_cost".to_string(),
8260            "depth".to_string(),
8261        ];
8262
8263        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
8264
8265        // Prepend `CteScan` markers when the query carried a leading
8266        // WITH clause. The CTE bodies are already inlined into the
8267        // main plan tree, but operators reading EXPLAIN need to see
8268        // which named CTEs were resolved — without this row the plan
8269            // which named CTEs were resolved — without these rows the plan
8270        for name in &explain.cte_materializations {
8271            use std::sync::Arc;
8272            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
8273            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
8274            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
8275            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
8276            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
8277            rec.set_arc(Arc::from("depth"), Value::Integer(0));
8278            records.push(rec);
8279        }
8280
8281        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
8282
8283        let result = crate::storage::query::unified::UnifiedResult {
8284            columns,
8285            records,
8286            stats: Default::default(),
8287            pre_serialized_json: None,
8288        };
8289
8290        Ok(RuntimeQueryResult {
8291            query: raw_query.to_string(),
8292            mode: explain.mode,
8293            statement: "explain",
8294            engine: "runtime-explain",
8295            result,
8296            affected_rows: 0,
8297            statement_type: "select",
8298        })
8299    }
8300
8301    // -----------------------------------------------------------------
8302    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
8303    // -----------------------------------------------------------------
8304
8305    /// Project a `QueryExpr` to the (action, resource) pair the
8306    /// privilege engine cares about. Returns `Ok(())` for statements
8307    /// that don't touch user data (transaction control, SHOW, SET, etc.).
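    ///
    /// e.g. a plain table SELECT maps to `(Action::Select,
    /// Resource::table_from_name(&t.table))`, while transaction control
    /// and SHOW fall through to the catch-all `Ok(())` arm at the bottom.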
8308    pub(super) fn check_query_privilege(
8309        &self,
8310        expr: &crate::storage::query::ast::QueryExpr,
8311    ) -> Result<(), String> {
8312        use crate::auth::privileges::{Action, AuthzContext, Resource};
8313        use crate::auth::UserId;
8314        use crate::storage::query::ast::QueryExpr;
8315
8316        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
8317        // The bootstrap path itself goes through `execute_query` so this
8318        // is the only sensible default; once auth is wired, the gate
8319        // becomes active.
8320        let auth_store = match self.inner.auth_store.read().clone() {
8321            Some(s) => s,
8322            None => return Ok(()),
8323        };
8324
8325        // Resolve principal + role from the thread-local identity.
8326        // A missing identity (embedded callers, test harnesses, the
8327        // bootstrap path before any user exists) bypasses the gate
8328        // entirely; transports are responsible for installing the
8329        // identity before dispatching authenticated statements.
8330        let (username, role) = match current_auth_identity() {
8331            Some(p) => p,
8332            None => return Ok(()),
8333        };
8334        let tenant = current_tenant();
8335
8336        let ctx = AuthzContext {
8337            principal: &username,
8338            effective_role: role,
8339            tenant: tenant.as_deref(),
8340        };
8341        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
8342
8343        // Map QueryExpr → (Action, Resource).
8344        let (action, resource) = match expr {
8345            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
8346            QueryExpr::QueueSelect(q) => (Action::Select, Resource::table_from_name(&q.queue)),
8347            QueryExpr::Graph(g) => {
8348                if auth_store.iam_authorization_enabled() {
8349                    self.check_graph_property_projection_privilege(
8350                        &auth_store,
8351                        &principal_id,
8352                        role,
8353                        tenant.as_deref(),
8354                        g,
8355                    )?;
8356                    return Ok(());
8357                }
8358                return Ok(());
8359            }
8360            QueryExpr::Vector(v) => {
8361                if auth_store.iam_authorization_enabled() {
8362                    self.check_table_like_column_projection_privilege(
8363                        &auth_store,
8364                        &principal_id,
8365                        role,
8366                        tenant.as_deref(),
8367                        &v.collection,
8368                        &["content".to_string()],
8369                    )?;
8370                    return Ok(());
8371                }
8372                return Ok(());
8373            }
8374            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
8375            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
8376            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
8377            // Joins inherit the read privilege from any constituent
8378            // table — for now we emit a single Select on the database
8379            // (admins bypass; non-admins need a Database/Schema grant).
8380            QueryExpr::Join(_) => (Action::Select, Resource::Database),
8381            // GRANT / REVOKE / ALTER USER are authority statements;
8382            // require Admin (the helper methods enforce).
8383            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
8384                return if role == crate::auth::Role::Admin {
8385                    Ok(())
8386                } else {
8387                    Err(format!(
8388                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
8389                        username, role
8390                    ))
8391                };
8392            }
8393            QueryExpr::CreateIamPolicy { id, .. } => {
8394                return self.check_policy_management_privilege(
8395                    &auth_store,
8396                    &principal_id,
8397                    role,
8398                    tenant.as_deref(),
8399                    "policy:put",
8400                    "policy",
8401                    id,
8402                );
8403            }
8404            QueryExpr::DropIamPolicy { id } => {
8405                return self.check_policy_management_privilege(
8406                    &auth_store,
8407                    &principal_id,
8408                    role,
8409                    tenant.as_deref(),
8410                    "policy:drop",
8411                    "policy",
8412                    id,
8413                );
8414            }
8415            QueryExpr::AttachPolicy { policy_id, .. } => {
8416                return self.check_policy_management_privilege(
8417                    &auth_store,
8418                    &principal_id,
8419                    role,
8420                    tenant.as_deref(),
8421                    "policy:attach",
8422                    "policy",
8423                    policy_id,
8424                );
8425            }
8426            QueryExpr::DetachPolicy { policy_id, .. } => {
8427                return self.check_policy_management_privilege(
8428                    &auth_store,
8429                    &principal_id,
8430                    role,
8431                    tenant.as_deref(),
8432                    "policy:detach",
8433                    "policy",
8434                    policy_id,
8435                );
8436            }
8437            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
8438                return Ok(());
8439            }
8440            QueryExpr::SimulatePolicy { .. } => {
8441                return self.check_policy_management_privilege(
8442                    &auth_store,
8443                    &principal_id,
8444                    role,
8445                    tenant.as_deref(),
8446                    "policy:simulate",
8447                    "policy",
8448                    "*",
8449                );
8450            }
8451            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
8452            // when IAM mode is active. Other DDL stays role-only for now.
8453            QueryExpr::DropTable(q) => {
8454                return self.check_ddl_collection_privilege(
8455                    &auth_store,
8456                    &principal_id,
8457                    role,
8458                    tenant.as_deref(),
8459                    &username,
8460                    "drop",
8461                    &q.name,
8462                );
8463            }
8464            QueryExpr::DropGraph(q) => {
8465                return self.check_ddl_collection_privilege(
8466                    &auth_store,
8467                    &principal_id,
8468                    role,
8469                    tenant.as_deref(),
8470                    &username,
8471                    "drop",
8472                    &q.name,
8473                );
8474            }
8475            QueryExpr::DropVector(q) => {
8476                return self.check_ddl_collection_privilege(
8477                    &auth_store,
8478                    &principal_id,
8479                    role,
8480                    tenant.as_deref(),
8481                    &username,
8482                    "drop",
8483                    &q.name,
8484                );
8485            }
8486            QueryExpr::DropDocument(q) => {
8487                return self.check_ddl_collection_privilege(
8488                    &auth_store,
8489                    &principal_id,
8490                    role,
8491                    tenant.as_deref(),
8492                    &username,
8493                    "drop",
8494                    &q.name,
8495                );
8496            }
8497            QueryExpr::DropKv(q) => {
8498                return self.check_ddl_collection_privilege(
8499                    &auth_store,
8500                    &principal_id,
8501                    role,
8502                    tenant.as_deref(),
8503                    &username,
8504                    "drop",
8505                    &q.name,
8506                );
8507            }
8508            QueryExpr::DropCollection(q) => {
8509                return self.check_ddl_collection_privilege(
8510                    &auth_store,
8511                    &principal_id,
8512                    role,
8513                    tenant.as_deref(),
8514                    &username,
8515                    "drop",
8516                    &q.name,
8517                );
8518            }
8519            QueryExpr::Truncate(q) => {
8520                return self.check_ddl_collection_privilege(
8521                    &auth_store,
8522                    &principal_id,
8523                    role,
8524                    tenant.as_deref(),
8525                    &username,
8526                    "truncate",
8527                    &q.name,
8528                );
8529            }
8530            // Remaining DDL — gate on Write role. Fine-grained grants TBD.
8531            QueryExpr::CreateTable(_)
8532            | QueryExpr::CreateCollection(_)
8533            | QueryExpr::CreateVector(_)
8534            | QueryExpr::AlterTable(_)
8535            | QueryExpr::CreateIndex(_)
8536            | QueryExpr::DropIndex(_)
8537            | QueryExpr::CreateSchema(_)
8538            | QueryExpr::DropSchema(_)
8539            | QueryExpr::CreateSequence(_)
8540            | QueryExpr::DropSequence(_)
8541            | QueryExpr::CreateView(_)
8542            | QueryExpr::DropView(_)
8543            | QueryExpr::RefreshMaterializedView(_)
8544            | QueryExpr::CreatePolicy(_)
8545            | QueryExpr::DropPolicy(_)
8546            | QueryExpr::CreateServer(_)
8547            | QueryExpr::DropServer(_)
8548            | QueryExpr::CreateForeignTable(_)
8549            | QueryExpr::DropForeignTable(_)
8550            | QueryExpr::CreateTimeSeries(_)
8551            | QueryExpr::DropTimeSeries(_)
8552            | QueryExpr::CreateQueue(_)
8553            | QueryExpr::AlterQueue(_)
8554            | QueryExpr::DropQueue(_)
8555            | QueryExpr::CreateTree(_)
8556            | QueryExpr::DropTree(_) => {
8557                return if role >= crate::auth::Role::Write {
8558                    Ok(())
8559                } else {
8560                    Err(format!(
8561                        "principal=`{}` role=`{:?}` cannot issue DDL",
8562                        username, role
8563                    ))
8564                };
8565            }
8566            // Migration DDL — CREATE MIGRATION requires Write role (schema author).
8567            QueryExpr::CreateMigration(_) => {
8568                return if role >= crate::auth::Role::Write {
8569                    Ok(())
8570                } else {
8571                    Err(format!(
8572                        "principal=`{}` role=`{:?}` cannot issue CREATE MIGRATION",
8573                        username, role
8574                    ))
8575                };
8576            }
8577            // APPLY / ROLLBACK change data and schema — require Admin.
8578            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
8579                return if role == crate::auth::Role::Admin {
8580                    Ok(())
8581                } else {
8582                    Err(format!(
8583                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
8584                        username, role
8585                    ))
8586                };
8587            }
8588            // EXPLAIN MIGRATION is read-only — any authenticated principal.
8589            QueryExpr::ExplainMigration(_) => return Ok(()),
8590            // Everything else (SET, SHOW, transaction control, graph
8591            // commands, queue/tree commands, MaintenanceCommand …)
8592            // is allowed for any authenticated principal.
8593            _ => return Ok(()),
8594        };
8595
8596        if auth_store.iam_authorization_enabled() {
8597            let iam_action = legacy_action_to_iam(action);
8598            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
8599            let iam_ctx = runtime_iam_context(role, tenant.as_deref());
8600            if !auth_store.check_policy_authz(&principal_id, iam_action, &iam_resource, &iam_ctx) {
8601                return Err(format!(
8602                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
8603                    username, iam_action, iam_resource.kind, iam_resource.name
8604                ));
8605            }
8606
8607            if let QueryExpr::Table(table) = expr {
8608                self.check_table_column_projection_privilege(
8609                    &auth_store,
8610                    &principal_id,
8611                    &iam_ctx,
8612                    table,
8613                )?;
8614            }
8615
8616            if let QueryExpr::Update(update) = expr {
8617                let columns = update_set_target_columns(update);
8618                if !columns.is_empty() {
8619                    let request = column_access_request_for_table_update(&update.table, columns);
8620                    let outcome =
8621                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
8622                    if let Some(denied) = outcome.first_denied_column() {
8623                        return Err(format!(
8624                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
8625                            username, iam_action, denied.resource.kind, denied.resource.name
8626                        ));
8627                    }
8628                    if !outcome.allowed() {
8629                        return Err(format!(
8630                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
8631                            username,
8632                            iam_action,
8633                            outcome.table_resource.kind,
8634                            outcome.table_resource.name
8635                        ));
8636                    }
8637                }
8638
8639                if let Some(columns) = update_returning_columns_for_policy(self, update) {
8640                    let request = column_access_request_for_table_select(&update.table, columns);
8641                    let outcome =
8642                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
8643                    if let Some(denied) = outcome.first_denied_column() {
8644                        return Err(format!(
8645                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
8646                            username, denied.resource.kind, denied.resource.name
8647                        ));
8648                    }
8649                    if !outcome.allowed() {
8650                        return Err(format!(
8651                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
8652                            username, outcome.table_resource.kind, outcome.table_resource.name
8653                        ));
8654                    }
8655                }
8656            }
8657
8658            Ok(())
8659        } else {
8660            auth_store
8661                .check_grant(&ctx, action, &resource)
8662                .map_err(|e| e.to_string())
8663        }
8664    }
8665
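    /// Column-level SELECT gate for a plain table scan. Collects every
    /// column the query references, asks the AuthStore for a projection
    /// decision, and denies either when the table-level decision itself
    /// fails or when any requested column is explicitly denied.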
8666    fn check_table_column_projection_privilege(
8667        &self,
8668        auth_store: &Arc<crate::auth::store::AuthStore>,
8669        principal: &crate::auth::UserId,
8670        ctx: &crate::auth::policies::EvalContext,
8671        table: &crate::storage::query::ast::TableQuery,
8672    ) -> Result<(), String> {
8673        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
8674
8675        let columns = requested_table_columns_for_policy(table);
8676        if columns.is_empty() {
8677            return Ok(());
8678        }
8679
8680        let request = ColumnAccessRequest::select(table.table.clone(), columns);
8681        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
8682        if outcome.allowed() {
8683            return Ok(());
8684        }
8685
8686        if !matches!(
8687            outcome.table_decision,
8688            crate::auth::policies::Decision::Allow { .. }
8689                | crate::auth::policies::Decision::AdminBypass
8690        ) {
8691            return Err(format!(
8692                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
8693                principal, outcome.table_resource.kind, outcome.table_resource.name
8694            ));
8695        }
8696
8697        let denied = outcome
8698            .first_denied_column()
8699            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
8700        match denied {
8701            Some(decision) => Err(format!(
8702                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
8703                principal, decision.resource.kind, decision.resource.name
8704            )),
8705            None => Ok(()),
8706        }
8707    }
8708
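    /// Column gate for graph queries: only explicitly projected
    /// properties are checked, evaluated against the synthetic `graph`
    /// table via the table-like helper below.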
8709    fn check_graph_property_projection_privilege(
8710        &self,
8711        auth_store: &Arc<crate::auth::store::AuthStore>,
8712        principal: &crate::auth::UserId,
8713        role: crate::auth::Role,
8714        tenant: Option<&str>,
8715        query: &crate::storage::query::ast::GraphQuery,
8716    ) -> Result<(), String> {
8717        let columns = explicit_graph_projection_properties(query);
8718        if columns.is_empty() {
8719            return Ok(());
8720        }
8721        self.check_table_like_column_projection_privilege(
8722            auth_store, principal, role, tenant, "graph", &columns,
8723        )
8724    }
8725
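    /// Shared column-projection check for table-like surfaces. Builds a
    /// SELECT `ColumnAccessRequest` over `table`/`columns` and reports
    /// the first denied column, falling back to `<table>.<unknown>`
    /// when the evaluator names no specific column.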
8726    fn check_table_like_column_projection_privilege(
8727        &self,
8728        auth_store: &Arc<crate::auth::store::AuthStore>,
8729        principal: &crate::auth::UserId,
8730        role: crate::auth::Role,
8731        tenant: Option<&str>,
8732        table: &str,
8733        columns: &[String],
8734    ) -> Result<(), String> {
8735        let iam_ctx = runtime_iam_context(role, tenant);
8736        let request =
8737            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
8738        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
8739        if outcome.allowed() {
8740            return Ok(());
8741        }
8742        let denied = outcome
8743            .first_denied_column()
8744            .map(|d| d.resource.name.clone())
8745            .unwrap_or_else(|| format!("{table}.<unknown>"));
8746        Err(format!(
8747            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
8748            principal, denied
8749        ))
8750    }
8751
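    /// Gate for ACL / auth DDL. Legacy mode (IAM disabled): Admin only.
    /// IAM mode: requires a policy allowing `action` on
    /// `resource_kind:resource_name`, tenant-scoped when a tenant is
    /// bound.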
8752    fn check_policy_management_privilege(
8753        &self,
8754        auth_store: &Arc<crate::auth::store::AuthStore>,
8755        principal: &crate::auth::UserId,
8756        role: crate::auth::Role,
8757        tenant: Option<&str>,
8758        action: &str,
8759        resource_kind: &str,
8760        resource_name: &str,
8761    ) -> Result<(), String> {
8762        if !auth_store.iam_authorization_enabled() {
8763            return if role == crate::auth::Role::Admin {
8764                Ok(())
8765            } else {
8766                Err(format!(
8767                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
8768                    principal, role
8769                ))
8770            };
8771        }
8772
8773        let mut resource = crate::auth::policies::ResourceRef::new(
8774            resource_kind.to_string(),
8775            resource_name.to_string(),
8776        );
8777        if let Some(t) = tenant {
8778            resource = resource.with_tenant(t.to_string());
8779        }
8780        let ctx = runtime_iam_context(role, tenant);
8781        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
8782            Ok(())
8783        } else {
8784            Err(format!(
8785                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
8786                principal, action, resource.kind, resource.name
8787            ))
8788        }
8789    }
8790
8791    /// IAM privilege check for DROP / TRUNCATE on a named collection.
8792    ///
8793    /// In legacy mode (IAM not enabled): requires Write role.
8794    /// In IAM mode: requires an explicit `drop` / `truncate` policy on
8795    /// `collection:<name>` (Admin role auto-passes via AdminBypass).
8796    /// Records an audit log entry for both allow and deny outcomes.
8797    fn check_ddl_collection_privilege(
8798        &self,
8799        auth_store: &Arc<crate::auth::store::AuthStore>,
8800        principal: &crate::auth::UserId,
8801        role: crate::auth::Role,
8802        tenant: Option<&str>,
8803        username: &str,
8804        action: &str,
8805        collection: &str,
8806    ) -> Result<(), String> {
8807        if role < crate::auth::Role::Write {
8808            let msg = format!(
8809                "principal=`{}` role=`{:?}` cannot issue DDL",
8810                username, role
8811            );
8812            self.inner.audit_log.record(
8813                action,
8814                username,
8815                collection,
8816                "denied",
8817                crate::json::Value::Null,
8818            );
8819            return Err(msg);
8820        }
8821
8822        if !auth_store.iam_authorization_enabled() {
8823            self.inner.audit_log.record(
8824                action,
8825                username,
8826                collection,
8827                "ok",
8828                crate::json::Value::Null,
8829            );
8830            return Ok(());
8831        }
8832
8833        let resource_name = collection.to_string();
8834        let mut resource = crate::auth::policies::ResourceRef::new(
8835            "collection".to_string(),
8836            resource_name.clone(),
8837        );
8838        if let Some(t) = tenant {
8839            resource = resource.with_tenant(t.to_string());
8840        }
8841        let ctx = runtime_iam_context(role, tenant);
8842        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
8843            self.inner.audit_log.record(
8844                action,
8845                username,
8846                &resource_name,
8847                "ok",
8848                crate::json::Value::Null,
8849            );
8850            Ok(())
8851        } else {
8852            self.inner.audit_log.record(
8853                action,
8854                username,
8855                &resource_name,
8856                "denied",
8857                crate::json::Value::Null,
8858            );
8859            Err(format!(
8860                "principal=`{}` action=`{}` resource=`collection:{}` denied by IAM policy",
8861                username, action, resource_name
8862            ))
8863        }
8864    }
8865
8866    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
8867    fn execute_grant_statement(
8868        &self,
8869        query: &str,
8870        stmt: &crate::storage::query::ast::GrantStmt,
8871    ) -> RedDBResult<RuntimeQueryResult> {
8872        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
8873        use crate::auth::UserId;
8874        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
8875
8876        let auth_store = self
8877            .inner
8878            .auth_store
8879            .read()
8880            .clone()
8881            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8882
8883        // Granter identity + role.
8884        let (gname, grole) = current_auth_identity().ok_or_else(|| {
8885            RedDBError::Query("GRANT requires an authenticated principal".to_string())
8886        })?;
8887        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
8888        let granter_role = grole;
8889
8890        // Build the action set.
8891        let mut actions: Vec<Action> = Vec::new();
8892        if stmt.all {
8893            actions.push(Action::All);
8894        } else {
8895            for kw in &stmt.actions {
8896                let a = Action::from_keyword(kw).ok_or_else(|| {
8897                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
8898                })?;
8899                actions.push(a);
8900            }
8901        }
8902
        // Apply the grant to every (object, principal) pair. Audit is
        // emitted per pair below (printed via `tracing`; structured
        // emission is Agent #4's lane).
8904        let mut applied = 0usize;
8905        for obj in &stmt.objects {
8906            let resource = match stmt.object_kind {
8907                GrantObjectKind::Table => Resource::Table {
8908                    schema: obj.schema.clone(),
8909                    table: obj.name.clone(),
8910                },
8911                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
8912                GrantObjectKind::Database => Resource::Database,
8913                GrantObjectKind::Function => Resource::Function {
8914                    schema: obj.schema.clone(),
8915                    name: obj.name.clone(),
8916                },
8917            };
8918            for principal in &stmt.principals {
8919                let p = match principal {
8920                    GrantPrincipalRef::Public => GrantPrincipal::Public,
8921                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
8922                    GrantPrincipalRef::User { tenant, name } => {
8923                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
8924                    }
8925                };
8926                // Tenant of the grant follows the granter's tenant
8927                // (cross-tenant guard inside `AuthStore::grant`).
8928                let tenant = granter.tenant.clone();
8929                auth_store
8930                    .grant(
8931                        &granter,
8932                        granter_role,
8933                        p.clone(),
8934                        resource.clone(),
8935                        actions.clone(),
8936                        stmt.with_grant_option,
8937                        tenant.clone(),
8938                    )
8939                    .map_err(|e| RedDBError::Query(e.to_string()))?;
8940
8941                // IAM policy translation: every GRANT also lands as a
8942                // synthetic `_grant_<id>` policy attached to the
8943                // principal so the new evaluator sees it.
8944                if let Some(policy) =
8945                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
8946                {
8947                    let pid = policy.id.clone();
8948                    auth_store
8949                        .put_policy_internal(policy)
8950                        .map_err(|e| RedDBError::Query(e.to_string()))?;
8951                    let attachment = match &p {
8952                        GrantPrincipal::User(uid) => {
8953                            crate::auth::store::PrincipalRef::User(uid.clone())
8954                        }
8955                        GrantPrincipal::Group(group) => {
8956                            crate::auth::store::PrincipalRef::Group(group.clone())
8957                        }
8958                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
8959                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
8960                        ),
8961                    };
8962                    auth_store
8963                        .attach_policy(attachment, &pid)
8964                        .map_err(|e| RedDBError::Query(e.to_string()))?;
8965                }
8966                applied += 1;
8967                tracing::info!(
8968                    target: "audit",
8969                    principal = %granter,
8970                    action = "grant",
8971                    "GRANT applied"
8972                );
8973            }
8974        }
8975
8976        self.invalidate_result_cache();
8977        Ok(RuntimeQueryResult::ok_message(
8978            query.to_string(),
8979            &format!("GRANT applied to {} target(s)", applied),
8980            "grant",
8981        ))
8982    }
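    // Usage sketch for `execute_grant_statement` above (illustrative
    // surface syntax; the exact grammar is whatever builds `GrantStmt`):
    //
    //     GRANT SELECT, INSERT ON TABLE public.users TO alice;
    //
    // applies one legacy grant per (object, principal) pair, mirrors
    // each as a synthetic `_grant_*` IAM policy attached to the
    // principal, then invalidates the result cache.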
8983
8984    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
8985    fn execute_revoke_statement(
8986        &self,
8987        query: &str,
8988        stmt: &crate::storage::query::ast::RevokeStmt,
8989    ) -> RedDBResult<RuntimeQueryResult> {
8990        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
8991        use crate::auth::UserId;
8992        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
8993
8994        let auth_store = self
8995            .inner
8996            .auth_store
8997            .read()
8998            .clone()
8999            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9000
9001        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
9002            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
9003        })?;
9004        let granter_role = grole;
9005
        let actions: Vec<Action> = if stmt.all {
            vec![Action::All]
        } else {
            // Mirror GRANT: reject unknown privilege keywords instead
            // of silently degrading them to `Select`.
            stmt.actions
                .iter()
                .map(|kw| {
                    Action::from_keyword(kw).ok_or_else(|| {
                        RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
                    })
                })
                .collect::<Result<_, _>>()?
        };
9014
9015        let mut total_removed = 0usize;
9016        for obj in &stmt.objects {
9017            let resource = match stmt.object_kind {
9018                GrantObjectKind::Table => Resource::Table {
9019                    schema: obj.schema.clone(),
9020                    table: obj.name.clone(),
9021                },
9022                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
9023                GrantObjectKind::Database => Resource::Database,
9024                GrantObjectKind::Function => Resource::Function {
9025                    schema: obj.schema.clone(),
9026                    name: obj.name.clone(),
9027                },
9028            };
9029            for principal in &stmt.principals {
9030                let p = match principal {
9031                    GrantPrincipalRef::Public => GrantPrincipal::Public,
9032                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
9033                    GrantPrincipalRef::User { tenant, name } => {
9034                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
9035                    }
9036                };
9037                let removed = auth_store
9038                    .revoke(granter_role, &p, &resource, &actions)
9039                    .map_err(|e| RedDBError::Query(e.to_string()))?;
9040                let _removed_policies =
9041                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
9042                total_removed += removed;
9043            }
9044        }
9045
9046        self.invalidate_result_cache();
9047        Ok(RuntimeQueryResult::ok_message(
9048            query.to_string(),
9049            &format!("REVOKE removed {} grant(s)", total_removed),
9050            "revoke",
9051        ))
9052    }
9053
9054    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
9055    fn execute_alter_user_statement(
9056        &self,
9057        query: &str,
9058        stmt: &crate::storage::query::ast::AlterUserStmt,
9059    ) -> RedDBResult<RuntimeQueryResult> {
9060        use crate::auth::privileges::UserAttributes;
9061        use crate::auth::UserId;
9062        use crate::storage::query::ast::AlterUserAttribute;
9063
9064        let auth_store = self
9065            .inner
9066            .auth_store
9067            .read()
9068            .clone()
9069            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9070
9071        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
9072            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
9073        })?;
9074        if grole != crate::auth::Role::Admin {
9075            return Err(RedDBError::Query(
9076                "ALTER USER requires Admin role".to_string(),
9077            ));
9078        }
9079
9080        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
9081
        // Read the current attribute record once, fold every ALTER
        // clause into it, then persist the result with a single write.
9084        let mut attrs = auth_store.user_attributes(&target);
9085        let mut enable_change: Option<bool> = None;
9086
9087        for a in &stmt.attributes {
9088            match a {
9089                AlterUserAttribute::ValidUntil(ts) => {
9090                    // Parse ISO-ish timestamp → ms since epoch. Fall
9091                    // back to integer-ms parsing for callers that pass
9092                    // `'1234567890123'`.
9093                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
9094                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
9095                    })?;
9096                    attrs.valid_until = Some(ms);
9097                }
9098                AlterUserAttribute::ConnectionLimit(n) => {
9099                    if *n < 0 {
9100                        return Err(RedDBError::Query(
9101                            "CONNECTION LIMIT must be non-negative".to_string(),
9102                        ));
9103                    }
9104                    attrs.connection_limit = Some(*n as u32);
9105                }
9106                AlterUserAttribute::SetSearchPath(p) => {
9107                    attrs.search_path = Some(p.clone());
9108                }
9109                AlterUserAttribute::AddGroup(g) => {
9110                    if !attrs.groups.iter().any(|existing| existing == g) {
9111                        attrs.groups.push(g.clone());
9112                        attrs.groups.sort();
9113                    }
9114                }
9115                AlterUserAttribute::DropGroup(g) => {
9116                    attrs.groups.retain(|existing| existing != g);
9117                }
9118                AlterUserAttribute::Enable => enable_change = Some(true),
9119                AlterUserAttribute::Disable => enable_change = Some(false),
9120                AlterUserAttribute::Password(_) => {
                    // Out of scope: accept the AST but treat it as a
                    // no-op so the parser stays compatible with future
                    // password-rotation work.
9124                }
9125            }
9126        }
9127
9128        auth_store
9129            .set_user_attributes(&target, attrs)
9130            .map_err(|e| RedDBError::Query(e.to_string()))?;
9131        if let Some(en) = enable_change {
9132            auth_store
9133                .set_user_enabled(&target, en)
9134                .map_err(|e| RedDBError::Query(e.to_string()))?;
9135        }
9136        self.invalidate_result_cache();
9137        tracing::info!(
9138            target: "audit",
9139            principal = %target,
9140            action = "alter_user",
9141            "ALTER USER applied"
9142        );
9143
9144        Ok(RuntimeQueryResult::ok_message(
9145            query.to_string(),
9146            &format!("ALTER USER {} applied", target),
9147            "alter_user",
9148        ))
9149    }
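    // Usage sketch for `execute_alter_user_statement` above
    // (illustrative clause spellings; the parser's exact keywords are
    // whatever builds `AlterUserAttribute`):
    //
    //     ALTER USER alice VALID UNTIL '2026-01-01T00:00:00Z'
    //                      CONNECTION LIMIT 10;
    //
    // Every clause folds into one attribute record before the single
    // `set_user_attributes` write; ENABLE / DISABLE is applied last via
    // `set_user_enabled`.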
9150
9151    // -----------------------------------------------------------------
9152    // IAM policy executors
9153    // -----------------------------------------------------------------
9154
9155    fn execute_create_iam_policy(
9156        &self,
9157        query: &str,
9158        id: &str,
9159        json: &str,
9160    ) -> RedDBResult<RuntimeQueryResult> {
9161        use crate::auth::policies::Policy;
9162
9163        let auth_store = self
9164            .inner
9165            .auth_store
9166            .read()
9167            .clone()
9168            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9169
9170        // Parse + validate. The kernel rejects oversize / bad shape /
9171        // bad action keywords. If the supplied id differs from the JSON
9172        // id, override it with the SQL-provided id (the JSON id is
9173        // optional context — the SQL DDL form is authoritative).
9174        let mut policy = Policy::from_json_str(json)
9175            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
9176        if policy.id != id {
9177            policy.id = id.to_string();
9178        }
9179        let pid = policy.id.clone();
9180        auth_store
9181            .put_policy(policy)
9182            .map_err(|e| RedDBError::Query(e.to_string()))?;
9183
9184        let principal = current_auth_identity()
9185            .map(|(u, _)| u)
9186            .unwrap_or_else(|| "anonymous".into());
9187        tracing::info!(
9188            target: "audit",
9189            principal = %principal,
9190            action = "iam:policy.put",
9191            matched_policy_id = %pid,
9192            "CREATE POLICY applied"
9193        );
9194        self.inner.audit_log.record(
9195            "iam/policy.put",
9196            &principal,
9197            &pid,
9198            "ok",
9199            crate::json::Value::Null,
9200        );
9201
9202        self.invalidate_result_cache();
9203        Ok(RuntimeQueryResult::ok_message(
9204            query.to_string(),
9205            &format!("policy `{pid}` stored"),
9206            "create_iam_policy",
9207        ))
9208    }
9209
9210    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
9211        let auth_store = self
9212            .inner
9213            .auth_store
9214            .read()
9215            .clone()
9216            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9217        auth_store
9218            .delete_policy(id)
9219            .map_err(|e| RedDBError::Query(e.to_string()))?;
9220
9221        let principal = current_auth_identity()
9222            .map(|(u, _)| u)
9223            .unwrap_or_else(|| "anonymous".into());
9224        tracing::info!(
9225            target: "audit",
9226            principal = %principal,
9227            action = "iam:policy.drop",
9228            matched_policy_id = %id,
9229            "DROP POLICY applied"
9230        );
9231        self.inner.audit_log.record(
9232            "iam/policy.drop",
9233            &principal,
9234            id,
9235            "ok",
9236            crate::json::Value::Null,
9237        );
9238
9239        self.invalidate_result_cache();
9240        Ok(RuntimeQueryResult::ok_message(
9241            query.to_string(),
9242            &format!("policy `{id}` dropped"),
9243            "drop_iam_policy",
9244        ))
9245    }
9246
9247    fn execute_attach_policy(
9248        &self,
9249        query: &str,
9250        policy_id: &str,
9251        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9252    ) -> RedDBResult<RuntimeQueryResult> {
9253        use crate::auth::store::PrincipalRef;
9254        use crate::auth::UserId;
9255        use crate::storage::query::ast::PolicyPrincipalRef;
9256
9257        let auth_store = self
9258            .inner
9259            .auth_store
9260            .read()
9261            .clone()
9262            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9263        let p = match principal {
9264            PolicyPrincipalRef::User(u) => {
9265                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9266            }
9267            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9268        };
9269        let pretty_target = principal_label(principal);
9270        auth_store
9271            .attach_policy(p, policy_id)
9272            .map_err(|e| RedDBError::Query(e.to_string()))?;
9273
9274        let principal_str = current_auth_identity()
9275            .map(|(u, _)| u)
9276            .unwrap_or_else(|| "anonymous".into());
9277        tracing::info!(
9278            target: "audit",
9279            principal = %principal_str,
9280            action = "iam:policy.attach",
9281            matched_policy_id = %policy_id,
9282            target = %pretty_target,
9283            "ATTACH POLICY applied"
9284        );
9285        self.inner.audit_log.record(
9286            "iam/policy.attach",
9287            &principal_str,
9288            &pretty_target,
9289            "ok",
9290            crate::json::Value::Null,
9291        );
9292
9293        self.invalidate_result_cache();
9294        Ok(RuntimeQueryResult::ok_message(
9295            query.to_string(),
9296            &format!("policy `{policy_id}` attached to {pretty_target}"),
9297            "attach_policy",
9298        ))
9299    }
9300
9301    fn execute_detach_policy(
9302        &self,
9303        query: &str,
9304        policy_id: &str,
9305        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9306    ) -> RedDBResult<RuntimeQueryResult> {
9307        use crate::auth::store::PrincipalRef;
9308        use crate::auth::UserId;
9309        use crate::storage::query::ast::PolicyPrincipalRef;
9310
9311        let auth_store = self
9312            .inner
9313            .auth_store
9314            .read()
9315            .clone()
9316            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9317        let p = match principal {
9318            PolicyPrincipalRef::User(u) => {
9319                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9320            }
9321            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9322        };
9323        let pretty_target = principal_label(principal);
9324        auth_store
9325            .detach_policy(p, policy_id)
9326            .map_err(|e| RedDBError::Query(e.to_string()))?;
9327
9328        let principal_str = current_auth_identity()
9329            .map(|(u, _)| u)
9330            .unwrap_or_else(|| "anonymous".into());
9331        tracing::info!(
9332            target: "audit",
9333            principal = %principal_str,
9334            action = "iam:policy.detach",
9335            matched_policy_id = %policy_id,
9336            target = %pretty_target,
9337            "DETACH POLICY applied"
9338        );
9339        self.inner.audit_log.record(
9340            "iam/policy.detach",
9341            &principal_str,
9342            &pretty_target,
9343            "ok",
9344            crate::json::Value::Null,
9345        );
9346
9347        self.invalidate_result_cache();
9348        Ok(RuntimeQueryResult::ok_message(
9349            query.to_string(),
9350            &format!("policy `{policy_id}` detached from {pretty_target}"),
9351            "detach_policy",
9352        ))
9353    }
9354
9355    fn execute_show_policies(
9356        &self,
9357        query: &str,
9358        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
9359    ) -> RedDBResult<RuntimeQueryResult> {
9360        use crate::auth::UserId;
9361        use crate::storage::query::ast::PolicyPrincipalRef;
9362        use crate::storage::query::unified::UnifiedRecord;
9363        use crate::storage::schema::Value as SchemaValue;
9364        use std::sync::Arc;
9365
9366        let auth_store = self
9367            .inner
9368            .auth_store
9369            .read()
9370            .clone()
9371            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9372
9373        let pols = match filter {
9374            None => auth_store.list_policies(),
9375            Some(PolicyPrincipalRef::User(u)) => {
9376                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
9377                auth_store.effective_policies(&id)
9378            }
9379            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
9380        };
9381
9382        let mut records = Vec::with_capacity(pols.len());
9383        for p in pols.iter() {
9384            let mut rec = UnifiedRecord::default();
9385            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
9386            rec.set_arc(
9387                Arc::from("statements"),
9388                SchemaValue::Integer(p.statements.len() as i64),
9389            );
9390            rec.set_arc(
9391                Arc::from("tenant"),
9392                p.tenant
9393                    .as_deref()
9394                    .map(|t| SchemaValue::text(t.to_string()))
9395                    .unwrap_or(SchemaValue::Null),
9396            );
9397            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
9398            records.push(rec);
9399        }
9400        let mut result = crate::storage::query::unified::UnifiedResult::empty();
9401        result.records = records;
9402        Ok(RuntimeQueryResult {
9403            query: query.to_string(),
9404            mode: crate::storage::query::modes::QueryMode::Sql,
9405            statement: "show_policies",
9406            engine: "iam-policies",
9407            result,
9408            affected_rows: 0,
9409            statement_type: "select",
9410        })
9411    }
9412
9413    fn execute_show_effective_permissions(
9414        &self,
9415        query: &str,
9416        user: &crate::storage::query::ast::PolicyUserRef,
9417        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
9418    ) -> RedDBResult<RuntimeQueryResult> {
9419        use crate::auth::UserId;
9420        use crate::storage::query::unified::UnifiedRecord;
9421        use crate::storage::schema::Value as SchemaValue;
9422        use std::sync::Arc;
9423
9424        let auth_store = self
9425            .inner
9426            .auth_store
9427            .read()
9428            .clone()
9429            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9430        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
9431        let pols = auth_store.effective_policies(&id);
9432
        // Show one row per (policy, statement) tuple. A resource-level
        // filter, when supplied, is accepted but not yet applied: the
        // conservative default includes every statement (fine-grained
        // matching is SIMULATE's job).
        let mut records = Vec::new();
        for p in pols.iter() {
            for (idx, st) in p.statements.iter().enumerate() {
                if let Some(_r) = resource {
                    // Intentional no-op; see the note above.
                }
9443                let mut rec = UnifiedRecord::default();
9444                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
9445                rec.set_arc(
9446                    Arc::from("statement_index"),
9447                    SchemaValue::Integer(idx as i64),
9448                );
9449                rec.set_arc(
9450                    Arc::from("sid"),
9451                    st.sid
9452                        .as_deref()
9453                        .map(|s| SchemaValue::text(s.to_string()))
9454                        .unwrap_or(SchemaValue::Null),
9455                );
9456                rec.set_arc(
9457                    Arc::from("effect"),
9458                    SchemaValue::text(match st.effect {
9459                        crate::auth::policies::Effect::Allow => "allow",
9460                        crate::auth::policies::Effect::Deny => "deny",
9461                    }),
9462                );
9463                rec.set_arc(
9464                    Arc::from("actions"),
9465                    SchemaValue::Integer(st.actions.len() as i64),
9466                );
9467                rec.set_arc(
9468                    Arc::from("resources"),
9469                    SchemaValue::Integer(st.resources.len() as i64),
9470                );
9471                records.push(rec);
9472            }
9473        }
9474        let mut result = crate::storage::query::unified::UnifiedResult::empty();
9475        result.records = records;
9476        Ok(RuntimeQueryResult {
9477            query: query.to_string(),
9478            mode: crate::storage::query::modes::QueryMode::Sql,
9479            statement: "show_effective_permissions",
9480            engine: "iam-policies",
9481            result,
9482            affected_rows: 0,
9483            statement_type: "select",
9484        })
9485    }
9486
9487    fn execute_simulate_policy(
9488        &self,
9489        query: &str,
9490        user: &crate::storage::query::ast::PolicyUserRef,
9491        action: &str,
9492        resource: &crate::storage::query::ast::PolicyResourceRef,
9493    ) -> RedDBResult<RuntimeQueryResult> {
9494        use crate::auth::policies::ResourceRef;
9495        use crate::auth::store::SimCtx;
9496        use crate::auth::UserId;
9497        use crate::storage::query::unified::UnifiedRecord;
9498        use crate::storage::schema::Value as SchemaValue;
9499        use std::sync::Arc;
9500
9501        let auth_store = self
9502            .inner
9503            .auth_store
9504            .read()
9505            .clone()
9506            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9507        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
9508        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
9509        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
9510
9511        let principal_str = current_auth_identity()
9512            .map(|(u, _)| u)
9513            .unwrap_or_else(|| "anonymous".into());
9514        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
9515        tracing::info!(
9516            target: "audit",
9517            principal = %principal_str,
9518            action = "iam:policy.simulate",
9519            decision = %decision_str,
9520            matched_policy_id = ?matched_pid,
9521            matched_sid = ?matched_sid,
9522            "SIMULATE issued"
9523        );
9524        self.inner.audit_log.record(
9525            "iam/policy.simulate",
9526            &principal_str,
9527            &id.to_string(),
9528            "ok",
9529            crate::json::Value::Null,
9530        );
9531
9532        let mut rec = UnifiedRecord::default();
9533        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
9534        rec.set_arc(
9535            Arc::from("matched_policy_id"),
9536            matched_pid
9537                .map(SchemaValue::text)
9538                .unwrap_or(SchemaValue::Null),
9539        );
9540        rec.set_arc(
9541            Arc::from("matched_sid"),
9542            matched_sid
9543                .map(SchemaValue::text)
9544                .unwrap_or(SchemaValue::Null),
9545        );
9546        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
9547        rec.set_arc(
9548            Arc::from("trail_len"),
9549            SchemaValue::Integer(outcome.trail.len() as i64),
9550        );
9551        let mut result = crate::storage::query::unified::UnifiedResult::empty();
9552        result.records = vec![rec];
9553        Ok(RuntimeQueryResult {
9554            query: query.to_string(),
9555            mode: crate::storage::query::modes::QueryMode::Sql,
9556            statement: "simulate_policy",
9557            engine: "iam-policies",
9558            result,
9559            affected_rows: 0,
9560            statement_type: "select",
9561        })
9562    }
    }
}
9564
9565/// Translate a parsed GRANT into a synthetic IAM policy whose id
9566/// starts with `_grant_<unique>`. PUBLIC is represented as an
9567/// implicit IAM group; legacy GROUP grants are still rejected by the
9568/// grant store and are not translated here.
9569fn grant_to_iam_policy(
9570    principal: &crate::auth::privileges::GrantPrincipal,
9571    resource: &crate::auth::privileges::Resource,
9572    actions: &[crate::auth::privileges::Action],
9573    tenant: Option<&str>,
9574) -> Option<crate::auth::policies::Policy> {
9575    use crate::auth::policies::{
9576        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
9577    };
9578    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
9579
9580    if matches!(principal, GrantPrincipal::Group(_)) {
9581        return None;
9582    }
9583
    let now = crate::auth::now_ms();
    // Millisecond timestamps collide when one GRANT fans out to several
    // (object, principal) pairs in a single statement; mix in a
    // per-process sequence number so every synthetic id stays unique.
    static GRANT_SEQ: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
    let seq = GRANT_SEQ.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let id = format!("_grant_{:x}_{:x}_{:x}", now, std::process::id(), seq);
9586
9587    let resource_str = match resource {
9588        Resource::Database => "table:*".to_string(),
9589        Resource::Schema(s) => format!("table:{s}.*"),
9590        Resource::Table { schema, table } => match schema {
9591            Some(s) => format!("table:{s}.{table}"),
9592            None => format!("table:{table}"),
9593        },
9594        Resource::Function { schema, name } => match schema {
9595            Some(s) => format!("function:{s}.{name}"),
9596            None => format!("function:{name}"),
9597        },
9598    };
9599
9600    // Compile actions — fall back to `*` only when the grant included
9601    // `Action::All`. Map every other action keyword to its lowercase
9602    // form so it lines up with the kernel's allowlist.
9603    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
9604        vec![ActionPattern::Wildcard]
9605    } else {
9606        actions
9607            .iter()
9608            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
9609            .collect()
9610    };
9611    if action_patterns.is_empty() {
9612        return None;
9613    }
9614
9615    // Inline resource compilation matching the kernel's `compile_resource`:
9616    //   * `*` → wildcard
9617    //   * contains `*` → glob
9618    //   * `kind:name` → exact
9619    let resource_patterns = if resource_str == "*" {
9620        vec![ResourcePattern::Wildcard]
9621    } else if resource_str.contains('*') {
9622        vec![ResourcePattern::Glob(resource_str.clone())]
9623    } else if let Some((kind, name)) = resource_str.split_once(':') {
9624        vec![ResourcePattern::Exact {
9625            kind: kind.to_string(),
9626            name: name.to_string(),
9627        }]
9628    } else {
9629        vec![ResourcePattern::Wildcard]
9630    };
9631
9632    let policy = Policy {
9633        id,
9634        version: 1,
9635        tenant: tenant.map(|t| t.to_string()),
9636        created_at: now,
9637        updated_at: now,
9638        statements: vec![Statement {
9639            sid: None,
9640            effect: Effect::Allow,
9641            actions: action_patterns,
9642            resources: resource_patterns,
9643            condition: None,
9644        }],
9645    };
9646    if policy.validate().is_err() {
9647        return None;
9648    }
9649    Some(policy)
9650}
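// Worked example (illustrative; shapes follow the struct literal
// above): `GRANT SELECT ON TABLE public.users TO alice` maps to
//
//     Policy {
//         id: "_grant_<now>_<pid>_<seq>",
//         statements: vec![Statement {
//             effect: Effect::Allow,
//             actions: vec![compile_action("select")],
//             resources: vec![ResourcePattern::Exact {
//                 kind: "table".into(),
//                 name: "public.users".into(),
//             }],
//             sid: None,
//             condition: None,
//         }],
//         ..
//     }
//
// A schema-wide grant compiles to
// `ResourcePattern::Glob("table:<schema>.*")` instead.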
9651
9652fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
9653    use crate::auth::privileges::Action;
9654    match action {
9655        Action::Select => "select",
9656        Action::Insert => "insert",
9657        Action::Update => "update",
9658        Action::Delete => "delete",
9659        Action::Truncate => "truncate",
9660        Action::References => "references",
9661        Action::Execute => "execute",
9662        Action::Usage => "usage",
9663        Action::All => "*",
9664    }
9665}
9666
9667fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
9668    let mut columns = Vec::new();
9669    for (column, _) in &query.assignment_exprs {
9670        if !columns.iter().any(|seen| seen == column) {
9671            columns.push(column.clone());
9672        }
9673    }
9674    columns
9675}
9676
9677fn column_access_request_for_table_update(
9678    table_name: &str,
9679    columns: Vec<String>,
9680) -> crate::auth::ColumnAccessRequest {
9681    match table_name.split_once('.') {
9682        Some((schema, table)) => {
9683            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
9684                .with_schema(schema.to_string())
9685        }
9686        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
9687    }
9688}
9689
9690fn column_access_request_for_table_select(
9691    table_name: &str,
9692    columns: Vec<String>,
9693) -> crate::auth::ColumnAccessRequest {
9694    match table_name.split_once('.') {
9695        Some((schema, table)) => {
9696            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
9697                .with_schema(schema.to_string())
9698        }
9699        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
9700    }
9701}
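// Split examples for the two builders above: "app.users" yields a
// request on table `users` with schema `app`; a bare "users" yields an
// unqualified request. Only the first '.' splits, so "app.users.v2"
// becomes schema `app`, table `users.v2`.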
9702
9703fn update_returning_columns_for_policy(
9704    runtime: &RedDBRuntime,
9705    query: &crate::storage::query::ast::UpdateQuery,
9706) -> Option<Vec<String>> {
9707    let items = query.returning.as_ref()?;
9708    let mut columns = Vec::new();
9709    let project_all = items
9710        .iter()
9711        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
9712    if project_all {
9713        collect_returning_star_columns(runtime, query, &mut columns);
9714    } else {
9715        for item in items {
9716            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
9717                continue;
9718            };
9719            push_returning_policy_column(&mut columns, column);
9720        }
9721    }
9722    (!columns.is_empty()).then_some(columns)
9723}
9724
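/// Enumerate the columns `RETURNING *` would expose: the declared
/// column schema plus every field observed on entities that match the
/// update target. This walks the whole collection via `query_all`, so
/// wildcard RETURNING on a large collection pays a full scan here.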
9725fn collect_returning_star_columns(
9726    runtime: &RedDBRuntime,
9727    query: &crate::storage::query::ast::UpdateQuery,
9728    columns: &mut Vec<String>,
9729) {
9730    let store = runtime.db().store();
9731    let Some(manager) = store.get_collection(&query.table) else {
9732        return;
9733    };
9734    if let Some(schema) = manager.column_schema() {
9735        for column in schema.iter() {
9736            push_returning_policy_column(columns, column);
9737        }
9738    }
9739    for entity in manager.query_all(|_| true) {
9740        if !returning_entity_matches_update_target(&entity, query.target) {
9741            continue;
9742        }
9743        match &entity.data {
9744            crate::storage::EntityData::Row(row) => {
9745                for (column, _) in row.iter_fields() {
9746                    push_returning_policy_column(columns, column);
9747                }
9748            }
9749            crate::storage::EntityData::Node(node) => {
9750                push_returning_policy_column(columns, "label");
9751                push_returning_policy_column(columns, "node_type");
9752                for column in node.properties.keys() {
9753                    push_returning_policy_column(columns, column);
9754                }
9755            }
9756            crate::storage::EntityData::Edge(edge) => {
9757                push_returning_policy_column(columns, "label");
9758                push_returning_policy_column(columns, "from_rid");
9759                push_returning_policy_column(columns, "to_rid");
9760                push_returning_policy_column(columns, "weight");
9761                for column in edge.properties.keys() {
9762                    push_returning_policy_column(columns, column);
9763                }
9764            }
9765            _ => {}
9766        }
9767    }
9768}
9769
9770fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
9771    if returning_public_envelope_column(column) {
9772        return;
9773    }
9774    if !columns.iter().any(|seen| seen == column) {
9775        columns.push(column.to_string());
9776    }
9777}
9778
9779fn returning_public_envelope_column(column: &str) -> bool {
9780    matches!(
9781        column.to_ascii_lowercase().as_str(),
9782        "rid" | "collection" | "kind" | "tenant" | "created_at" | "updated_at" | "red_entity_id"
9783    )
9784}
9785
9786fn returning_entity_matches_update_target(
9787    entity: &crate::storage::UnifiedEntity,
9788    target: crate::storage::query::ast::UpdateTarget,
9789) -> bool {
9790    use crate::storage::query::ast::UpdateTarget;
9791    match target {
9792        UpdateTarget::Rows => {
9793            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
9794        }
9795        UpdateTarget::Documents => {
9796            matches!(
9797                returning_row_item_kind(entity),
9798                Some(ReturningRowKind::Document)
9799            )
9800        }
9801        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
9802        UpdateTarget::Nodes => matches!(
9803            (&entity.kind, &entity.data),
9804            (
9805                crate::storage::EntityKind::GraphNode(_),
9806                crate::storage::EntityData::Node(_)
9807            )
9808        ),
9809        UpdateTarget::Edges => matches!(
9810            (&entity.kind, &entity.data),
9811            (
9812                crate::storage::EntityKind::GraphEdge(_),
9813                crate::storage::EntityData::Edge(_)
9814            )
9815        ),
9816    }
9817}
9818
9819#[derive(Debug, Clone, Copy, PartialEq, Eq)]
9820enum ReturningRowKind {
9821    Row,
9822    Document,
9823    Kv,
9824}
9825
9826fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
9827    let row = entity.data.as_row()?;
9828    let is_kv = row.iter_fields().all(|(column, _)| {
9829        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
9830    });
9831    if is_kv {
9832        return Some(ReturningRowKind::Kv);
9833    }
9834    let is_document = row
9835        .iter_fields()
9836        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
9837    if is_document {
9838        Some(ReturningRowKind::Document)
9839    } else {
9840        Some(ReturningRowKind::Row)
9841    }
9842}
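// Classification sketch for the heuristics above: a row whose columns
// are only `key` / `value` is Kv; any row carrying a `Value::Json`
// field is Document; everything else is a plain Row. Non-row entities
// (nodes, edges) return `None` and are matched by entity kind instead.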
9843
9844fn requested_table_columns_for_policy(
9845    table: &crate::storage::query::ast::TableQuery,
9846) -> Vec<String> {
9847    use crate::storage::query::sql_lowering::{
9848        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
9849        effective_table_projections,
9850    };
9851
9852    let table_name = table.table.as_str();
9853    let table_alias = table.alias.as_deref();
9854    let mut columns = std::collections::BTreeSet::new();
9855
9856    for projection in effective_table_projections(table) {
9857        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
9858    }
9859    if let Some(filter) = effective_table_filter(table) {
9860        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
9861    }
9862    for expr in effective_table_group_by_exprs(table) {
9863        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
9864    }
9865    if let Some(filter) = effective_table_having_filter(table) {
9866        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
9867    }
9868    for order in &table.order_by {
9869        if let Some(expr) = order.expr.as_ref() {
9870            collect_expr_columns(expr, table_name, table_alias, &mut columns);
9871        } else {
9872            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
9873        }
9874    }
9875
9876    columns.into_iter().collect()
9877}
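// Worked example (sketch, unqualified column refs assumed):
//
//     SELECT customer, SUM(total) FROM orders
//     WHERE status = 'open' ORDER BY customer
//
// gathers {"customer", "status", "total"}: every column referenced by
// projections, WHERE, GROUP BY, HAVING, and ORDER BY, deduplicated
// through the BTreeSet. A bare `SELECT *` contributes the sentinel "*".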
9878
9879fn collect_projection_columns(
9880    projection: &crate::storage::query::ast::Projection,
9881    table_name: &str,
9882    table_alias: Option<&str>,
9883    columns: &mut std::collections::BTreeSet<String>,
9884) {
9885    use crate::storage::query::ast::Projection;
9886    match projection {
9887        Projection::All => {
9888            columns.insert("*".to_string());
9889        }
9890        Projection::Column(column) | Projection::Alias(column, _) => {
9891            if column != "*" {
9892                columns.insert(column.clone());
9893            }
9894        }
9895        Projection::Function(_, args) => {
9896            for arg in args {
9897                collect_projection_columns(arg, table_name, table_alias, columns);
9898            }
9899        }
9900        Projection::Expression(filter, _) => {
9901            collect_filter_columns(filter, table_name, table_alias, columns);
9902        }
9903        Projection::Field(field, _) => {
9904            collect_field_ref_column(field, table_name, table_alias, columns);
9905        }
9906    }
9907}
9908
9909fn collect_filter_columns(
9910    filter: &crate::storage::query::ast::Filter,
9911    table_name: &str,
9912    table_alias: Option<&str>,
9913    columns: &mut std::collections::BTreeSet<String>,
9914) {
9915    use crate::storage::query::ast::Filter;
9916    match filter {
9917        Filter::Compare { field, .. }
        | Filter::IsNull(field)
        | Filter::IsNotNull(field)
        | Filter::In { field, .. }
        | Filter::Between { field, .. }
        | Filter::Like { field, .. }
        | Filter::StartsWith { field, .. }
        | Filter::EndsWith { field, .. }
        | Filter::Contains { field, .. } => {
            collect_field_ref_column(field, table_name, table_alias, columns);
        }
        Filter::CompareFields { left, right, .. } => {
            collect_field_ref_column(left, table_name, table_alias, columns);
            collect_field_ref_column(right, table_name, table_alias, columns);
        }
        Filter::CompareExpr { lhs, rhs, .. } => {
            collect_expr_columns(lhs, table_name, table_alias, columns);
            collect_expr_columns(rhs, table_name, table_alias, columns);
        }
        Filter::And(left, right) | Filter::Or(left, right) => {
            collect_filter_columns(left, table_name, table_alias, columns);
            collect_filter_columns(right, table_name, table_alias, columns);
        }
        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
    }
}

/// Recursively gather every column referenced by `expr` into `columns`,
/// resolving qualifiers against `table_name` / `table_alias`. Literals,
/// parameters, and subqueries contribute nothing.
fn collect_expr_columns(
    expr: &crate::storage::query::ast::Expr,
    table_name: &str,
    table_alias: Option<&str>,
    columns: &mut std::collections::BTreeSet<String>,
) {
    use crate::storage::query::ast::Expr;
    match expr {
        Expr::Column { field, .. } => {
            collect_field_ref_column(field, table_name, table_alias, columns);
        }
        Expr::Literal { .. } | Expr::Parameter { .. } => {}
        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
            collect_expr_columns(operand, table_name, table_alias, columns);
        }
        Expr::BinaryOp { lhs, rhs, .. } => {
            collect_expr_columns(lhs, table_name, table_alias, columns);
            collect_expr_columns(rhs, table_name, table_alias, columns);
        }
        Expr::FunctionCall { args, .. } => {
            for arg in args {
                collect_expr_columns(arg, table_name, table_alias, columns);
            }
        }
        Expr::Case {
            branches, else_, ..
        } => {
            for (condition, value) in branches {
                collect_expr_columns(condition, table_name, table_alias, columns);
                collect_expr_columns(value, table_name, table_alias, columns);
            }
            if let Some(value) = else_ {
                collect_expr_columns(value, table_name, table_alias, columns);
            }
        }
        Expr::IsNull { operand, .. } => {
            collect_expr_columns(operand, table_name, table_alias, columns);
        }
        Expr::InList { target, values, .. } => {
            collect_expr_columns(target, table_name, table_alias, columns);
            for value in values {
                collect_expr_columns(value, table_name, table_alias, columns);
            }
        }
        Expr::Between {
            target, low, high, ..
        } => {
            collect_expr_columns(target, table_name, table_alias, columns);
            collect_expr_columns(low, table_name, table_alias, columns);
            collect_expr_columns(high, table_name, table_alias, columns);
        }
        Expr::Subquery { .. } => {}
    }
}

/// Insert the policy-relevant column name for `field` into `columns`,
/// skipping the `*` wildcard marker.
fn collect_field_ref_column(
    field: &crate::storage::query::ast::FieldRef,
    table_name: &str,
    table_alias: Option<&str>,
    columns: &mut std::collections::BTreeSet<String>,
) {
    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
        if column != "*" {
            columns.insert(column);
        }
    }
}

/// Resolve a `FieldRef` to the name used for column-policy checks:
/// unqualified references and qualifiers matching `table_name` or
/// `table_alias` collapse to the bare column, foreign qualifiers keep a
/// `table.column` prefix, `*` passes through, and non-table refs yield
/// `None`.
fn policy_column_name_from_field_ref(
    field: &crate::storage::query::ast::FieldRef,
    table_name: &str,
    table_alias: Option<&str>,
) -> Option<String> {
    match field {
        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
            if column == "*" {
                return Some("*".to_string());
            }
            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
                Some(column.clone())
            } else {
                Some(format!("{table}.{column}"))
            }
        }
        _ => None,
    }
}
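
// A minimal sketch of the resolution rules above (hypothetical fixture
// values, not from the real test suite): a qualifier matching the scanned
// table or its alias collapses to the bare column; a foreign qualifier
// keeps its prefix so join-side checks can route it.
#[cfg(test)]
mod policy_column_name_from_field_ref_sketch {
    use super::*;
    use crate::storage::query::ast::FieldRef;

    #[test]
    fn qualifier_resolution() {
        let own = FieldRef::TableColumn {
            table: "u".into(),
            column: "email".into(),
        };
        assert_eq!(
            policy_column_name_from_field_ref(&own, "users", Some("u")),
            Some("email".into())
        );

        let foreign = FieldRef::TableColumn {
            table: "orders".into(),
            column: "total".into(),
        };
        assert_eq!(
            policy_column_name_from_field_ref(&foreign, "users", Some("u")),
            Some("orders.total".into())
        );
    }
}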

/// Translate a legacy-privilege `Resource` into the IAM `ResourceRef`
/// shape, optionally tagging it with the caller's tenant.
fn legacy_resource_to_iam(
    resource: &crate::auth::privileges::Resource,
    tenant: Option<&str>,
) -> crate::auth::policies::ResourceRef {
    use crate::auth::privileges::Resource;

    let (kind, name) = match resource {
        Resource::Database => ("database".to_string(), "*".to_string()),
        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
        Resource::Table { schema, table } => (
            "table".to_string(),
            match schema {
                Some(s) => format!("{s}.{table}"),
                None => table.clone(),
            },
        ),
        Resource::Function { schema, name } => (
            "function".to_string(),
            match schema {
                Some(s) => format!("{s}.{name}"),
                None => name.clone(),
            },
        ),
    };

    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
    if let Some(t) = tenant {
        out = out.with_tenant(t.to_string());
    }
    out
}
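
// Hypothetical worked example (names invented for illustration): a
// schema-qualified table maps to kind "table" with a dotted name, and the
// tenant tag rides along when one is bound. Shown as a comment trace since
// `ResourceRef`'s accessors live outside this file.
//
//   legacy_resource_to_iam(
//       &Resource::Table { schema: Some("app".into()), table: "users".into() },
//       Some("acme"),
//   )
//   // kind = "table", name = "app.users", tenant = Some("acme")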

/// One side of a join resolved to its base table plus the alias used to
/// qualify columns (the alias falls back to the table name).
#[derive(Debug)]
struct JoinTableSide {
    table: String,
    alias: String,
}

/// Extract the join-side context from a query expression; only plain
/// table scans qualify.
fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
    match expr {
        QueryExpr::Table(table) => Some(JoinTableSide {
            table: table.table.clone(),
            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
        }),
        _ => None,
    }
}

/// Collect the projected columns that belong to `table` (matched by name
/// or `alias`) into `out`. Unqualified columns are attributed to `table`;
/// columns qualified with another relation are skipped. See the sketch
/// after this function for the attribution rules in action.
fn collect_projection_columns_for_table(
    projection: &Projection,
    table: &str,
    alias: Option<&str>,
    out: &mut BTreeSet<String>,
) {
    match projection {
        Projection::Column(column) | Projection::Alias(column, _) => {
            match split_qualified_column(column) {
                Some((qualifier, column))
                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
                {
                    push_policy_column(column, out);
                }
                Some(_) => {}
                None => push_policy_column(column, out),
            }
        }
        Projection::Field(
            FieldRef::TableColumn {
                table: qualifier,
                column,
            },
            _,
        ) => {
            if qualifier.is_empty()
                || qualifier == table
                || alias.is_some_and(|alias| qualifier == alias)
            {
                push_policy_column(column, out);
            }
        }
        Projection::Field(
            FieldRef::NodeProperty {
                alias: qualifier,
                property,
            },
            _,
        )
        | Projection::Field(
            FieldRef::EdgeProperty {
                alias: qualifier,
                property,
            },
            _,
        ) => {
            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
                push_policy_column(property, out);
            }
        }
        Projection::Function(_, args) => {
            for arg in args {
                collect_projection_columns_for_table(arg, table, alias, out);
            }
        }
        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
    }
}
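
// Minimal sketch of the attribution rules (hypothetical fixture values):
// alias-qualified and unqualified projections land in `out`, foreign-
// qualified ones do not.
#[cfg(test)]
mod collect_projection_columns_for_table_sketch {
    use super::*;

    #[test]
    fn attributes_columns_to_the_scanned_table() {
        let mut out = BTreeSet::new();
        for projection in ["u.email", "name", "orders.total"] {
            collect_projection_columns_for_table(
                &Projection::Column(projection.into()),
                "users",
                Some("u"),
                &mut out,
            );
        }
        assert_eq!(
            out.into_iter().collect::<Vec<_>>(),
            vec!["email".to_string(), "name".to_string()]
        );
    }
}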

/// Like `collect_projection_columns_for_table`, but route each projected
/// column to whichever join side (`left` / `right`) owns its qualifier;
/// unqualified columns are conservatively charged to both sides.
fn collect_projection_columns_for_join_side(
    projection: &Projection,
    left: Option<&JoinTableSide>,
    right: Option<&JoinTableSide>,
    out: &mut HashMap<String, BTreeSet<String>>,
) -> RedDBResult<()> {
    match projection {
        Projection::Column(column) | Projection::Alias(column, _) => {
            if let Some((qualifier, column)) = split_qualified_column(column) {
                push_qualified_join_column(qualifier, column, left, right, out);
            } else {
                push_unqualified_join_column(column, left, right, out);
            }
        }
        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
            if table.is_empty() {
                push_unqualified_join_column(column, left, right, out);
            } else if let Some(side) = [left, right]
                .into_iter()
                .flatten()
                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
            {
                push_join_column(&side.table, column, out);
            }
        }
        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
            push_qualified_join_column(alias, property, left, right, out);
        }
        Projection::Function(_, args) => {
            for arg in args {
                collect_projection_columns_for_join_side(arg, left, right, out)?;
            }
        }
        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
    }
    Ok(())
}

/// Split a `qualifier.column` name into its two parts. Returns `None` for
/// unqualified names, empty parts, or names with more than one dot.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    let (qualifier, column) = column.split_once('.')?;
    if qualifier.is_empty() || column.is_empty() || column.contains('.') {
        return None;
    }
    Some((qualifier, column))
}
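
// Quick sketch of the accepted shapes (illustrative values only).
#[cfg(test)]
mod split_qualified_column_sketch {
    use super::*;

    #[test]
    fn accepts_single_dot_names_only() {
        assert_eq!(split_qualified_column("u.email"), Some(("u", "email")));
        assert_eq!(split_qualified_column("email"), None); // unqualified
        assert_eq!(split_qualified_column(".email"), None); // empty qualifier
        assert_eq!(split_qualified_column("a.b.c"), None); // nested path
    }
}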

fn push_qualified_join_column(
    qualifier: &str,
    column: &str,
    left: Option<&JoinTableSide>,
    right: Option<&JoinTableSide>,
    out: &mut HashMap<String, BTreeSet<String>>,
) {
    if let Some(side) = [left, right]
        .into_iter()
        .flatten()
        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
    {
        push_join_column(&side.table, column, out);
    }
}

fn push_unqualified_join_column(
    column: &str,
    left: Option<&JoinTableSide>,
    right: Option<&JoinTableSide>,
    out: &mut HashMap<String, BTreeSet<String>>,
) {
    for side in [left, right].into_iter().flatten() {
        push_join_column(&side.table, column, out);
    }
}

fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
    if is_policy_column_name(column) {
        out.entry(table.to_string())
            .or_default()
            .insert(column.to_string());
    }
}

fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
    if is_policy_column_name(column) {
        out.insert(column.to_string());
    }
}

/// True for names that can participate in column policies: non-empty,
/// not the `*` wildcard, and not a `LIT:` / `TYPE:` planner marker.
fn is_policy_column_name(column: &str) -> bool {
    !column.is_empty()
        && column != "*"
        && !column.starts_with("LIT:")
        && !column.starts_with("TYPE:")
}
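
// Sketch of the filter (illustrative values): planner markers and
// wildcards never reach the policy gate.
#[cfg(test)]
mod is_policy_column_name_sketch {
    use super::*;

    #[test]
    fn rejects_markers_and_wildcards() {
        assert!(is_policy_column_name("email"));
        assert!(!is_policy_column_name("*"));
        assert!(!is_policy_column_name(""));
        assert!(!is_policy_column_name("LIT:42"));
        assert!(!is_policy_column_name("TYPE:int"));
    }
}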

/// Build the IAM `EvalContext` for runtime policy checks: both tenant
/// slots are filled from the session tenant, no peer IP or MFA signal is
/// available at this layer, and admin bypass tracks the legacy `Role`.
fn runtime_iam_context(
    role: crate::auth::Role,
    tenant: Option<&str>,
) -> crate::auth::policies::EvalContext {
    crate::auth::policies::EvalContext {
        principal_tenant: tenant.map(|t| t.to_string()),
        current_tenant: tenant.map(|t| t.to_string()),
        peer_ip: None,
        mfa_present: false,
        now_ms: crate::auth::now_ms(),
        principal_is_admin_role: role == crate::auth::Role::Admin,
    }
}
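
// Minimal sketch (hypothetical tenant id): an admin role flips the bypass
// flag, and the tenant propagates into both slots.
#[cfg(test)]
mod runtime_iam_context_sketch {
    use super::*;

    #[test]
    fn admin_role_sets_bypass_flag() {
        let ctx = runtime_iam_context(crate::auth::Role::Admin, Some("acme"));
        assert!(ctx.principal_is_admin_role);
        assert_eq!(ctx.principal_tenant.as_deref(), Some("acme"));
        assert_eq!(ctx.current_tenant.as_deref(), Some("acme"));
    }
}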

/// Columns named explicitly in a table query's projection list, in
/// first-seen order. `SELECT *` and computed projections are not expanded
/// here (see the inline note below).
fn explicit_table_projection_columns(
    query: &crate::storage::query::ast::TableQuery,
) -> Vec<String> {
    use crate::storage::query::ast::{FieldRef, Projection};

    let mut columns = Vec::new();
    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
        match projection {
            Projection::Column(column) | Projection::Alias(column, _) => {
                push_unique(&mut columns, column)
            }
            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
                push_unique(&mut columns, column)
            }
            // SELECT * and expression/function projections need the
            // executor-wide column-policy context mapped in
            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
            _ => {}
        }
    }
    columns
}

/// Properties named explicitly in a graph query's RETURN clause, in
/// first-seen order. Non-property projections are ignored.
fn explicit_graph_projection_properties(
    query: &crate::storage::query::ast::GraphQuery,
) -> Vec<String> {
    use crate::storage::query::ast::{FieldRef, Projection};

    let mut columns = Vec::new();
    for projection in &query.return_ {
        match projection {
            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
                push_unique(&mut columns, property.clone())
            }
            _ => {}
        }
    }
    columns
}

/// Append `column` unless it is already present (order-preserving dedup).
fn push_unique(columns: &mut Vec<String>, column: String) {
    if !columns.iter().any(|existing| existing == &column) {
        columns.push(column);
    }
}

/// Render a policy principal as a display label: `user:{tenant}/{name}`,
/// `user:{name}`, or `group:{name}`.
fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
    use crate::storage::query::ast::PolicyPrincipalRef;
    match p {
        PolicyPrincipalRef::User(u) => match &u.tenant {
            Some(t) => format!("user:{t}/{}", u.username),
            None => format!("user:{}", u.username),
        },
        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
    }
}
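
// Illustrative label shapes (comment trace; the principal structs are
// defined in the ast module, so no fixtures are constructed here):
//
//   user with tenant    -> "user:acme/alice"
//   user without tenant -> "user:alice"
//   group               -> "group:admins"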

/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
/// shape used by every audit emit + the simulator response.
pub(crate) fn decision_to_strings(
    d: &crate::auth::policies::Decision,
) -> (String, Option<String>, Option<String>) {
    use crate::auth::policies::Decision;
    match d {
        Decision::Allow {
            matched_policy_id,
            matched_sid,
        } => (
            "allow".into(),
            Some(matched_policy_id.clone()),
            matched_sid.clone(),
        ),
        Decision::Deny {
            matched_policy_id,
            matched_sid,
        } => (
            "deny".into(),
            Some(matched_policy_id.clone()),
            matched_sid.clone(),
        ),
        Decision::DefaultDeny => ("default_deny".into(), None, None),
        Decision::AdminBypass => ("admin_bypass".into(), None, None),
    }
}
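
// Sketch of the rendered shapes (hypothetical policy id): Allow/Deny carry
// their matched policy, the two synthetic outcomes carry no ids.
#[cfg(test)]
mod decision_to_strings_sketch {
    use super::*;
    use crate::auth::policies::Decision;

    #[test]
    fn renders_all_shapes() {
        let (d, id, sid) = decision_to_strings(&Decision::Allow {
            matched_policy_id: "p-123".into(),
            matched_sid: None,
        });
        assert_eq!((d.as_str(), id.as_deref(), sid), ("allow", Some("p-123"), None));

        assert_eq!(
            decision_to_strings(&Decision::DefaultDeny),
            ("default_deny".into(), None, None)
        );
    }
}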

/// All relation names (table names and aliases) introduced by `query`,
/// sorted and deduplicated. These are the scopes a nested subquery may
/// reference to become correlated.
fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
    let mut scopes = Vec::new();
    collect_relation_scopes(query, &mut scopes);
    scopes.sort();
    scopes.dedup();
    scopes
}

fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
    match query {
        QueryExpr::Table(table) => {
            if !table.table.is_empty() {
                scopes.push(table.table.clone());
            }
            if let Some(alias) = &table.alias {
                scopes.push(alias.clone());
            }
        }
        QueryExpr::Join(join) => {
            collect_relation_scopes(&join.left, scopes);
            collect_relation_scopes(&join.right, scopes);
        }
        _ => {}
    }
}

/// Does `query` (a subquery body) reference any of `outer_scopes` — i.e.
/// is it correlated with its enclosing query? References to a scope the
/// subquery re-introduces itself do not count.
fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
    let inner_scopes = relation_scopes_for_query(query);
    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
}

fn query_expr_references_outer_scope(
    query: &QueryExpr,
    outer_scopes: &[String],
    inner_scopes: &[String],
) -> bool {
    match query {
        QueryExpr::Table(table) => {
            table.select_items.iter().any(|item| match item {
                crate::storage::query::ast::SelectItem::Wildcard => false,
                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                }
            }) || table
                .where_expr
                .as_ref()
                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
                || table.filter.as_ref().is_some_and(|filter| {
                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
                })
                || table.having_expr.as_ref().is_some_and(|expr| {
                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                })
                || table.having.as_ref().is_some_and(|filter| {
                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
                })
                || table
                    .group_by_exprs
                    .iter()
                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
                || table.order_by.iter().any(|clause| {
                    clause.expr.as_ref().is_some_and(|expr| {
                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                    })
                })
        }
        QueryExpr::Join(join) => {
            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
                || join.filter.as_ref().is_some_and(|filter| {
                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
                })
                || join.return_items.iter().any(|item| match item {
                    crate::storage::query::ast::SelectItem::Wildcard => false,
                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                    }
                })
        }
        _ => false,
    }
}

fn filter_references_outer_scope(
    filter: &crate::storage::query::ast::Filter,
    outer_scopes: &[String],
    inner_scopes: &[String],
) -> bool {
    use crate::storage::query::ast::Filter;
    match filter {
        Filter::Compare { field, .. }
        | Filter::IsNull(field)
        | Filter::IsNotNull(field)
        | Filter::In { field, .. }
        | Filter::Between { field, .. }
        | Filter::Like { field, .. }
        | Filter::StartsWith { field, .. }
        | Filter::EndsWith { field, .. }
        | Filter::Contains { field, .. } => {
            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
        }
        Filter::CompareFields { left, right, .. } => {
            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
        }
        Filter::CompareExpr { lhs, rhs, .. } => {
            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
        }
        Filter::And(left, right) | Filter::Or(left, right) => {
            filter_references_outer_scope(left, outer_scopes, inner_scopes)
                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
        }
        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
    }
}

fn expr_references_outer_scope(
    expr: &crate::storage::query::ast::Expr,
    outer_scopes: &[String],
    inner_scopes: &[String],
) -> bool {
    use crate::storage::query::ast::Expr;
    match expr {
        Expr::Column { field, .. } => {
            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
        }
        Expr::BinaryOp { lhs, rhs, .. } => {
            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
        }
        Expr::UnaryOp { operand, .. }
        | Expr::Cast { inner: operand, .. }
        | Expr::IsNull { operand, .. } => {
            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
        }
        Expr::FunctionCall { args, .. } => args
            .iter()
            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
        Expr::Case {
            branches, else_, ..
        } => {
            branches.iter().any(|(cond, value)| {
                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
            }) || else_
                .as_ref()
                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
        }
        Expr::InList { target, values, .. } => {
            expr_references_outer_scope(target, outer_scopes, inner_scopes)
                || values
                    .iter()
                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
        }
        Expr::Between {
            target, low, high, ..
        } => {
            expr_references_outer_scope(target, outer_scopes, inner_scopes)
                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
        }
        // A nested subquery is checked one level deep: the relations of
        // *this* query become its outer scope.
        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
        Expr::Literal { .. } | Expr::Parameter { .. } => false,
    }
}

fn field_ref_references_outer_scope(
    field: &crate::storage::query::ast::FieldRef,
    outer_scopes: &[String],
    inner_scopes: &[String],
) -> bool {
    match field {
        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
            outer_scopes.iter().any(|scope| scope == table)
                && !inner_scopes.iter().any(|scope| scope == table)
        }
        _ => false,
    }
}
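
// Sketch of the shadowing rule (hypothetical scope names): a qualifier
// counts as correlated only when the outer query introduces it and the
// subquery does not re-introduce it.
#[cfg(test)]
mod outer_scope_reference_sketch {
    use super::*;
    use crate::storage::query::ast::FieldRef;

    #[test]
    fn shadowed_scopes_do_not_correlate() {
        let field = FieldRef::TableColumn {
            table: "users".into(),
            column: "id".into(),
        };
        let outer = vec!["users".to_string()];

        // Outer introduces `users`, subquery does not: correlated.
        assert!(field_ref_references_outer_scope(&field, &outer, &[]));

        // Subquery shadows `users`: not correlated.
        assert!(!field_ref_references_outer_scope(&field, &outer, &outer));
    }
}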

/// Flatten a single-column `UnifiedResult` into a `Vec<Value>` for use as
/// an expression-subquery value list. Errors on multi-column results,
/// falls back to the first record's column name when the result carries
/// no column metadata, and maps missing cells to `Value::Null`.
fn first_column_values(
    result: crate::storage::query::unified::UnifiedResult,
) -> RedDBResult<Vec<Value>> {
    if result.columns.len() > 1 {
        return Err(RedDBError::Query(
            "expression subquery must return exactly one column".to_string(),
        ));
    }
    let fallback_column = result
        .records
        .first()
        .and_then(|record| record.column_names().into_iter().next())
        .map(|name| name.to_string());
    let column = result.columns.first().cloned().or(fallback_column);
    let Some(column) = column else {
        return Ok(Vec::new());
    };
    Ok(result
        .records
        .iter()
        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
        .collect())
}

/// Parse a timestamp literal into Unix milliseconds. Accepts bare integer
/// milliseconds or a `YYYY-MM-DD[ ...]` date (any time-of-day portion is
/// ignored). Returns `None` for anything else.
fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
    // Bare integer ms.
    if let Ok(n) = s.parse::<u128>() {
        return Some(n);
    }
    // Fallback: ISO-8601 like 2030-01-02 03:04:05 — accept the date
    // portion only (midnight UTC). Full RFC3339 parsing is a stretch
    // goal; the common case is `'2030-01-01'`.
    if let Some(date) = s.split_whitespace().next() {
        let parts: Vec<&str> = date.split('-').collect();
        if parts.len() == 3 {
            let (y, m, d) = (parts[0], parts[1], parts[2]);
            if let (Ok(y), Ok(m), Ok(d)) = (y.parse::<i64>(), m.parse::<u32>(), d.parse::<u32>()) {
                // Days since 1970-01-01 via `days_from_civil`. Pre-epoch
                // dates would go negative and cannot be represented in
                // the unsigned return type, so reject them. Good enough
                // for test fixtures; precise parsing lands when we wire
                // chrono.
                let days_in = days_from_civil(y, m, d);
                if days_in >= 0 {
                    return Some((days_in as u128) * 86_400_000u128);
                }
            }
        }
    }
    None
}
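
// Worked examples (values checked by hand against the epoch math below):
// 2030-01-01 is 21_915 days after 1970-01-01, i.e. 1_893_456_000_000 ms.
#[cfg(test)]
mod parse_timestamp_to_ms_sketch {
    use super::*;

    #[test]
    fn parses_integers_and_dates() {
        assert_eq!(parse_timestamp_to_ms("1700000000000"), Some(1_700_000_000_000));
        assert_eq!(parse_timestamp_to_ms("2030-01-01"), Some(1_893_456_000_000));
        // Time-of-day is ignored: same day, same midnight timestamp.
        assert_eq!(
            parse_timestamp_to_ms("2030-01-01 03:04:05"),
            Some(1_893_456_000_000)
        );
        assert_eq!(parse_timestamp_to_ms("not-a-date"), None);
    }
}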

/// Days from Unix epoch using H. Hinnant's days-from-civil algorithm.
/// Robust for the entire Gregorian range; used by `parse_timestamp_to_ms`.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    let y = if m <= 2 { y - 1 } else { y };
    let era = if y >= 0 { y } else { y - 399 } / 400;
    let yoe = (y - era * 400) as u64; // [0, 399]
    let doy = (153 * (if m > 2 { m - 3 } else { m + 9 }) as u64 + 2) / 5 + d as u64 - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    era * 146097 + doe as i64 - 719468
}
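
// Spot checks for the epoch math (standard known values):
//   1970-01-01 -> 0, 2000-03-01 -> 11_017, 2030-01-01 -> 21_915.
#[cfg(test)]
mod days_from_civil_sketch {
    use super::*;

    #[test]
    fn known_dates() {
        assert_eq!(days_from_civil(1970, 1, 1), 0);
        assert_eq!(days_from_civil(2000, 3, 1), 11_017);
        assert_eq!(days_from_civil(2030, 1, 1), 21_915);
        // Pre-epoch dates go negative.
        assert_eq!(days_from_civil(1969, 12, 31), -1);
    }
}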

/// Flatten a canonical logical plan tree into one `UnifiedRecord` per
/// operator (op, source, est_rows, est_cost, depth), emitted in pre-order
/// so each child row follows its parent at `depth + 1`.
fn walk_plan_node(
    node: &crate::storage::query::planner::CanonicalLogicalNode,
    depth: usize,
    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
) {
    use std::sync::Arc;
    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
    rec.set_arc(
        Arc::from("source"),
        node.source.clone().map(Value::text).unwrap_or(Value::Null),
    );
    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
    out.push(rec);
    for child in &node.children {
        walk_plan_node(child, depth + 1, out);
    }
}
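
// Illustrative output shape for a hypothetical two-node plan (comment
// trace; operator names and estimates invented for illustration):
//
//   op          source    est_rows  est_cost  depth
//   "Project"   NULL      100.0     1.0       0
//   "SeqScan"   "users"   100.0     10.0      1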