//! Runtime core for `reddb_server` (`runtime/impl_core.rs`): per-statement
//! thread-local context (connection id, auth identity, MVCC snapshot, tenant,
//! config/secret resolvers) plus the snapshot visibility checks used by scans.
1use super::*;
2use crate::application::entity::metadata_to_json;
3use crate::auth::column_policy_gate::ColumnAccessRequest;
4use crate::auth::UserId;
5use crate::replication::cdc::ChangeRecord;
6use crate::replication::logical::{ApplyMode, LogicalChangeApplier};
7use crate::storage::query::ast::TableSource;
8
thread_local! {
    /// Current connection id for the executing statement. Set by the
    /// per-connection wrapper (stdio/gRPC handlers) before dispatching
    /// into `execute_query`; falls back to `0` for embedded callers.
    static CURRENT_CONN_ID: std::cell::Cell<u64> = const { std::cell::Cell::new(0) };

    /// Authenticated user + role for the executing statement (Phase 2.5.2
    /// RLS enforcement). Set by the transport middleware after validating
    /// credentials (password / cert / oauth); unset means "anonymous" /
    /// "embedded" — RLS policies degrade to the role-agnostic subset.
    ///
    /// `None` skips RLS injection entirely; `Some((username, role))`
    /// passes `role` to `matching_rls_policies(table, Some(role), action)`.
    static CURRENT_AUTH_IDENTITY: std::cell::RefCell<Option<(String, crate::auth::Role)>> =
        const { std::cell::RefCell::new(None) };

    /// MVCC snapshot scoped to the currently-executing statement (Phase
    /// 2.3.2d PG parity). `execute_query` captures it on entry and drops
    /// it on exit; every scan consults it via
    /// `entity_visible_under_current_snapshot` to hide tuples whose xmin
    /// hasn't committed or whose xmax already has.
    ///
    /// `None` means "pre-MVCC semantics" — the read path returns every
    /// tuple regardless of xmin/xmax. All embedded callers that bypass
    /// `execute_query` see this default.
    static CURRENT_SNAPSHOT: std::cell::RefCell<Option<SnapshotContext>> =
        const { std::cell::RefCell::new(None) };

    /// Cheap presence flag for `CURRENT_SNAPSHOT`. Scan hot paths
    /// poll this instead of `borrow()`-ing the RefCell on every
    /// row — the common case (autocommit / no MVCC session) reads
    /// one plain thread-local `Cell<bool>` (no atomics involved) and
    /// short-circuits, saving ~10ns × N rows on aggregate_group /
    /// select_range scans. Every writer of `CURRENT_SNAPSHOT` must
    /// keep this flag in lockstep with the option's presence.
    static HAS_SNAPSHOT: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };

    /// Session-scoped tenant id for the current connection (Phase 2.5.3
    /// multi-tenancy). Populated by `SET TENANT 'id'` or by transport
    /// middleware after resolving tenant from auth claims. Read by the
    /// `CURRENT_TENANT()` scalar function — RLS policies typically
    /// combine it as `USING (tenant_id = CURRENT_TENANT())` to scope
    /// every query to one tenant.
    ///
    /// `None` means "no tenant bound" — `CURRENT_TENANT()` returns
    /// NULL, and RLS policies that gate on it hide every row.
    static CURRENT_TENANT_ID: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local config resolver. SQL expressions materialize the
    /// `red_config` snapshot lazily on the first `$config.*`/`CONFIG()`
    /// access, keeping ordinary statements on the zero-scan path.
    static CURRENT_CONFIG_RESOLVER: std::cell::RefCell<Option<ConfigResolver>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local secret resolver. SQL expressions materialize the
    /// vault KV snapshot lazily on first `$secret.*` access, then use
    /// lock-free map reads for the rest of the statement.
    static CURRENT_SECRET_RESOLVER: std::cell::RefCell<Option<SecretResolver>> =
        const { std::cell::RefCell::new(None) };
}
68
69fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
70    match value {
71        Value::Text(s) => Ok(s.to_string()),
72        Value::Integer(n) => Ok(n.to_string()),
73        Value::UnsignedInteger(n) => Ok(n.to_string()),
74        Value::Float(n) => Ok(n.to_string()),
75        Value::Boolean(b) => Ok(b.to_string()),
76        Value::Null => Err(RedDBError::Query(
77            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
78                .to_string(),
79        )),
80        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
81            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
82                .to_string(),
83        )),
84        _ => Err(RedDBError::Query(format!(
85            "SET SECRET does not support value type {:?} yet",
86            value.data_type()
87        ))),
88    }
89}
90
91fn system_keyed_collection_contract(
92    name: &str,
93    model: crate::catalog::CollectionModel,
94) -> crate::physical::CollectionContract {
95    let now = crate::utils::now_unix_millis() as u128;
96    crate::physical::CollectionContract {
97        name: name.to_string(),
98        declared_model: model,
99        schema_mode: crate::catalog::SchemaMode::Dynamic,
100        origin: crate::physical::ContractOrigin::Implicit,
101        version: 1,
102        created_at_unix_ms: now,
103        updated_at_unix_ms: now,
104        default_ttl_ms: None,
105        context_index_fields: Vec::new(),
106        declared_columns: Vec::new(),
107        table_def: None,
108        timestamps_enabled: false,
109        context_index_enabled: false,
110        append_only: false,
111        subscriptions: Vec::new(),
112    }
113}
114
/// Snapshot + manager pair used for read-path visibility checks.
///
/// The manager is needed in addition to the snapshot because `aborted`
/// state mutates after the snapshot is captured — a ROLLBACK by a
/// committed-at-capture-time writer must still hide its tuples. Keeping
/// the Arc around is O(pointer) and the RwLock reads on `is_aborted`
/// are cheap (HashSet lookup under a parking_lot read guard).
///
/// `own_xids` (Phase 2.3.2e) lists the xids belonging to the current
/// connection's transaction — the parent xid plus every open
/// savepoint sub-xid. The visibility rule promotes rows stamped with
/// these xids to "always visible (unless aborted)" so the writer sees
/// its own nested-savepoint writes even though their xids exceed
/// `snapshot.xid`.
#[derive(Clone)]
pub struct SnapshotContext {
    /// Frozen committed-xid view captured at statement start.
    pub snapshot: crate::storage::transaction::snapshot::Snapshot,
    /// Live manager consulted for aborts that happen after capture.
    pub manager: Arc<crate::storage::transaction::snapshot::SnapshotManager>,
    /// This connection's parent xid plus open savepoint sub-xids.
    pub own_xids: std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
}
135
136/// Install a connection id on the current thread for the duration of a
137/// statement. Transaction state (`RuntimeInner::tx_contexts`) is keyed
138/// by this id so different connections can hold independent BEGINs.
139///
140/// Pub so transports (PG wire, gRPC, HTTP per-request spawners) and
141/// tests can emulate per-connection isolation. Call it once when
142/// binding the connection's worker thread; pair with
143/// `clear_current_connection_id` on teardown.
144pub fn set_current_connection_id(id: u64) {
145    CURRENT_CONN_ID.with(|c| c.set(id));
146}
147
148/// Reset the thread's connection id back to `0` (autocommit).
149pub fn clear_current_connection_id() {
150    CURRENT_CONN_ID.with(|c| c.set(0));
151}
152
153/// Read the connection id set by `set_current_connection_id`. Returns
154/// `0` when no wrapper installed one — auto-commit path.
155pub fn current_connection_id() -> u64 {
156    CURRENT_CONN_ID.with(|c| c.get())
157}
158
159/// Install the authenticated identity for the current thread (Phase 2.5.2
160/// RLS enforcement). Transport layers call this right after resolving
161/// auth so the query dispatch can fold RLS policies into the filter.
162pub fn set_current_auth_identity(username: String, role: crate::auth::Role) {
163    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = Some((username, role)));
164}
165
166/// Clear the thread-local auth identity. Transports call this after the
167/// statement completes so pooled threads don't leak identities across
168/// requests.
169pub fn clear_current_auth_identity() {
170    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = None);
171}
172
173/// Read the current-thread auth identity. `None` when no transport
174/// installed one (embedded mode / anonymous access).
175pub(crate) fn current_auth_identity() -> Option<(String, crate::auth::Role)> {
176    CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone())
177}
178
179/// Install the session tenant id for the current thread (Phase 2.5.3
180/// multi-tenancy). Called by `SET TENANT 'id'` dispatch and by
181/// transport middleware that resolves tenant from auth claims (e.g.
182/// JWT `tenant` claim, HTTP header, subdomain).
183pub fn set_current_tenant(tenant_id: String) {
184    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = Some(tenant_id));
185}
186
187/// Clear the current-thread tenant — `CURRENT_TENANT()` will then
188/// return NULL and any RLS policy gated on it will hide every row.
189pub fn clear_current_tenant() {
190    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = None);
191}
192
193/// Read the current-thread tenant id, applying overrides in priority order:
194///   1. `WITHIN TENANT '<id>' …` per-statement override (highest)
195///   2. `SET LOCAL TENANT '<id>'` transaction-local override (consulted
196///      only when the current connection has an open transaction)
197///   3. `SET TENANT '<id>'` session-level thread-local
198///   4. `None` (deny-default for RLS).
199///
200/// The transaction-local layer is read through the runtime; an embedded
201/// helper crate that has no `RedDBRuntime` access still gets correct
202/// behaviour for layers 1, 3, and 4.
203pub fn current_tenant() -> Option<String> {
204    let inherited = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
205    if let Some(over) = current_scope_override() {
206        if over.tenant.is_active() {
207            return over.tenant.resolve(inherited);
208        }
209    }
210    if let Some(tx_local) = current_tx_local_tenant() {
211        return tx_local;
212    }
213    inherited
214}
215
thread_local! {
    /// Snapshot of the active connection's `tx_local_tenants` entry for
    /// the current `execute_query` call. Outer `Some(_)` means "a
    /// transaction-local tenant override is active for this call";
    /// inner is the override's value (`Some(s)` overrides to `s`,
    /// `None` overrides to NULL/cleared). Refreshed at the top of every
    /// `execute_query` invocation and cleared by the RAII guard
    /// (`TxLocalTenantGuard`) on return so pooled connections cannot
    /// leak the override past the statement that owns it.
    static TX_LOCAL_TENANT: std::cell::RefCell<Option<Option<String>>> =
        const { std::cell::RefCell::new(None) };
}
228
229fn current_tx_local_tenant() -> Option<Option<String>> {
230    TX_LOCAL_TENANT.with(|cell| cell.borrow().clone())
231}
232
233/// Recognise `SET LOCAL TENANT '<id>'` / `SET LOCAL TENANT NULL` —
234/// returns `Ok(Some(Some(id)))` for an explicit value, `Ok(Some(None))`
235/// for an explicit NULL clear, `Ok(None)` when the input is not a
236/// `SET LOCAL TENANT` statement at all, and `Err` when the prefix
237/// matches but the value is malformed.
238fn parse_set_local_tenant(query: &str) -> RedDBResult<Option<Option<String>>> {
239    let mut tokens = query.split_ascii_whitespace();
240    let Some(w1) = tokens.next() else {
241        return Ok(None);
242    };
243    if !w1.eq_ignore_ascii_case("SET") {
244        return Ok(None);
245    }
246    let Some(w2) = tokens.next() else {
247        return Ok(None);
248    };
249    if !w2.eq_ignore_ascii_case("LOCAL") {
250        return Ok(None);
251    }
252    let Some(w3) = tokens.next() else {
253        return Ok(None);
254    };
255    if !w3.eq_ignore_ascii_case("TENANT") {
256        return Ok(None);
257    }
258    let rest: String = tokens.collect::<Vec<_>>().join(" ");
259    let rest = rest.trim().trim_end_matches(';').trim();
260    let value_str = rest.strip_prefix('=').map(|s| s.trim()).unwrap_or(rest);
261    if value_str.is_empty() {
262        return Err(RedDBError::Query(
263            "SET LOCAL TENANT expects a string literal or NULL".to_string(),
264        ));
265    }
266    if value_str.eq_ignore_ascii_case("NULL") {
267        return Ok(Some(None));
268    }
269    if value_str.starts_with('\'') && value_str.ends_with('\'') && value_str.len() >= 2 {
270        let inner = &value_str[1..value_str.len() - 1];
271        return Ok(Some(Some(inner.to_string())));
272    }
273    Err(RedDBError::Query(format!(
274        "SET LOCAL TENANT expects a string literal or NULL, got `{value_str}`"
275    )))
276}
277
278pub(crate) struct TxLocalTenantGuard;
279
280impl TxLocalTenantGuard {
281    pub fn install(value: Option<Option<String>>) -> Self {
282        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = value);
283        Self
284    }
285}
286
287impl Drop for TxLocalTenantGuard {
288    fn drop(&mut self) {
289        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = None);
290    }
291}
292
thread_local! {
    /// Stack of `WITHIN ... <stmt>` overrides active on the current
    /// thread. Every entry corresponds to one in-flight `execute_query`
    /// call that started with a `WITHIN` prefix; the entry is pushed
    /// before dispatch and popped before the call returns. The stack
    /// shape supports nested invocations (e.g. a view body that itself
    /// re-enters execute_query); readers only consult the top entry
    /// via `current_scope_override`.
    static SCOPE_OVERRIDES: std::cell::RefCell<Vec<crate::runtime::within_clause::ScopeOverride>> =
        const { std::cell::RefCell::new(Vec::new()) };
}
303
304pub(crate) fn push_scope_override(over: crate::runtime::within_clause::ScopeOverride) {
305    SCOPE_OVERRIDES.with(|cell| cell.borrow_mut().push(over));
306}
307
308pub(crate) fn pop_scope_override() {
309    SCOPE_OVERRIDES.with(|cell| {
310        cell.borrow_mut().pop();
311    });
312}
313
314pub(crate) fn current_scope_override() -> Option<crate::runtime::within_clause::ScopeOverride> {
315    SCOPE_OVERRIDES.with(|cell| cell.borrow().last().cloned())
316}
317
318/// Cheap probe: is any `WITHIN …` scope override active on this
319/// thread? The fast-path needs to know without paying for the full
320/// `.last().cloned()` allocation — just peek at stack length.
321pub(crate) fn has_scope_override_active() -> bool {
322    SCOPE_OVERRIDES.with(|cell| !cell.borrow().is_empty())
323}
324
325/// RAII guard pairing `push_scope_override` with the matching pop, so
326/// the stack stays balanced even when the inner `execute_query` returns
327/// early via `?`.
328pub(crate) struct ScopeOverrideGuard;
329
330impl ScopeOverrideGuard {
331    pub fn install(over: crate::runtime::within_clause::ScopeOverride) -> Self {
332        push_scope_override(over);
333        Self
334    }
335}
336
337impl Drop for ScopeOverrideGuard {
338    fn drop(&mut self) {
339        pop_scope_override();
340    }
341}
342
343/// Read the current-thread auth identity, honouring per-statement
344/// `WITHIN ... USER '<u>' AS ROLE '<r>'` overrides. The override only
345/// supplies projected strings — it never grants additional privilege —
346/// so callers that need to make authorisation decisions must read from
347/// the underlying `current_auth_identity()` directly.
348pub(crate) fn current_user_projected() -> Option<String> {
349    let inherited = current_auth_identity().map(|(u, _)| u);
350    if let Some(over) = current_scope_override() {
351        if over.user.is_active() {
352            return over.user.resolve(inherited);
353        }
354    }
355    inherited
356}
357
358pub(crate) fn current_role_projected() -> Option<String> {
359    let inherited = current_auth_identity().map(|(_, r)| format!("{r:?}").to_lowercase());
360    if let Some(over) = current_scope_override() {
361        if over.role.is_active() {
362            return over.role.resolve(inherited);
363        }
364    }
365    inherited
366}
367
368pub(crate) fn current_secret_value(path: &str) -> Option<String> {
369    let key = path.to_ascii_lowercase();
370    CURRENT_SECRET_RESOLVER.with(|cell| {
371        let mut resolver = cell.borrow_mut();
372        let resolver = resolver.as_mut()?;
373        if resolver.values.is_none() {
374            resolver.values = resolver
375                .store
376                .as_ref()
377                .map(|store| store.vault_kv_snapshot());
378        }
379        let values = resolver.values.as_ref()?;
380        values.get(&key).cloned().or_else(|| {
381            key.strip_prefix("red.vault/").and_then(|rest| {
382                values
383                    .get(rest)
384                    .cloned()
385                    .or_else(|| values.get(&format!("red.secret.{rest}")).cloned())
386            })
387        })
388    })
389}
390
/// Lazily-materialized vault snapshot for one statement; installed by
/// `SecretStoreGuard` and consumed by `current_secret_value`.
struct SecretResolver {
    // Auth store to snapshot from; `None` in embedded/no-auth setups.
    store: Option<Arc<crate::auth::store::AuthStore>>,
    // `None` until the first `$secret.*` lookup of the statement, then
    // the full vault KV snapshot for lock-free reads.
    values: Option<HashMap<String, String>>,
}
395
396pub(super) struct SecretStoreGuard {
397    previous: Option<SecretResolver>,
398}
399
400impl SecretStoreGuard {
401    pub(super) fn install(store: Option<Arc<crate::auth::store::AuthStore>>) -> Self {
402        let previous = CURRENT_SECRET_RESOLVER.with(|cell| {
403            cell.replace(Some(SecretResolver {
404                store,
405                values: None,
406            }))
407        });
408        Self { previous }
409    }
410}
411
412impl Drop for SecretStoreGuard {
413    fn drop(&mut self) {
414        let previous = self.previous.take();
415        CURRENT_SECRET_RESOLVER.with(|cell| {
416            cell.replace(previous);
417        });
418    }
419}
420
421pub(crate) fn current_config_value(path: &str) -> Option<Value> {
422    let key = path.to_ascii_lowercase();
423    CURRENT_CONFIG_RESOLVER.with(|cell| {
424        let mut resolver = cell.borrow_mut();
425        let resolver = resolver.as_mut()?;
426        if resolver.values.is_none() {
427            resolver.values = Some(latest_config_snapshot(&resolver.db));
428        }
429        let values = resolver.values.as_ref()?;
430        values.get(&key).cloned().or_else(|| {
431            key.strip_prefix("red.config/")
432                .and_then(|rest| values.get(&format!("red.config.{rest}")).cloned())
433        })
434    })
435}
436
437fn update_current_config_value(path: &str, value: Value) {
438    let key = path.to_ascii_lowercase();
439    CURRENT_CONFIG_RESOLVER.with(|cell| {
440        if let Some(resolver) = cell.borrow_mut().as_mut() {
441            if let Some(values) = resolver.values.as_mut() {
442                values.insert(key, value);
443            }
444        }
445    });
446}
447
448fn update_current_secret_value(path: &str, value: Option<String>) {
449    let key = path.to_ascii_lowercase();
450    CURRENT_SECRET_RESOLVER.with(|cell| {
451        if let Some(resolver) = cell.borrow_mut().as_mut() {
452            let Some(values) = resolver.values.as_mut() else {
453                return;
454            };
455            match value {
456                Some(value) => {
457                    values.insert(key, value);
458                }
459                None => {
460                    values.remove(&key);
461                }
462            }
463        }
464    });
465}
466
/// Build a key → value map of the latest config entries. "Latest" is
/// decided per key by highest entity id (see
/// `insert_latest_config_value`; on an id tie the later-scanned entry
/// wins, so the `red.config` pass can shadow the `red_config` pass).
///
/// Two physical collections are merged, in order:
///   * `red_config` rows expose `key`/`value`; keys beginning with
///     `red.config.` are additionally aliased under `red.config/<rest>`.
///   * `red.config` rows are stored under `red.config/<key>` and honour
///     a `tombstone` flag (tombstoned entries are skipped).
fn latest_config_snapshot(db: &RedDB) -> HashMap<String, Value> {
    let mut latest: HashMap<String, (u64, Value)> = HashMap::new();

    if let Some(manager) = db.store().get_collection("red_config") {
        manager.for_each_entity(|entity| {
            // Non-row payloads carry no config data — keep iterating.
            let Some(row) = entity.data.as_row() else {
                return true;
            };
            let Some(Value::Text(key)) = row.get_field("key") else {
                return true;
            };
            // Missing `value` field degrades to NULL rather than skipping.
            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
            let id = entity.id.raw();
            let key = key.to_ascii_lowercase();
            insert_latest_config_value(&mut latest, key.clone(), id, value.clone());
            // Mirror `red.config.<rest>` keys under the slash spelling so
            // both lookup forms resolve.
            if let Some(rest) = key.strip_prefix("red.config.") {
                insert_latest_config_value(&mut latest, format!("red.config/{rest}"), id, value);
            }
            true
        });
    }

    if let Some(manager) = db.store().get_collection("red.config") {
        manager.for_each_entity(|entity| {
            let Some(row) = entity.data.as_row() else {
                return true;
            };
            // Tombstoned rows are deletions — skip them.
            if matches!(row.get_field("tombstone"), Some(Value::Boolean(true))) {
                return true;
            }
            let Some(Value::Text(key)) = row.get_field("key") else {
                return true;
            };
            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
            insert_latest_config_value(
                &mut latest,
                format!("red.config/{}", key.to_ascii_lowercase()),
                entity.id.raw(),
                value,
            );
            true
        });
    }

    // Strip the ordering id; callers only need key → value.
    latest
        .into_iter()
        .map(|(key, (_, value))| (key, value))
        .collect()
}
516
517fn insert_latest_config_value(
518    latest: &mut HashMap<String, (u64, Value)>,
519    key: String,
520    id: u64,
521    value: Value,
522) {
523    match latest.get(&key) {
524        Some((prev_id, _)) if *prev_id > id => {}
525        _ => {
526            latest.insert(key, (id, value));
527        }
528    }
529}
530
/// Lazily-materialized `red_config` snapshot for one statement;
/// installed by `ConfigSnapshotGuard` and read by `current_config_value`.
struct ConfigResolver {
    // Database handle used to build the snapshot on first access.
    db: Arc<RedDB>,
    // `None` until the first `$config.*` read of the statement.
    values: Option<HashMap<String, Value>>,
}
535
536pub(super) struct ConfigSnapshotGuard {
537    previous: Option<ConfigResolver>,
538}
539
540impl ConfigSnapshotGuard {
541    pub(super) fn install(db: Arc<RedDB>) -> Self {
542        let previous = CURRENT_CONFIG_RESOLVER
543            .with(|cell| cell.replace(Some(ConfigResolver { db, values: None })));
544        Self { previous }
545    }
546}
547
548impl Drop for ConfigSnapshotGuard {
549    fn drop(&mut self) {
550        let previous = self.previous.take();
551        CURRENT_CONFIG_RESOLVER.with(|cell| {
552            cell.replace(previous);
553        });
554    }
555}
556
557/// Install the MVCC snapshot used by the current thread for the duration
558/// of one statement. Paired with `clear_current_snapshot()` — callers
559/// should prefer the `CurrentSnapshotGuard` RAII wrapper so early returns
560/// still clean up.
561pub fn set_current_snapshot(ctx: SnapshotContext) {
562    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = Some(ctx));
563    HAS_SNAPSHOT.with(|c| c.set(true));
564}
565
566pub fn clear_current_snapshot() {
567    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = None);
568    HAS_SNAPSHOT.with(|c| c.set(false));
569}
570
571/// Drop-guard that restores the previous snapshot on scope exit. Safe to
572/// nest — each statement saves the caller's snapshot and puts it back
573/// instead of blindly clearing, so a top-level `execute_query` called
574/// from inside another statement dispatch (e.g. vector source subqueries)
575/// doesn't strip visibility from the outer scan.
576pub(crate) struct CurrentSnapshotGuard {
577    previous: Option<SnapshotContext>,
578}
579
580impl CurrentSnapshotGuard {
581    pub(crate) fn install(ctx: SnapshotContext) -> Self {
582        let previous = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
583        set_current_snapshot(ctx);
584        Self { previous }
585    }
586}
587
588impl Drop for CurrentSnapshotGuard {
589    fn drop(&mut self) {
590        let prev = self.previous.take();
591        let has = prev.is_some();
592        CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = prev);
593        HAS_SNAPSHOT.with(|c| c.set(has));
594    }
595}
596
597/// Is this entity visible under the current thread's MVCC snapshot?
598///
599/// Returns `true` (no filtering) when no snapshot is installed — that
600/// path is used by embedded callers and by operations that intentionally
601/// bypass MVCC (VACUUM, snapshot export, admin introspection).
602///
603/// When a snapshot is installed the result is
604///   `snapshot.sees(xmin, xmax) && !mgr.is_aborted(xmin) && !xmax_half_abort`
605/// where `xmax_half_abort` re-grants visibility for tuples whose
606/// deleting transaction rolled back.
607#[inline]
608pub fn entity_visible_under_current_snapshot(
609    entity: &crate::storage::unified::entity::UnifiedEntity,
610) -> bool {
611    // Fast path — one `Cell<bool>` read, no RefCell borrow. Autocommit
612    // reads (no active MVCC transaction) see `HAS_SNAPSHOT == false`
613    // and return `true` without ever touching the snapshot context.
614    // This runs on every row of every scan; the slow path only fires
615    // inside an explicit transaction.
616    if !HAS_SNAPSHOT.with(|c| c.get()) {
617        return true;
618    }
619    CURRENT_SNAPSHOT.with(|cell| {
620        let guard = cell.borrow();
621        let Some(ctx) = guard.as_ref() else {
622            return true;
623        };
624        visibility_check(ctx, entity.xmin, entity.xmax)
625    })
626}
627
628/// Direct visibility check from raw `(xmin, xmax)` — bypasses the
629/// entity borrow for callers that already decomposed the tuple (e.g.
630/// pre-materialized scan caches). Same semantics as
631/// `entity_visible_under_current_snapshot`.
632#[inline]
633pub(crate) fn xids_visible_under_current_snapshot(xmin: u64, xmax: u64) -> bool {
634    if !HAS_SNAPSHOT.with(|c| c.get()) {
635        return true;
636    }
637    CURRENT_SNAPSHOT.with(|cell| {
638        let guard = cell.borrow();
639        let Some(ctx) = guard.as_ref() else {
640            return true;
641        };
642        visibility_check(ctx, xmin, xmax)
643    })
644}
645
646/// Clone the current thread's snapshot context. Parallel scan paths
647/// (`query_all_zoned` with `std::thread::scope`) call this on the main
648/// thread *before* spawning workers so the captured `SnapshotContext`
649/// can be moved into every worker closure. Worker threads do not
650/// inherit thread-locals, so calling `entity_visible_under_current_snapshot`
651/// from inside a spawned closure would silently skip the filter.
652pub fn capture_current_snapshot() -> Option<SnapshotContext> {
653    CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone())
654}
655
/// Frozen MVCC + identity context for callers that need to reinstall
/// the same view across thread-local boundaries — long-lived cursors,
/// background batchers, anything that detaches from the dispatch path
/// and re-enters later.
///
/// The bundle bakes in the three thread-locals every read path
/// consults: `SnapshotContext` (MVCC visibility), the auth identity
/// (RLS policy gate), and the tenant id (RLS scope). A FETCH that
/// reinstalls the bundle sees exactly the same rows as the DECLARE
/// would have, regardless of writes that landed in between.
///
/// Cheap to clone — `SnapshotContext` is a clone of three
/// `Arc`-backed fields, identity is a `(String, Role)`, tenant is a
/// `String`. None of these contend with the read path.
#[derive(Clone, Default)]
pub struct SnapshotBundle {
    /// MVCC visibility context; `None` = unfiltered reads.
    pub snapshot: Option<SnapshotContext>,
    /// `(username, role)` for RLS; `None` = anonymous/embedded.
    pub auth: Option<(String, crate::auth::Role)>,
    /// Session tenant id; `None` = no tenant bound.
    pub tenant: Option<String>,
}
676
677/// Capture the three read-path thread-locals into a `SnapshotBundle`.
678/// Pairs with `with_snapshot_bundle` for re-entry.
679pub fn snapshot_bundle() -> SnapshotBundle {
680    SnapshotBundle {
681        snapshot: capture_current_snapshot(),
682        auth: current_auth_identity(),
683        tenant: CURRENT_TENANT_ID.with(|cell| cell.borrow().clone()),
684    }
685}
686
/// Reinstall a captured `SnapshotBundle` for the duration of `f`.
/// Restores the caller's previous thread-locals on exit (panic-safe via
/// the explicit guard struct so a panic in `f` cannot leak the
/// installed identity into the worker's next request).
pub fn with_snapshot_bundle<R>(bundle: &SnapshotBundle, f: impl FnOnce() -> R) -> R {
    // Saved copies of the three read-path thread-locals; `Drop` puts
    // them back even if `f` unwinds.
    struct Guard {
        prev_snapshot: Option<SnapshotContext>,
        prev_auth: Option<(String, crate::auth::Role)>,
        prev_tenant: Option<String>,
    }
    impl Drop for Guard {
        fn drop(&mut self) {
            let snap = self.prev_snapshot.take();
            let has = snap.is_some();
            // Keep the HAS_SNAPSHOT fast-path flag coherent with the
            // restored CURRENT_SNAPSHOT value.
            CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = snap);
            HAS_SNAPSHOT.with(|c| c.set(has));
            CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = self.prev_auth.take());
            CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = self.prev_tenant.take());
        }
    }

    let _guard = {
        // Capture the caller's state *before* overwriting it.
        let prev_snapshot = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
        let prev_auth = CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone());
        let prev_tenant = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());

        // Install the bundle; set/clear_current_snapshot also maintain
        // the HAS_SNAPSHOT fast-path flag.
        match bundle.snapshot.clone() {
            Some(ctx) => set_current_snapshot(ctx),
            None => clear_current_snapshot(),
        }
        CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = bundle.auth.clone());
        CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = bundle.tenant.clone());

        Guard {
            prev_snapshot,
            prev_auth,
            prev_tenant,
        }
    };
    f()
}
728
729/// Apply the same visibility rules used by the thread-local helpers
730/// against a caller-provided context. Intended for parallel workers
731/// that captured the snapshot with `capture_current_snapshot()`.
732#[inline]
733pub fn entity_visible_with_context(
734    ctx: Option<&SnapshotContext>,
735    entity: &crate::storage::unified::entity::UnifiedEntity,
736) -> bool {
737    match ctx {
738        Some(ctx) => visibility_check(ctx, entity.xmin, entity.xmax),
739        None => true,
740    }
741}
742
743#[inline]
744fn visibility_check(ctx: &SnapshotContext, xmin: u64, xmax: u64) -> bool {
745    // Writer aborted → tuple never existed from any future reader's view.
746    // Checked *before* the own-xids fast path so an aborted own-sub-xid
747    // (rolled-back savepoint) stays hidden from the parent.
748    if xmin != 0 && ctx.manager.is_aborted(xmin) {
749        return false;
750    }
751    // Deleter aborted → treat xmax as unset; fall back to xmin-only check.
752    let effective_xmax = if xmax != 0 && ctx.manager.is_aborted(xmax) {
753        0
754    } else {
755        xmax
756    };
757    // Phase 2.3.2e: own-tx writes are always visible to the connection
758    // that stamped them, even when xmin/xmax exceed `snapshot.xid` (as
759    // happens for sub-xids allocated by SAVEPOINT after BEGIN).
760    let own_xmin = xmin != 0 && ctx.own_xids.contains(&xmin);
761    let own_xmax = effective_xmax != 0 && ctx.own_xids.contains(&effective_xmax);
762    if own_xmax {
763        // This connection deleted the row via this xid — hide it from self.
764        return false;
765    }
766    if own_xmin {
767        return true;
768    }
769    ctx.snapshot.sees(xmin, effective_xmax)
770}
771
772fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
773    runtime
774        .inner
775        .pool
776        .lock()
777        .unwrap_or_else(|poisoned| poisoned.into_inner())
778}
779
780fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
781    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
782        return;
783    }
784    scopes.insert(name.to_string());
785}
786
787fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
788    match query.source.as_ref() {
789        Some(crate::storage::query::ast::TableSource::Name(name)) => {
790            cache_scope_insert(scopes, name)
791        }
792        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
793            collect_query_expr_result_cache_scopes(scopes, subquery);
794        }
795        None => cache_scope_insert(scopes, &query.table),
796    }
797}
798
799fn collect_vector_source_scopes(
800    scopes: &mut HashSet<String>,
801    source: &crate::storage::query::ast::VectorSource,
802) {
803    match source {
804        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
805            cache_scope_insert(scopes, collection);
806        }
807        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
808            collect_query_expr_result_cache_scopes(scopes, subquery);
809        }
810        crate::storage::query::ast::VectorSource::Literal(_)
811        | crate::storage::query::ast::VectorSource::Text(_) => {}
812    }
813}
814
815fn collect_path_selector_scopes(
816    scopes: &mut HashSet<String>,
817    selector: &crate::storage::query::ast::NodeSelector,
818) {
819    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
820        cache_scope_insert(scopes, table);
821    }
822}
823
/// Walk a `QueryExpr` and record every concrete data-source name it
/// references (tables, collections, queues, trees, …) into `scopes`
/// via `cache_scope_insert`, which filters out synthetic `__subq_*`
/// names and universal sources. Drives result-cache scope bookkeeping
/// and `collect_table_refs` (view dependency tracking).
fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
    match expr {
        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
        QueryExpr::Join(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.left);
            collect_query_expr_result_cache_scopes(scopes, &query.right);
        }
        QueryExpr::Path(query) => {
            collect_path_selector_scopes(scopes, &query.from);
            collect_path_selector_scopes(scopes, &query.to);
        }
        QueryExpr::Vector(query) => {
            cache_scope_insert(scopes, &query.collection);
            collect_vector_source_scopes(scopes, &query.query_vector);
        }
        QueryExpr::Hybrid(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.structured);
            cache_scope_insert(scopes, &query.vector.collection);
            collect_vector_source_scopes(scopes, &query.vector.query_vector);
        }
        // DML / DDL statements scope to the single object they touch.
        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
        QueryExpr::QueueCommand(query) => match query {
            QueueCommand::Push { queue, .. }
            | QueueCommand::Pop { queue, .. }
            | QueueCommand::Peek { queue, .. }
            | QueueCommand::Len { queue }
            | QueueCommand::Purge { queue }
            | QueueCommand::GroupCreate { queue, .. }
            | QueueCommand::GroupRead { queue, .. }
            | QueueCommand::Pending { queue, .. }
            | QueueCommand::Claim { queue, .. }
            | QueueCommand::Ack { queue, .. }
            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
            // Move touches both ends of the transfer.
            QueueCommand::Move {
                source,
                destination,
                ..
            } => {
                cache_scope_insert(scopes, source);
                cache_scope_insert(scopes, destination);
            }
        },
        QueryExpr::EventsBackfill(query) => {
            cache_scope_insert(scopes, &query.collection);
            cache_scope_insert(scopes, &query.target_queue);
        }
        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::TreeCommand(query) => match query {
            TreeCommand::Insert { collection, .. }
            | TreeCommand::Move { collection, .. }
            | TreeCommand::Delete { collection, .. }
            | TreeCommand::Validate { collection, .. }
            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
        },
        QueryExpr::SearchCommand(query) => match query {
            SearchCommand::Similar { collection, .. }
            | SearchCommand::Hybrid { collection, .. }
            | SearchCommand::SpatialRadius { collection, .. }
            | SearchCommand::SpatialBbox { collection, .. }
            | SearchCommand::SpatialNearest { collection, .. } => {
                cache_scope_insert(scopes, collection);
            }
            // These search variants make the collection optional.
            SearchCommand::Text { collection, .. }
            | SearchCommand::Multimodal { collection, .. }
            | SearchCommand::Index { collection, .. }
            | SearchCommand::Context { collection, .. } => {
                if let Some(collection) = collection.as_deref() {
                    cache_scope_insert(scopes, collection);
                }
            }
        },
        QueryExpr::Ask(query) => {
            if let Some(collection) = query.collection.as_deref() {
                cache_scope_insert(scopes, collection);
            }
        }
        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
        QueryExpr::MaintenanceCommand(cmd) => match cmd {
            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
                if let Some(t) = target {
                    cache_scope_insert(scopes, t);
                }
            }
        },
        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::CreateView(cmd) => {
            cache_scope_insert(scopes, &cmd.name);
            // Invalidating the view should also invalidate its dependencies.
            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
        }
        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        // Statements that reference no cacheable collection scope.
        QueryExpr::Graph(_)
        | QueryExpr::GraphCommand(_)
        | QueryExpr::ProbabilisticCommand(_)
        | QueryExpr::SetConfig { .. }
        | QueryExpr::ShowConfig { .. }
        | QueryExpr::SetSecret { .. }
        | QueryExpr::DeleteSecret { .. }
        | QueryExpr::ShowSecrets { .. }
        | QueryExpr::SetTenant(_)
        | QueryExpr::ShowTenant
        | QueryExpr::TransactionControl(_)
        | QueryExpr::CreateSchema(_)
        | QueryExpr::DropSchema(_)
        | QueryExpr::CreateSequence(_)
        | QueryExpr::DropSequence(_)
        | QueryExpr::Grant(_)
        | QueryExpr::Revoke(_)
        | QueryExpr::AlterUser(_)
        | QueryExpr::CreateIamPolicy { .. }
        | QueryExpr::DropIamPolicy { .. }
        | QueryExpr::AttachPolicy { .. }
        | QueryExpr::DetachPolicy { .. }
        | QueryExpr::ShowPolicies { .. }
        | QueryExpr::ShowEffectivePermissions { .. }
        | QueryExpr::SimulatePolicy { .. }
        | QueryExpr::CreateMigration(_)
        | QueryExpr::ApplyMigration(_)
        | QueryExpr::RollbackMigration(_)
        | QueryExpr::ExplainMigration(_)
        | QueryExpr::EventsBackfillStatus { .. } => {}
        QueryExpr::KvCommand(cmd) => {
            use crate::storage::query::ast::KvCommand;
            match cmd {
                KvCommand::Put { collection, .. }
                | KvCommand::InvalidateTags { collection, .. }
                | KvCommand::Get { collection, .. }
                | KvCommand::Unseal { collection, .. }
                | KvCommand::Rotate { collection, .. }
                | KvCommand::History { collection, .. }
                | KvCommand::List { collection, .. }
                | KvCommand::Purge { collection, .. }
                | KvCommand::Watch { collection, .. }
                | KvCommand::Delete { collection, .. }
                | KvCommand::Incr { collection, .. }
                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
            }
        }
        QueryExpr::ConfigCommand(cmd) => {
            use crate::storage::query::ast::ConfigCommand;
            match cmd {
                ConfigCommand::Put { collection, .. }
                | ConfigCommand::Get { collection, .. }
                | ConfigCommand::Resolve { collection, .. }
                | ConfigCommand::Rotate { collection, .. }
                | ConfigCommand::Delete { collection, .. }
                | ConfigCommand::History { collection, .. }
                | ConfigCommand::List { collection, .. }
                | ConfigCommand::Watch { collection, .. }
                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
                    cache_scope_insert(scopes, collection)
                }
            }
        }
    }
}
1007
1008/// Combine matching RLS policies for a table + action into a single
1009/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1010///
1011/// Returns `None` when RLS is disabled or no policy admits the caller's
1012/// role — callers use that to short-circuit the mutation (for DELETE /
1013/// UPDATE we simply skip the operation, which PG expresses as "no rows
1014/// match the policy + predicate combination").
1015pub(crate) fn rls_policy_filter(
1016    runtime: &RedDBRuntime,
1017    table: &str,
1018    action: crate::storage::query::ast::PolicyAction,
1019) -> Option<crate::storage::query::ast::Filter> {
1020    rls_policy_filter_for_kind(
1021        runtime,
1022        table,
1023        action,
1024        crate::storage::query::ast::PolicyTargetKind::Table,
1025    )
1026}
1027
1028/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1029/// Graph / vector / queue / timeseries scans pass the concrete kind;
1030/// policies targeting other kinds are ignored. Legacy Table-scoped
1031/// policies still apply cross-kind — callers register auto-tenancy
1032/// policies as Table today.
1033pub(crate) fn rls_policy_filter_for_kind(
1034    runtime: &RedDBRuntime,
1035    table: &str,
1036    action: crate::storage::query::ast::PolicyAction,
1037    kind: crate::storage::query::ast::PolicyTargetKind,
1038) -> Option<crate::storage::query::ast::Filter> {
1039    use crate::storage::query::ast::Filter;
1040
1041    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1042        return None;
1043    }
1044    let role = current_auth_identity().map(|(_, role)| role);
1045    let role_str = role.map(|r| r.as_str().to_string());
1046    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1047    if policies.is_empty() {
1048        return None;
1049    }
1050    policies
1051        .into_iter()
1052        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1053}
1054
1055/// Returns true when the table has RLS enforcement enabled. Convenience
1056/// shortcut so DML paths can gate the AND-combine work without reaching
1057/// into `runtime.inner.rls_enabled_tables` directly.
1058pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1059    runtime.inner.rls_enabled_tables.read().contains(table)
1060}
1061
1062/// Per-entity gate used by the graph materialiser for `GraphNode`
1063/// entities. RLS is checked against the source collection with
1064/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1065/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1066/// (for back-compat with auto-tenancy declarations). Cached per
1067/// collection so big graphs only resolve the policy chain once.
1068fn node_passes_rls(
1069    runtime: &RedDBRuntime,
1070    collection: &str,
1071    role: Option<&str>,
1072    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1073    entity: &crate::storage::unified::entity::UnifiedEntity,
1074) -> bool {
1075    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1076
1077    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1078        return true;
1079    }
1080    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1081        let policies = runtime.matching_rls_policies_for_kind(
1082            collection,
1083            role,
1084            PolicyAction::Select,
1085            PolicyTargetKind::Nodes,
1086        );
1087        if policies.is_empty() {
1088            None
1089        } else {
1090            policies
1091                .into_iter()
1092                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1093        }
1094    });
1095    let Some(filter) = filter else {
1096        return false;
1097    };
1098    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1099        Some(&runtime.inner.db),
1100        entity,
1101        filter,
1102        collection,
1103        collection,
1104    )
1105}
1106
1107/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1108/// `kind = Edges`.
1109fn edge_passes_rls(
1110    runtime: &RedDBRuntime,
1111    collection: &str,
1112    role: Option<&str>,
1113    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1114    entity: &crate::storage::unified::entity::UnifiedEntity,
1115) -> bool {
1116    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1117
1118    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1119        return true;
1120    }
1121    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1122        let policies = runtime.matching_rls_policies_for_kind(
1123            collection,
1124            role,
1125            PolicyAction::Select,
1126            PolicyTargetKind::Edges,
1127        );
1128        if policies.is_empty() {
1129            None
1130        } else {
1131            policies
1132                .into_iter()
1133                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1134        }
1135    });
1136    let Some(filter) = filter else {
1137        return false;
1138    };
1139    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1140        Some(&runtime.inner.db),
1141        entity,
1142        filter,
1143        collection,
1144        collection,
1145    )
1146}
1147
1148/// RLS policy injection (Phase 2.5.2 PG parity).
1149///
1150/// Fetch every matching policy for the current thread-local role and
1151/// fold them into the query's filter. Semantics mirror PostgreSQL:
1152///
1153/// * Multiple policies on the same table combine with **OR** — a row is
1154///   visible if *any* policy admits it.
1155/// * The combined policy predicate is **AND**-ed into the caller's
1156///   existing `WHERE` clause so explicit predicates continue to trim
1157///   the policy-allowed set.
1158/// * No matching policies + RLS enabled = zero rows (PG's
1159///   restrictive-default). Callers get `None` and return an empty
1160///   `UnifiedResult` without ever dispatching the scan.
1161///
1162/// This runs only when `RuntimeInner::rls_enabled_tables` already
1163/// contains the table name — callers gate the hot path upfront to
1164/// avoid the lock acquisition on tables without RLS.
1165///
1166/// Returns `None` when no policy admits the current role; returns
1167/// `Some(mutated_table)` with policy filters folded in otherwise.
1168fn inject_rls_filters(
1169    runtime: &RedDBRuntime,
1170    frame: &dyn super::statement_frame::ReadFrame,
1171    mut table: crate::storage::query::ast::TableQuery,
1172) -> Option<crate::storage::query::ast::TableQuery> {
1173    use crate::storage::query::ast::{Filter, PolicyAction};
1174
1175    // `None` role falls through to policies with no `TO role` clause.
1176    let role = frame.identity().map(|(_, role)| role);
1177    let role_str = role.map(|r| r.as_str().to_string());
1178    let policies =
1179        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1180
1181    if policies.is_empty() {
1182        // RLS enabled + no policy match = deny everything. Signal the
1183        // caller to short-circuit with an empty result set.
1184        return None;
1185    }
1186
1187    // Combine policy predicates with OR (PG's permissive default).
1188    let combined = policies
1189        .into_iter()
1190        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1191        .expect("policies non-empty");
1192
1193    // AND into the caller's existing filter.
1194    table.filter = Some(match table.filter.take() {
1195        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1196        None => combined,
1197    });
1198    Some(table)
1199}
1200
1201/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1202/// predicate into the join's outer filter. Walking the merged record
1203/// at the join layer (rather than mutating the per-side scan filter)
1204/// keeps the planner's strategy choice and per-side index selection
1205/// undisturbed — the policy predicate uses the qualified `t.col` form
1206/// that resolves cleanly against the merged record's keys.
1207///
1208/// Returns `None` when any leaf has RLS enabled and no policy admits
1209/// the caller — the join short-circuits to an empty result.
1210fn inject_rls_into_join(
1211    runtime: &RedDBRuntime,
1212    frame: &dyn super::statement_frame::ReadFrame,
1213    mut join: crate::storage::query::ast::JoinQuery,
1214) -> Option<crate::storage::query::ast::JoinQuery> {
1215    use crate::storage::query::ast::Filter;
1216
1217    let mut policy_filters: Vec<Filter> = Vec::new();
1218    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1219        return None;
1220    }
1221    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1222        return None;
1223    }
1224
1225    if policy_filters.is_empty() {
1226        return Some(join);
1227    }
1228
1229    let combined = policy_filters
1230        .into_iter()
1231        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1232        .expect("policy_filters non-empty");
1233
1234    join.filter = Some(match join.filter.take() {
1235        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1236        None => combined,
1237    });
1238
1239    Some(join)
1240}
1241
1242/// For each `Table` leaf reachable through nested joins, append the
1243/// RLS-policy filter (combined with OR across that side's matching
1244/// policies) into `out`. Returns `false` when a side has RLS enabled
1245/// but no policy admits the caller — the join must short-circuit.
1246fn collect_join_side_policy(
1247    runtime: &RedDBRuntime,
1248    frame: &dyn super::statement_frame::ReadFrame,
1249    expr: &crate::storage::query::ast::QueryExpr,
1250    out: &mut Vec<crate::storage::query::ast::Filter>,
1251) -> bool {
1252    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1253    match expr {
1254        QueryExpr::Table(t) => {
1255            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1256                return true;
1257            }
1258            let role = frame.identity().map(|(_, role)| role);
1259            let role_str = role.map(|r| r.as_str().to_string());
1260            let policies =
1261                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1262            if policies.is_empty() {
1263                return false;
1264            }
1265            let combined = policies
1266                .into_iter()
1267                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1268                .expect("policies non-empty");
1269            out.push(combined);
1270            true
1271        }
1272        QueryExpr::Join(inner) => {
1273            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1274                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1275        }
1276        _ => true,
1277    }
1278}
1279
/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
///
/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
/// applies `WHERE` / `LIMIT` / `OFFSET` after the wrapper materialises
/// all rows. NOTE(review): no `ORDER BY` is applied here even though the
/// earlier doc claimed it — confirm ordering is handled by the caller or
/// accepted as wrapper order. Projections are best-effort — an explicit
/// column list trims `result.columns` only (record fields themselves are
/// kept verbatim); a `SELECT *` keeps every wrapper-emitted field.
///
/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
/// the runtime will pass the compiled filter down instead of post-filtering.
fn apply_foreign_table_filters(
    records: Vec<crate::storage::query::unified::UnifiedRecord>,
    query: &crate::storage::query::ast::TableQuery,
) -> crate::storage::query::unified::UnifiedResult {
    use crate::storage::query::sql_lowering::{
        effective_table_filter, effective_table_projections,
    };
    use crate::storage::query::unified::UnifiedResult;

    let filter = effective_table_filter(query);
    let projections = effective_table_projections(query);

    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
    // match native-collection queries (same operators, same NULL handling).
    let mut filtered: Vec<_> = records
        .into_iter()
        .filter(|record| match &filter {
            Some(f) => {
                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
            }
            None => true,
        })
        .collect();

    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
    if let Some(offset) = query.offset {
        let offset = offset as usize;
        if offset >= filtered.len() {
            filtered.clear();
        } else {
            filtered.drain(0..offset);
        }
    }
    if let Some(limit) = query.limit {
        filtered.truncate(limit as usize);
    }

    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
    // the wrapper's column set; an explicit list trims to those names.
    let columns: Vec<String> = if projections.is_empty() {
        filtered
            .first()
            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
            .unwrap_or_default()
    } else {
        projections
            .iter()
            .map(super::join_filter::projection_name)
            .collect()
    };

    let mut result = UnifiedResult::empty();
    result.columns = columns;
    result.records = filtered;
    result
}
1346
1347/// Collect every concrete table reference inside a `QueryExpr`.
1348///
1349/// Used by view bookkeeping (dependency tracking for materialised
1350/// invalidation) and any other rewriter that needs to know the base
1351/// tables a query pulls from. Does not descend into projections/filters;
1352/// only the `FROM` side.
1353pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1354    let mut scopes: HashSet<String> = HashSet::new();
1355    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1356    scopes.into_iter().collect()
1357}
1358
1359fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1360    let mut scopes = HashSet::new();
1361    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1362    scopes
1363}
1364
// Runtime result-cache tuning knobs.
/// Config key that selects the result-cache backend implementation.
const RESULT_CACHE_BACKEND_KEY: &str = "runtime.result_cache.backend";
/// Backend name used when the config key is unset.
const RESULT_CACHE_DEFAULT_BACKEND: &str = "legacy";
/// Namespace under which blob-cache payloads are stored.
const RESULT_CACHE_BLOB_NAMESPACE: &str = "runtime.result_cache";
/// Time-to-live of a cached result entry, in seconds.
const RESULT_CACHE_TTL_SECS: u64 = 30;
/// Upper bound on cache entries before `trim_result_cache` evicts the oldest.
const RESULT_CACHE_MAX_ENTRIES: usize = 1000;
/// Magic prefix identifying (and versioning) serialized cache payloads.
const RESULT_CACHE_PAYLOAD_MAGIC: &[u8; 8] = b"RDRC0001";
1371
/// Which storage strategy backs the runtime result cache — likely
/// selected via `RESULT_CACHE_BACKEND_KEY` (default "legacy");
/// TODO confirm against the config-resolution site.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RuntimeResultCacheBackend {
    // Original in-memory backend (the configured default name).
    Legacy,
    // Blob-cache-backed entries (see `RESULT_CACHE_BLOB_NAMESPACE`).
    BlobCache,
    // NOTE(review): presumably exercises both backends for comparison
    // — confirm at the dispatch site.
    Shadow,
}
1378
1379fn trim_result_cache(
1380    map: &mut HashMap<String, RuntimeResultCacheEntry>,
1381    order: &mut std::collections::VecDeque<String>,
1382) {
1383    while map.len() > RESULT_CACHE_MAX_ENTRIES {
1384        if let Some(oldest) = order.pop_front() {
1385            map.remove(&oldest);
1386        } else {
1387            break;
1388        }
1389    }
1390}
1391
1392fn result_cache_fingerprint(result: &RuntimeQueryResult) -> String {
1393    format!(
1394        "{:?}|{}|{}|{}|{}|{:?}",
1395        result.result,
1396        result.query,
1397        result.statement,
1398        result.engine,
1399        result.affected_rows,
1400        result.statement_type
1401    )
1402}
1403
1404fn mode_to_byte(mode: crate::storage::query::modes::QueryMode) -> u8 {
1405    match mode {
1406        crate::storage::query::modes::QueryMode::Sql => 0,
1407        crate::storage::query::modes::QueryMode::Gremlin => 1,
1408        crate::storage::query::modes::QueryMode::Cypher => 2,
1409        crate::storage::query::modes::QueryMode::Sparql => 3,
1410        crate::storage::query::modes::QueryMode::Path => 4,
1411        crate::storage::query::modes::QueryMode::Natural => 5,
1412        crate::storage::query::modes::QueryMode::Unknown => 255,
1413    }
1414}
1415
1416fn mode_from_byte(byte: u8) -> Option<crate::storage::query::modes::QueryMode> {
1417    match byte {
1418        0 => Some(crate::storage::query::modes::QueryMode::Sql),
1419        1 => Some(crate::storage::query::modes::QueryMode::Gremlin),
1420        2 => Some(crate::storage::query::modes::QueryMode::Cypher),
1421        3 => Some(crate::storage::query::modes::QueryMode::Sparql),
1422        4 => Some(crate::storage::query::modes::QueryMode::Path),
1423        5 => Some(crate::storage::query::modes::QueryMode::Natural),
1424        255 => Some(crate::storage::query::modes::QueryMode::Unknown),
1425        _ => None,
1426    }
1427}
1428
/// Map a known statement/engine/type label back to its `'static` form
/// so decoded cache entries can reuse interned strings. Unknown labels
/// return `None`, which `encode_result_cache_payload` treats as
/// "payload not cacheable".
fn result_cache_static_str(value: &str) -> Option<&'static str> {
    const KNOWN: &[&str] = &[
        "select",
        "materialized-graph",
        "runtime-red-schema",
        "runtime-fdw",
        "runtime-table-rls",
        "runtime-table",
        "runtime-join-rls",
        "runtime-join",
        "runtime-vector",
        "runtime-hybrid",
        "runtime-secret",
        "runtime-config",
        "runtime-tenant",
        "runtime-explain",
        "runtime-tree",
        "runtime-kv",
        "runtime-queue",
    ];
    KNOWN.iter().copied().find(|&known| known == value)
}
1451
/// Append `value` as a little-endian u32 (used as the length prefix of
/// every variable-size field). Returns `None` when the value does not
/// fit in 32 bits, aborting the whole encode.
fn write_u32(out: &mut Vec<u8>, value: usize) -> Option<()> {
    let narrowed: u32 = value.try_into().ok()?;
    out.extend_from_slice(&narrowed.to_le_bytes());
    Some(())
}
1457
1458fn write_string(out: &mut Vec<u8>, value: &str) -> Option<()> {
1459    write_u32(out, value.len())?;
1460    out.extend_from_slice(value.as_bytes());
1461    Some(())
1462}
1463
1464fn write_bytes(out: &mut Vec<u8>, value: &[u8]) -> Option<()> {
1465    write_u32(out, value.len())?;
1466    out.extend_from_slice(value);
1467    Some(())
1468}
1469
/// Pop one byte off the front of `input`, advancing the slice.
/// `None` when the slice is empty (input left untouched).
fn read_u8(input: &mut &[u8]) -> Option<u8> {
    let byte = *input.first()?;
    *input = &input[1..];
    Some(byte)
}
1475
/// Read a little-endian u32 from the front of `input` as `usize`,
/// advancing the slice. `None` (input untouched) when fewer than 4
/// bytes remain.
fn read_u32(input: &mut &[u8]) -> Option<usize> {
    let bytes: [u8; 4] = input.get(..4)?.try_into().ok()?;
    *input = &input[4..];
    Some(u32::from_le_bytes(bytes) as usize)
}
1484
/// Read a little-endian u64 from the front of `input`, advancing the
/// slice. `None` (input untouched) when fewer than 8 bytes remain.
fn read_u64(input: &mut &[u8]) -> Option<u64> {
    let bytes: [u8; 8] = input.get(..8)?.try_into().ok()?;
    *input = &input[8..];
    Some(u64::from_le_bytes(bytes))
}
1493
1494fn read_string(input: &mut &[u8]) -> Option<String> {
1495    let len = read_u32(input)?;
1496    if input.len() < len {
1497        return None;
1498    }
1499    let value = String::from_utf8(input[..len].to_vec()).ok()?;
1500    *input = &input[len..];
1501    Some(value)
1502}
1503
1504fn read_bytes<'a>(input: &mut &'a [u8]) -> Option<&'a [u8]> {
1505    let len = read_u32(input)?;
1506    if input.len() < len {
1507        return None;
1508    }
1509    let value = &input[..len];
1510    *input = &input[len..];
1511    Some(value)
1512}
1513
/// Serialize a result-cache entry into the versioned binary payload
/// (magic `RDRC0001`, length-prefixed fields via the `write_*` helpers).
/// Returns `None` — meaning "don't cache via this backend" — when the
/// result carries pre-serialized JSON, uses statement/engine/type labels
/// outside the interned set, or contains graph/path/vector record data
/// that this flat codec does not encode. `decode_result_cache_payload`
/// is the inverse.
fn encode_result_cache_payload(entry: &RuntimeResultCacheEntry) -> Option<Vec<u8>> {
    let result = &entry.result;
    // Reject shapes the flat field codec cannot round-trip.
    if result.result.pre_serialized_json.is_some()
        || result_cache_static_str(result.statement).is_none()
        || result_cache_static_str(result.engine).is_none()
        || result_cache_static_str(result.statement_type).is_none()
        || result.result.records.iter().any(|record| {
            !record.nodes.is_empty()
                || !record.edges.is_empty()
                || !record.paths.is_empty()
                || !record.vector_results.is_empty()
        })
    {
        return None;
    }

    let mut out = Vec::new();
    out.extend_from_slice(RESULT_CACHE_PAYLOAD_MAGIC);
    // Header: query text, mode byte, interned labels, affected rows.
    write_string(&mut out, &result.query)?;
    out.push(mode_to_byte(result.mode));
    write_string(&mut out, result.statement)?;
    write_string(&mut out, result.engine)?;
    out.extend_from_slice(&result.affected_rows.to_le_bytes());
    write_string(&mut out, result.statement_type)?;

    // Column names followed by the scan/exec statistics.
    write_u32(&mut out, result.result.columns.len())?;
    for column in &result.result.columns {
        write_string(&mut out, column)?;
    }
    out.extend_from_slice(&result.result.stats.nodes_scanned.to_le_bytes());
    out.extend_from_slice(&result.result.stats.edges_scanned.to_le_bytes());
    out.extend_from_slice(&result.result.stats.rows_scanned.to_le_bytes());
    out.extend_from_slice(&result.result.stats.exec_time_us.to_le_bytes());

    // Records: field count, then (name, value-codec bytes) pairs.
    write_u32(&mut out, result.result.records.len())?;
    for record in &result.result.records {
        let fields = record.iter_fields().collect::<Vec<_>>();
        write_u32(&mut out, fields.len())?;
        for (name, value) in fields {
            write_string(&mut out, name)?;
            let mut encoded = Vec::new();
            crate::storage::schema::value_codec::encode(value, &mut encoded);
            write_bytes(&mut out, &encoded)?;
        }
    }

    // Trailer: the invalidation scopes associated with this entry.
    write_u32(&mut out, entry.scopes.len())?;
    for scope in &entry.scopes {
        write_string(&mut out, scope)?;
    }
    Some(out)
}
1566
/// Decode the binary payload produced by `encode_result_cache_payload`
/// back into a `RuntimeQueryResult` plus its invalidation scopes.
///
/// Returns `None` on any structural mismatch: wrong/missing magic
/// prefix, truncated fields, an unknown mode byte, statement strings
/// that don't intern back to a `&'static str`, value-codec bytes that
/// don't consume their field exactly, or trailing bytes at the end.
fn decode_result_cache_payload(mut input: &[u8]) -> Option<(RuntimeQueryResult, HashSet<String>)> {
    // Magic prefix guards against stale/foreign blobs in the cache.
    if input.len() < RESULT_CACHE_PAYLOAD_MAGIC.len()
        || &input[..RESULT_CACHE_PAYLOAD_MAGIC.len()] != RESULT_CACHE_PAYLOAD_MAGIC
    {
        return None;
    }
    input = &input[RESULT_CACHE_PAYLOAD_MAGIC.len()..];

    // Field order mirrors the write order in
    // `encode_result_cache_payload`.
    let query = read_string(&mut input)?;
    let mode = mode_from_byte(read_u8(&mut input)?)?;
    let statement = result_cache_static_str(&read_string(&mut input)?)?;
    let engine = result_cache_static_str(&read_string(&mut input)?)?;
    let affected_rows = read_u64(&mut input)?;
    let statement_type = result_cache_static_str(&read_string(&mut input)?)?;

    let mut columns = Vec::new();
    for _ in 0..read_u32(&mut input)? {
        columns.push(read_string(&mut input)?);
    }
    let stats = crate::storage::query::unified::QueryStats {
        nodes_scanned: read_u64(&mut input)?,
        edges_scanned: read_u64(&mut input)?,
        rows_scanned: read_u64(&mut input)?,
        exec_time_us: read_u64(&mut input)?,
    };

    let mut records = Vec::new();
    for _ in 0..read_u32(&mut input)? {
        let mut record = crate::storage::query::unified::UnifiedRecord::new();
        for _ in 0..read_u32(&mut input)? {
            let name = read_string(&mut input)?;
            let bytes = read_bytes(&mut input)?;
            let (value, used) = crate::storage::schema::value_codec::decode(bytes).ok()?;
            // The codec must consume the field's byte span exactly;
            // leftovers mean a corrupt or mis-framed payload.
            if used != bytes.len() {
                return None;
            }
            record.set_owned(name, value);
        }
        records.push(record);
    }

    let mut scopes = HashSet::new();
    for _ in 0..read_u32(&mut input)? {
        scopes.insert(read_string(&mut input)?);
    }
    // Trailing bytes mean a different (newer/older) format — treat
    // the blob as a miss rather than guess.
    if !input.is_empty() {
        return None;
    }

    Some((
        RuntimeQueryResult {
            query,
            mode,
            statement,
            engine,
            result: crate::storage::query::unified::UnifiedResult {
                columns,
                records,
                stats,
                pre_serialized_json: None,
            },
            affected_rows,
            statement_type,
        },
        scopes,
    ))
}
1634
/// If `sql` starts with `EXPLAIN` followed by a non-`ALTER` token,
/// return the trimmed inner statement; otherwise `None`.
///
/// `EXPLAIN ALTER FOR CREATE TABLE ...` is a separate schema-diff
/// command handled inside the normal SQL parser, so we leave it
/// alone here.
///
/// (A doc paragraph about volatile built-ins previously sat here by
/// accident — it described `query_has_volatile_builtin`, not this
/// function, and has been removed from this item's docs.)
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    let trimmed = sql.trim_start();
    // First whitespace-delimited token; `trimmed.len()` covers the
    // single-token case (`EXPLAIN` with nothing after it).
    let head_end = trimmed
        .find(|c: char| c.is_whitespace())
        .unwrap_or(trimmed.len());
    let (head, rest) = trimmed.split_at(head_end);
    if !head.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }
    let rest = rest.trim_start();
    if rest.is_empty() {
        // Bare `EXPLAIN` — nothing to strip.
        return None;
    }
    // Peek the next token — if ALTER, defer to the existing
    // EXPLAIN ALTER FOR path.
    let next_head_end = rest.find(|c: char| c.is_whitespace()).unwrap_or(rest.len());
    if rest[..next_head_end].eq_ignore_ascii_case("ALTER") {
        return None;
    }
    Some(rest)
}
1670
1671/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1672/// CTE-aware parse in `execute_query` without paying for a full
1673/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1674/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1675pub(super) fn has_with_prefix(sql: &str) -> bool {
1676    let trimmed = sql.trim_start();
1677    let head_end = trimmed
1678        .find(|c: char| c.is_whitespace() || c == '(')
1679        .unwrap_or(trimmed.len());
1680    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1681}
1682
1683/// If the query is a plain SELECT whose top-level `TableQuery`
1684/// carries an `AS OF` clause, return a typed spec that the runtime
1685/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1686/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1687/// back to the connection's regular MVCC snapshot. A cheap textual
1688/// prefilter skips the parse entirely when the source doesn't
1689/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
1690fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1691    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1692}
1693
/// Same as `peek_top_level_as_of` but also returns the table name
/// targeted by the AS OF clause (when the FROM clause names a
/// concrete table). `None` for the table slot means scalar SELECT
/// or a subquery source — callers treat those as "no enforcement".
///
/// NOTE(review): the byte-window prefilter below matches only the
/// literal five bytes `as of` (single ASCII space, any case).
/// `AS  OF`, `AS\tOF`, or a newline between the keywords would skip
/// the parse entirely even if the lexer accepts them — confirm
/// against the parser's whitespace handling.
pub(super) fn peek_top_level_as_of_with_table(
    sql: &str,
) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
    // Cheap textual prefilter: avoid the full parse when the bytes
    // `as of` never appear, keeping the autocommit hot path free.
    if !sql
        .as_bytes()
        .windows(5)
        .any(|w| w.eq_ignore_ascii_case(b"as of"))
    {
        return None;
    }
    let parsed = crate::storage::query::parser::parse(sql).ok()?;
    // Only a plain top-level table query qualifies; joins and other
    // expression shapes fall through to `None`.
    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
        return None;
    };
    let clause = table.as_of?;
    // Empty / "any" table slots carry no concrete table name for
    // the caller to enforce against.
    let table_name = if table.table.is_empty() || table.table == "any" {
        None
    } else {
        Some(table.table.clone())
    };
    // 1:1 mapping from the parser's AS OF clause to the VCS spec.
    let spec = match clause {
        crate::storage::query::ast::AsOfClause::Commit(h) => {
            crate::application::vcs::AsOfSpec::Commit(h)
        }
        crate::storage::query::ast::AsOfClause::Branch(b) => {
            crate::application::vcs::AsOfSpec::Branch(b)
        }
        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
            crate::application::vcs::AsOfSpec::TimestampMs(ts)
        }
        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
            crate::application::vcs::AsOfSpec::Snapshot(x)
        }
    };
    Some((spec, table_name))
}
1735
1736pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
1737    // Lowercase the bytes up to the first null/newline into a small
1738    // stack buffer for cheap contains() checks. Most SQL fits in the
1739    // buffer; longer queries fall back to owned lowercase.
1740    const VOLATILE_TOKENS: &[&str] = &[
1741        "pg_advisory_lock",
1742        "pg_try_advisory_lock",
1743        "pg_advisory_unlock",
1744        "random()",
1745        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
1746        // omitted for now — they ARE volatile but today's tests rely
1747        // on caching them. Revisit once a tighter volatility story
1748        // lands.
1749    ];
1750    let lowered = sql.to_ascii_lowercase();
1751    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
1752}
1753
/// Pick the `(global_mode, collection_mode)` pair for an expression,
/// or `None` for variants that opt out of intent-locking entirely
/// (admin statements like `SHOW CONFIG`, transaction control, tenant
/// toggles).
///
/// Phase-1 contract:
/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
pub(super) fn intent_lock_modes_for(
    expr: &QueryExpr,
) -> Option<(
    crate::storage::transaction::lock::LockMode,
    crate::storage::transaction::lock::LockMode,
)> {
    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};

    match expr {
        // Reads — IS / IS.
        QueryExpr::Table(_)
        | QueryExpr::Join(_)
        | QueryExpr::Vector(_)
        | QueryExpr::Hybrid(_)
        | QueryExpr::Graph(_)
        | QueryExpr::Path(_)
        | QueryExpr::Ask(_)
        | QueryExpr::SearchCommand(_)
        | QueryExpr::GraphCommand(_)
        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),

        // Writes — IX / IX. Non-tabular mutations (vector insert,
        // graph node insert, queue push, timeseries point insert)
        // don't carry their own dispatch arm here; they ride through
        // the Insert variant or a command variant covered by the
        // read-side arm above. P1.T4 expands only the TableQuery-ish
        // writes; non-tabular kinds inherit when their DML variants
        // land in later phases.
        QueryExpr::Insert(_)
        | QueryExpr::Update(_)
        | QueryExpr::Delete(_)
        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
            Some((IntentExclusive, IntentExclusive))
        }
        // Queue commands other than Move take the read-side IS pair.
        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),

        // DDL — IX / X. A DDL against collection `c` blocks all
        // other writers + readers on `c` but leaves other collections
        // running (because Global stays IX, not X).
        QueryExpr::CreateTable(_)
        | QueryExpr::DropTable(_)
        | QueryExpr::DropGraph(_)
        | QueryExpr::DropVector(_)
        | QueryExpr::DropDocument(_)
        | QueryExpr::DropKv(_)
        | QueryExpr::DropCollection(_)
        | QueryExpr::Truncate(_)
        | QueryExpr::AlterTable(_)
        | QueryExpr::CreateIndex(_)
        | QueryExpr::DropIndex(_)
        | QueryExpr::CreateTimeSeries(_)
        | QueryExpr::DropTimeSeries(_)
        | QueryExpr::CreateQueue(_)
        | QueryExpr::AlterQueue(_)
        | QueryExpr::DropQueue(_)
        | QueryExpr::CreateTree(_)
        | QueryExpr::DropTree(_)
        | QueryExpr::CreatePolicy(_)
        | QueryExpr::DropPolicy(_)
        | QueryExpr::CreateView(_)
        | QueryExpr::DropView(_)
        | QueryExpr::RefreshMaterializedView(_)
        | QueryExpr::CreateSchema(_)
        | QueryExpr::DropSchema(_)
        | QueryExpr::CreateSequence(_)
        | QueryExpr::DropSequence(_)
        | QueryExpr::CreateServer(_)
        | QueryExpr::DropServer(_)
        | QueryExpr::CreateForeignTable(_)
        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),

        // Admin / control — skip intent locks. `SET TENANT`,
        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
        // `VACUUM`, etc. don't touch collection data the same way
        // and the existing transaction layer already serialises the
        // pieces that matter.
        _ => None,
    }
}
1842
1843/// Best-effort collection inventory for an expression. Used to pick
1844/// `Collection(...)` resources for the intent-lock guard. Overshoots
1845/// are fine (take an extra IS, benign); undershoots leak writes past
1846/// DDL X locks, so err on the side of listing more names.
1847pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
1848    let mut out = Vec::new();
1849    walk_collections(expr, &mut out);
1850    out.sort();
1851    out.dedup();
1852    out
1853}
1854
/// Recursive helper for `collections_referenced`: appends every
/// collection name the expression mentions to `out`. Duplicates are
/// allowed — the caller sorts and dedups.
fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
    match expr {
        QueryExpr::Table(t) => out.push(t.table.clone()),
        // Joins recurse into both sides to pick up every table.
        QueryExpr::Join(j) => {
            walk_collections(&j.left, out);
            walk_collections(&j.right, out);
        }
        QueryExpr::Insert(i) => out.push(i.table.clone()),
        QueryExpr::Update(u) => out.push(u.table.clone()),
        QueryExpr::Delete(d) => out.push(d.table.clone()),
        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),

        // DDL — include the target collection so DDL takes
        // `(Collection, X)` and blocks concurrent readers / writers
        // on the same collection. Other collections stay live
        // because Global is still IX.
        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
        QueryExpr::DropTable(q) => out.push(q.name.clone()),
        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
        QueryExpr::DropVector(q) => out.push(q.name.clone()),
        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
        QueryExpr::DropKv(q) => out.push(q.name.clone()),
        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
        QueryExpr::Truncate(q) => out.push(q.name.clone()),
        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
        // Queue Move touches two queues — both need locks.
        QueryExpr::QueueCommand(QueueCommand::Move {
            source,
            destination,
            ..
        }) => {
            out.push(source.clone());
            out.push(destination.clone());
        }
        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
        QueryExpr::CreateView(q) => out.push(q.name.clone()),
        QueryExpr::DropView(q) => out.push(q.name.clone()),
        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),

        // Vector / Hybrid / Graph / Path / commands reference
        // collections through fields whose shape varies; without a
        // uniform accessor we fall back to the global lock only —
        // benign because every runtime path still holds the global
        // mode.
        _ => {}
    }
}
1908
1909impl RedDBRuntime {
1910    pub fn in_memory() -> RedDBResult<Self> {
1911        Self::with_options(RedDBOptions::in_memory())
1912    }
1913
1914    /// Handle to the intent-lock manager for tests + introspection.
1915    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
1916    /// rather than touching the manager directly.
1917    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
1918        self.inner.lock_manager.clone()
1919    }
1920
1921    #[inline(never)]
1922    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
1923        Self::with_pool(options, ConnectionPoolConfig::default())
1924    }
1925
1926    pub fn with_pool(
1927        options: RedDBOptions,
1928        pool_config: ConnectionPoolConfig,
1929    ) -> RedDBResult<Self> {
1930        // PLAN.md Phase 9.1 — capture wall-clock before storage
1931        // open so the cold-start phase markers can be backfilled
1932        // once Lifecycle is constructed below. Storage open
1933        // encapsulates auto-restore + WAL replay; we treat the
1934        // whole window as one combined "restore" + "wal_replay"
1935        // phase split at the same boundary because the storage
1936        // layer doesn't yet emit a finer signal.
1937        let boot_open_start_ms = std::time::SystemTime::now()
1938            .duration_since(std::time::UNIX_EPOCH)
1939            .map(|d| d.as_millis() as u64)
1940            .unwrap_or(0);
1941        let db = Arc::new(
1942            RedDB::open_with_options(&options)
1943                .map_err(|err| RedDBError::Internal(err.to_string()))?,
1944        );
1945        let result_blob_cache = crate::storage::cache::BlobCache::open_with_l2(
1946            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
1947                options
1948                    .resolved_path("data.rdb")
1949                    .with_extension("result-cache.l2"),
1950            ),
1951        )
1952        .map_err(|err| {
1953            RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}"))
1954        })?;
1955        let storage_ready_ms = std::time::SystemTime::now()
1956            .duration_since(std::time::UNIX_EPOCH)
1957            .map(|d| d.as_millis() as u64)
1958            .unwrap_or(0);
1959
1960        let runtime = Self {
1961            inner: Arc::new(RuntimeInner {
1962                db,
1963                layout: PhysicalLayout::from_options(&options),
1964                indices: IndexCatalog::register_default_vector_graph(
1965                    options.has_capability(crate::api::Capability::Table),
1966                    options.has_capability(crate::api::Capability::Graph),
1967                ),
1968                pool_config,
1969                pool: Mutex::new(PoolState::default()),
1970                started_at_unix_ms: SystemTime::now()
1971                    .duration_since(UNIX_EPOCH)
1972                    .unwrap_or_default()
1973                    .as_millis(),
1974                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
1975                index_store: super::index_store::IndexStore::new(),
1976                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
1977                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
1978                query_cache: parking_lot::RwLock::new(
1979                    crate::storage::query::planner::cache::PlanCache::new(1000),
1980                ),
1981                result_cache: parking_lot::RwLock::new((
1982                    HashMap::new(),
1983                    std::collections::VecDeque::new(),
1984                )),
1985                result_blob_cache,
1986                result_blob_entries: parking_lot::RwLock::new((
1987                    HashMap::new(),
1988                    std::collections::VecDeque::new(),
1989                )),
1990                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
1991                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
1992                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
1993                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
1994                ec_worker: crate::ec::worker::EcWorker::new(),
1995                auth_store: parking_lot::RwLock::new(None),
1996                oauth_validator: parking_lot::RwLock::new(None),
1997                views: parking_lot::RwLock::new(HashMap::new()),
1998                materialized_views: parking_lot::RwLock::new(
1999                    crate::storage::cache::result::MaterializedViewCache::new(),
2000                ),
2001                snapshot_manager: Arc::new(
2002                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2003                ),
2004                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2005                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2006                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2007                lock_manager: Arc::new({
2008                    // Sourced from the matrix: Tier B key
2009                    // `concurrency.locking.deadlock_timeout_ms`
2010                    // (default 5000). Env var wins at boot so
2011                    // operators can tune without touching red_config.
2012                    let env = crate::runtime::config_overlay::collect_env_overrides();
2013                    let timeout_ms = env
2014                        .get("concurrency.locking.deadlock_timeout_ms")
2015                        .and_then(|raw| raw.parse::<u64>().ok())
2016                        .unwrap_or_else(|| {
2017                            match crate::runtime::config_matrix::default_for(
2018                                "concurrency.locking.deadlock_timeout_ms",
2019                            ) {
2020                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2021                                _ => 5000,
2022                            }
2023                        });
2024                    let cfg = crate::storage::transaction::lock::LockConfig {
2025                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2026                        ..Default::default()
2027                    };
2028                    crate::storage::transaction::lock::LockManager::new(cfg)
2029                }),
2030                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2031                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2032                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2033                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2034                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2035                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2036                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2037                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2038                    &options,
2039                )),
2040                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2041                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2042                audit_log: {
2043                    // Default audit-log path for the in-memory case
2044                    // sits in the system temp dir; persistent runs
2045                    // place it next to data.rdb.
2046                    let data_path = options
2047                        .data_path
2048                        .clone()
2049                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2050                    Arc::new(crate::runtime::audit_log::AuditLogger::for_data_path(
2051                        &data_path,
2052                    ))
2053                },
2054                lease_lifecycle: std::sync::OnceLock::new(),
2055                replica_apply_metrics: crate::replication::logical::ReplicaApplyMetrics::default(),
2056                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2057                schema_vocabulary: parking_lot::RwLock::new(
2058                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2059                ),
2060                slow_query_logger: {
2061                    // Issue #205 — slow-query sink lives in the same
2062                    // directory the audit log uses, so backup/restore
2063                    // ships them together. Threshold + sample-pct
2064                    // default conservatively (1 s, 100% sampling) so
2065                    // emitted lines are rare and complete. Operators
2066                    // tune via env / config matrix in a follow-up.
2067                    //
2068                    // `data_path` points at the primary `.rdb` *file*
2069                    // (mirrors AuditLogger::for_data_path), so we
2070                    // anchor the slow log at its parent directory.
2071                    let log_dir = options
2072                        .data_path
2073                        .as_ref()
2074                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2075                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2076                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2077                        .ok()
2078                        .and_then(|s| s.parse::<u64>().ok())
2079                        .unwrap_or(1000);
2080                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2081                        .ok()
2082                        .and_then(|s| s.parse::<u8>().ok())
2083                        .unwrap_or(100);
2084                    crate::telemetry::slow_query_logger::SlowQueryLogger::new(
2085                        crate::telemetry::slow_query_logger::SlowQueryOpts {
2086                            log_dir,
2087                            threshold_ms,
2088                            sample_pct,
2089                        },
2090                    )
2091                },
2092                kv_stats: crate::runtime::KvStatsCounters::default(),
2093                kv_tag_index: crate::runtime::KvTagIndex::default(),
2094            }),
2095        };
2096
2097        // Issue #205 — install the process-wide OperatorEvent sink so
2098        // emit sites buried in storage / replication / signal handlers
2099        // can record without threading an `&AuditLogger` through every
2100        // call stack. First registration wins; subsequent in-memory
2101        // runtimes (test harnesses) fall through to tracing+eprintln.
2102        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2103            &runtime.inner.audit_log,
2104        ));
2105
2106        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2107        // from the wall-clock captured before storage open. The
2108        // entire `RedDB::open_with_options` call covers both
2109        // auto-restore (when configured) and WAL replay. We
2110        // record both phases against the same boundary today;
2111        // a follow-up will split them once the storage layer
2112        // surfaces a finer-grained event.
2113        runtime
2114            .inner
2115            .lifecycle
2116            .set_restore_started_at_ms(boot_open_start_ms);
2117        runtime
2118            .inner
2119            .lifecycle
2120            .set_restore_ready_at_ms(storage_ready_ms);
2121        runtime
2122            .inner
2123            .lifecycle
2124            .set_wal_replay_started_at_ms(boot_open_start_ms);
2125        runtime
2126            .inner
2127            .lifecycle
2128            .set_wal_replay_ready_at_ms(storage_ready_ms);
2129
2130        let restored_cdc_lsn = runtime
2131            .inner
2132            .db
2133            .replication
2134            .as_ref()
2135            .map(|repl| {
2136                repl.logical_wal_spool
2137                    .as_ref()
2138                    .map(|spool| spool.current_lsn())
2139                    .unwrap_or(0)
2140            })
2141            .unwrap_or(0)
2142            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2143        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2144        runtime.bootstrap_system_keyed_collections()?;
2145
2146        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2147        // tables declared via `TENANT BY (col)` survive restart. Each
2148        // entry re-registers the auto-policy and flips RLS on again.
2149        runtime.rehydrate_tenant_tables();
2150        if let Some(repl) = &runtime.inner.db.replication {
2151            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2152        }
2153
2154        // Save system info to red_config on boot
2155        {
2156            let sys = SystemInfo::collect();
2157            runtime.inner.db.store().set_config_tree(
2158                "red.system",
2159                &crate::serde_json::json!({
2160                    "pid": sys.pid,
2161                    "cpu_cores": sys.cpu_cores,
2162                    "total_memory_bytes": sys.total_memory_bytes,
2163                    "available_memory_bytes": sys.available_memory_bytes,
2164                    "os": sys.os,
2165                    "arch": sys.arch,
2166                    "hostname": sys.hostname,
2167                    "started_at": SystemTime::now()
2168                        .duration_since(UNIX_EPOCH)
2169                        .unwrap_or_default()
2170                        .as_millis() as u64
2171                }),
2172            );
2173
2174            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2175            let store = runtime.inner.db.store();
2176            if store
2177                .get_collection("red_config")
2178                .map(|m| m.query_all(|_| true).len())
2179                .unwrap_or(0)
2180                <= 10
2181            {
2182                store.set_config_tree("red.ai", &crate::json!({
2183                    "default": crate::json!({
2184                        "provider": "openai",
2185                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2186                    }),
2187                    "max_embedding_inputs": 256,
2188                    "max_prompt_batch": 256,
2189                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2190                }));
2191                store.set_config_tree(
2192                    "red.server",
2193                    &crate::json!({
2194                        "max_scan_limit": 1000,
2195                        "max_body_size": 1048576,
2196                        "read_timeout_ms": 5000,
2197                        "write_timeout_ms": 5000
2198                    }),
2199                );
2200                store.set_config_tree(
2201                    "red.storage",
2202                    &crate::json!({
2203                        "page_size": 4096,
2204                        "page_cache_capacity": 100000,
2205                        "auto_checkpoint_pages": 1000,
2206                        "snapshot_retention": 16,
2207                        "verify_checksums": true,
2208                        "segment": crate::json!({
2209                            "max_entities": 100000,
2210                            "max_bytes": 268435456_u64,
2211                            "compression_level": 6
2212                        }),
2213                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2214                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2215                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2216                    }),
2217                );
2218                store.set_config_tree(
2219                    "red.search",
2220                    &crate::json!({
2221                        "rag": crate::json!({
2222                            "max_chunks_per_source": 10,
2223                            "max_total_chunks": 25,
2224                            "similarity_threshold": 0.8,
2225                            "graph_depth": 2,
2226                            "min_relevance": 0.3
2227                        }),
2228                        "fusion": crate::json!({
2229                            "vector_weight": 0.5,
2230                            "graph_weight": 0.3,
2231                            "table_weight": 0.2,
2232                            "dedup_threshold": 0.85
2233                        })
2234                    }),
2235                );
2236                store.set_config_tree(
2237                    "red.auth",
2238                    &crate::json!({
2239                        "enabled": false,
2240                        "session_ttl_secs": 3600,
2241                        "require_auth": false
2242                    }),
2243                );
2244                store.set_config_tree(
2245                    "red.query",
2246                    &crate::json!({
2247                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2248                        "max_recursion_depth": 1000
2249                    }),
2250                );
2251                store.set_config_tree(
2252                    "red.indexes",
2253                    &crate::json!({
2254                        "auto_select": true,
2255                        "bloom_filter": crate::json!({
2256                            "enabled": true,
2257                            "false_positive_rate": 0.01,
2258                            "prune_on_scan": true
2259                        }),
2260                        "hash": crate::json!({ "enabled": true }),
2261                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2262                        "spatial": crate::json!({ "enabled": true })
2263                    }),
2264                );
2265                store.set_config_tree(
2266                    "red.memtable",
2267                    &crate::json!({
2268                        "enabled": true,
2269                        "max_bytes": 67108864_u64,
2270                        "flush_threshold": 0.75
2271                    }),
2272                );
2273                store.set_config_tree(
2274                    "red.probabilistic",
2275                    &crate::json!({
2276                        "hll_registers": 16384,
2277                        "sketch_default_width": 1000,
2278                        "sketch_default_depth": 5,
2279                        "filter_default_capacity": 100000
2280                    }),
2281                );
2282                store.set_config_tree(
2283                    "red.timeseries",
2284                    &crate::json!({
2285                        "default_chunk_size": 1024,
2286                        "compression": crate::json!({
2287                            "timestamps": "delta_of_delta",
2288                            "values": "gorilla_xor"
2289                        }),
2290                        "default_retention_days": 0
2291                    }),
2292                );
2293                store.set_config_tree(
2294                    "red.queue",
2295                    &crate::json!({
2296                        "default_max_size": 0,
2297                        "default_max_attempts": 3,
2298                        "visibility_timeout_ms": 30000,
2299                        "consumer_idle_timeout_ms": 60000
2300                    }),
2301                );
2302                store.set_config_tree(
2303                    "red.backup",
2304                    &crate::json!({
2305                        "enabled": false,
2306                        "interval_secs": 3600,
2307                        "retention_count": 24,
2308                        "upload": false,
2309                        "backend": "local"
2310                    }),
2311                );
2312                store.set_config_tree(
2313                    "red.wal",
2314                    &crate::json!({
2315                        "archive": crate::json!({
2316                            "enabled": false,
2317                            "retention_hours": 168,
2318                            "prefix": "wal/"
2319                        })
2320                    }),
2321                );
2322                store.set_config_tree(
2323                    "red.cdc",
2324                    &crate::json!({
2325                        "enabled": true,
2326                        "buffer_size": 100000
2327                    }),
2328                );
2329                store.set_config_tree(
2330                    "red.config.secret",
2331                    &crate::json!({
2332                        "auto_encrypt": true,
2333                        "auto_decrypt": true
2334                    }),
2335                );
2336            }
2337
2338            // Perf-parity config matrix: heal the Tier A (critical)
2339            // keys unconditionally on every boot. Idempotent — only
2340            // writes the default when the key is missing. Keeps
2341            // `SHOW CONFIG` showing every guarantee the operator has
2342            // (durability.mode, concurrency.locking.enabled, …) even
2343            // on long-running datadirs that predate the matrix.
2344            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2345
2346            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2347            // `storage.btree.lehman_yao` value from the matrix (env
2348            // > file > red_config > default) and publish it to the
2349            // storage layer's atomic so the B-tree read / split
2350            // paths can branch without re-reading the config on
2351            // every hot-path call.
2352            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2353            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2354            if lehman_yao {
2355                tracing::info!(
2356                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2357                );
2358            }
2359
2360            // Config file overlay — mounted `/etc/reddb/config.json`
2361            // (override path via REDDB_CONFIG_FILE). Writes keys with
2362            // write-if-absent semantics so a later user `SET CONFIG`
2363            // always wins. Missing file = silent no-op.
2364            let overlay_path = crate::runtime::config_overlay::config_file_path();
2365            let _ =
2366                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2367        }
2368
2369        // VCS ("Git for Data") — create the `red_*` metadata
2370        // collections on first boot. Idempotent: `get_or_create_collection`
2371        // is a no-op if the collection already exists.
2372        {
2373            let store = runtime.inner.db.store();
2374            for name in crate::application::vcs_collections::ALL {
2375                let _ = store.get_or_create_collection(*name);
2376            }
2377            // Seed VCS config namespace with sensible defaults on first
2378            // boot, matching the pattern used by red.ai / red.storage.
2379            store.set_config_tree(
2380                crate::application::vcs_collections::CONFIG_NAMESPACE,
2381                &crate::json!({
2382                    "default_branch": "main",
2383                    "author": crate::json!({
2384                        "name": "reddb",
2385                        "email": "reddb@localhost"
2386                    }),
2387                    "protected_branches": crate::json!(["main"]),
2388                    "closure": crate::json!({
2389                        "enabled": true,
2390                        "lazy": true
2391                    }),
2392                    "merge": crate::json!({
2393                        "default_strategy": "auto",
2394                        "fast_forward": true
2395                    })
2396                }),
2397            );
2398        }
2399
2400        // Migrations — create the `red_migrations` / `red_migration_deps`
2401        // system collections on first boot. Idempotent.
2402        {
2403            let store = runtime.inner.db.store();
2404            for name in crate::application::migration_collections::ALL {
2405                let _ = store.get_or_create_collection(*name);
2406            }
2407        }
2408
2409        // Start background maintenance thread (context index refresh +
2410        // session purge). Held by a WEAK reference to `RuntimeInner`
2411        // so dropping the last `RedDBRuntime` handle actually releases
2412        // the underlying Arc<Pager> (and its file lock). Polling at
2413        // 200ms means shutdown latency is bounded; the real 60-second
2414        // work cadence is tracked independently via a `last_work`
2415        // timestamp.
2416        //
2417        // The previous version captured `rt = runtime.clone()` by
2418        // strong reference and ran an unterminated `loop`, which held
2419        // Arc<RuntimeInner> forever — reopening a persistent database
2420        // in the same process failed with "Database is locked" because
2421        // the pager could never drop. See the regression test
2422        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2423        {
2424            let weak = Arc::downgrade(&runtime.inner);
2425            std::thread::Builder::new()
2426                .name("reddb-maintenance".into())
2427                .spawn(move || {
2428                    let tick = std::time::Duration::from_millis(200);
2429                    let work_interval = std::time::Duration::from_secs(60);
2430                    let mut last_work = std::time::Instant::now();
2431                    loop {
2432                        std::thread::sleep(tick);
2433                        let Some(inner) = weak.upgrade() else {
2434                            // All strong references dropped — the
2435                            // runtime is gone, exit cleanly.
2436                            break;
2437                        };
2438                        if last_work.elapsed() >= work_interval {
2439                            let _stats = inner.db.store().context_index().stats();
2440                            last_work = std::time::Instant::now();
2441                        }
2442                    }
2443                })
2444                .ok();
2445        }
2446
2447        // Start backup scheduler if enabled via red_config
2448        {
2449            let store = runtime.inner.db.store();
2450            let mut backup_enabled = false;
2451            let mut backup_interval = 3600u64;
2452
2453            if let Some(manager) = store.get_collection("red_config") {
2454                manager.for_each_entity(|entity| {
2455                    if let Some(row) = entity.data.as_row() {
2456                        let key = row.get_field("key").and_then(|v| match v {
2457                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2458                            _ => None,
2459                        });
2460                        let val = row.get_field("value");
2461                        if key == Some("red.config.backup.enabled") {
2462                            backup_enabled = match val {
2463                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2464                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2465                                _ => false,
2466                            };
2467                        } else if key == Some("red.config.backup.interval_secs") {
2468                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2469                                backup_interval = *n as u64;
2470                            }
2471                        }
2472                    }
2473                    true
2474                });
2475            }
2476
2477            if backup_enabled {
2478                runtime.inner.backup_scheduler.set_interval(backup_interval);
2479                let rt = runtime.clone();
2480                runtime
2481                    .inner
2482                    .backup_scheduler
2483                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
2484            }
2485        }
2486
2487        // Load EC registry from red_config and start worker
2488        {
2489            runtime
2490                .inner
2491                .ec_registry
2492                .load_from_config_store(runtime.inner.db.store().as_ref());
2493            if !runtime.inner.ec_registry.async_configs().is_empty() {
2494                runtime.inner.ec_worker.start(
2495                    Arc::clone(&runtime.inner.ec_registry),
2496                    Arc::clone(&runtime.inner.db.store()),
2497                );
2498            }
2499        }
2500
2501        if let crate::replication::ReplicationRole::Replica { primary_addr } =
2502            runtime.inner.db.options().replication.role.clone()
2503        {
2504            let rt = runtime.clone();
2505            std::thread::Builder::new()
2506                .name("reddb-replica".into())
2507                .spawn(move || rt.run_replica_loop(primary_addr))
2508                .ok();
2509        }
2510
2511        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
2512        // boot stage above has completed (WAL replay, restore-from-
2513        // remote, replica-loop spawn). Health probes flip from 503 to
2514        // 200 here; shutdown begins from this state.
2515        runtime.inner.lifecycle.mark_ready();
2516
2517        Ok(runtime)
2518    }
2519
2520    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
2521        let mut changed = false;
2522        for (name, model) in [
2523            ("red.config", crate::catalog::CollectionModel::Config),
2524            ("red.vault", crate::catalog::CollectionModel::Vault),
2525        ] {
2526            if self.inner.db.store().get_collection(name).is_none() {
2527                self.inner.db.store().get_or_create_collection(name);
2528                changed = true;
2529            }
2530            if self.inner.db.collection_contract(name).is_none() {
2531                self.inner
2532                    .db
2533                    .save_collection_contract(system_keyed_collection_contract(name, model))
2534                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
2535                changed = true;
2536            }
2537        }
2538        if changed {
2539            self.inner
2540                .db
2541                .persist_metadata()
2542                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2543        }
2544        Ok(())
2545    }
2546
2547    pub fn db(&self) -> Arc<RedDB> {
2548        Arc::clone(&self.inner.db)
2549    }
2550
    /// Direct, borrowed access to the runtime's secondary-index store.
    ///
    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
    /// wire bulk) that need to push new rows through the per-index
    /// maintenance hook after `store.bulk_insert` returns. The borrow
    /// is tied to `&self`, so no refcount bump is paid on this path.
    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
        &self.inner.index_store
    }
2558
2559    /// Apply a DDL event to the schema-vocabulary reverse index
2560    /// (issue #120). Called by DDL execution paths after the catalog
2561    /// mutation has succeeded so the index never holds entries for
2562    /// half-applied DDL.
2563    pub(crate) fn schema_vocabulary_apply(
2564        &self,
2565        event: crate::runtime::schema_vocabulary::DdlEvent,
2566    ) {
2567        self.inner.schema_vocabulary.write().on_ddl(event);
2568    }
2569
2570    /// Lookup `token` in the schema-vocabulary reverse index. Returns
2571    /// an owned `Vec<VocabHit>` because the underlying read lock
2572    /// cannot be borrowed across the call boundary; the slice from
2573    /// `SchemaVocabulary::lookup` is cloned per hit.
2574    pub fn schema_vocabulary_lookup(
2575        &self,
2576        token: &str,
2577    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
2578        self.inner.schema_vocabulary.read().lookup(token).to_vec()
2579    }
2580
2581    /// Inject an AuthStore into the runtime. Called by server boot
2582    /// after the vault has been bootstrapped, so that `Value::Secret`
2583    /// auto-encrypt/decrypt can reach the vault AES key.
2584    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
2585        *self.inner.auth_store.write() = Some(store);
2586    }
2587
2588    /// Read a vault KV secret from the configured AuthStore, if present.
2589    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
2590        self.inner
2591            .auth_store
2592            .read()
2593            .as_ref()
2594            .and_then(|store| store.vault_kv_get(key))
2595    }
2596
2597    /// Write a vault KV secret and fail if the encrypted vault write is
2598    /// unavailable or cannot be made durable.
2599    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
2600        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
2601            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
2602        })?;
2603        store
2604            .vault_kv_try_set(key, value)
2605            .map_err(|err| RedDBError::Query(err.to_string()))
2606    }
2607
2608    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
2609    /// wire transports try OAuth JWT validation before falling back to
2610    /// the local AuthStore lookup. Pass `None` to disable.
2611    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
2612        *self.inner.oauth_validator.write() = validator;
2613    }
2614
2615    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
2616    /// Hot path: called per HTTP request when an Authorization header
2617    /// is present, so we hand back a cheap Arc clone.
2618    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
2619        self.inner.oauth_validator.read().clone()
2620    }
2621
2622    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
2623    /// store is wired and a key has been generated. Used by the
2624    /// `Value::Secret` encrypt/decrypt pipeline.
2625    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
2626        let guard = self.inner.auth_store.read();
2627        guard.as_ref().and_then(|s| s.vault_secret_key())
2628    }
2629
2630    /// Resolve a boolean flag from `red_config`. Defaults to `default`
2631    /// when the key is missing or not coercible. If the same key has
2632    /// been written multiple times (SET CONFIG appends new rows), the
2633    /// most recent entity wins. Env-var overrides
2634    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
2635    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
2636        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2637            if let Some(crate::storage::schema::Value::Boolean(b)) =
2638                crate::runtime::config_overlay::coerce_env_value(key, raw)
2639            {
2640                return b;
2641            }
2642        }
2643        let store = self.inner.db.store();
2644        let Some(manager) = store.get_collection("red_config") else {
2645            return default;
2646        };
2647        let mut result = default;
2648        let mut latest_id: u64 = 0;
2649        manager.for_each_entity(|entity| {
2650            if let Some(row) = entity.data.as_row() {
2651                let entry_key = row.get_field("key").and_then(|v| match v {
2652                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2653                    _ => None,
2654                });
2655                if entry_key == Some(key) {
2656                    let id = entity.id.raw();
2657                    if id >= latest_id {
2658                        latest_id = id;
2659                        result = match row.get_field("value") {
2660                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
2661                            Some(crate::storage::schema::Value::Text(s)) => {
2662                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
2663                            }
2664                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
2665                            _ => default,
2666                        };
2667                    }
2668                }
2669            }
2670            true
2671        });
2672        result
2673    }
2674
2675    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
2676        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2677            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
2678                crate::runtime::config_overlay::coerce_env_value(key, raw)
2679            {
2680                return n;
2681            }
2682        }
2683        let store = self.inner.db.store();
2684        let Some(manager) = store.get_collection("red_config") else {
2685            return default;
2686        };
2687        let mut result = default;
2688        let mut latest_id: u64 = 0;
2689        manager.for_each_entity(|entity| {
2690            if let Some(row) = entity.data.as_row() {
2691                let entry_key = row.get_field("key").and_then(|v| match v {
2692                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2693                    _ => None,
2694                });
2695                if entry_key == Some(key) {
2696                    let id = entity.id.raw();
2697                    if id >= latest_id {
2698                        latest_id = id;
2699                        result = match row.get_field("value") {
2700                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
2701                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
2702                            Some(crate::storage::schema::Value::Text(s)) => {
2703                                s.parse::<u64>().unwrap_or(default)
2704                            }
2705                            _ => default,
2706                        };
2707                    }
2708                }
2709            }
2710            true
2711        });
2712        result
2713    }
2714
2715    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
2716        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2717            return raw.clone();
2718        }
2719        let store = self.inner.db.store();
2720        let Some(manager) = store.get_collection("red_config") else {
2721            return default.to_string();
2722        };
2723        let mut result = default.to_string();
2724        let mut latest_id: u64 = 0;
2725        manager.for_each_entity(|entity| {
2726            if let Some(row) = entity.data.as_row() {
2727                let entry_key = row.get_field("key").and_then(|v| match v {
2728                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2729                    _ => None,
2730                });
2731                if entry_key == Some(key) {
2732                    let id = entity.id.raw();
2733                    if id >= latest_id {
2734                        latest_id = id;
2735                        if let Some(crate::storage::schema::Value::Text(value)) =
2736                            row.get_field("value")
2737                        {
2738                            result = value.to_string();
2739                        }
2740                    }
2741                }
2742            }
2743            true
2744        });
2745        result
2746    }
2747
2748    fn latest_metadata_for(
2749        &self,
2750        collection: &str,
2751        entity_id: u64,
2752    ) -> Option<crate::serde_json::Value> {
2753        self.inner
2754            .db
2755            .store()
2756            .get_metadata(collection, EntityId::new(entity_id))
2757            .map(|metadata| metadata_to_json(&metadata))
2758    }
2759
2760    fn persist_replica_lsn(&self, lsn: u64) {
2761        self.inner.db.store().set_config_tree(
2762            "red.replication",
2763            &crate::json!({
2764                "last_applied_lsn": lsn
2765            }),
2766        );
2767    }
2768
2769    fn persist_replication_health(
2770        &self,
2771        state: &str,
2772        last_error: &str,
2773        primary_lsn: Option<u64>,
2774        oldest_available_lsn: Option<u64>,
2775    ) {
2776        self.inner.db.store().set_config_tree(
2777            "red.replication",
2778            &crate::json!({
2779                "state": state,
2780                "last_error": last_error,
2781                "last_seen_primary_lsn": primary_lsn.unwrap_or(0),
2782                "last_seen_oldest_lsn": oldest_available_lsn.unwrap_or(0),
2783                "updated_at_unix_ms": SystemTime::now()
2784                    .duration_since(UNIX_EPOCH)
2785                    .unwrap_or_default()
2786                    .as_millis() as u64
2787            }),
2788        );
2789    }
2790
    /// Whether `SECRET('...')` literals should be encrypted with the
    /// vault AES key on INSERT. Default `true`. Resolved via
    /// `config_bool`, so env overrides and the latest `red_config`
    /// row both apply.
    pub(crate) fn secret_auto_encrypt(&self) -> bool {
        self.config_bool("red.config.secret.auto_encrypt", true)
    }
2796
    /// Whether `Value::Secret` columns should be decrypted back to
    /// plaintext on SELECT when the vault is unsealed. Default `true`.
    /// Turning this off keeps secrets masked as `***` even while the
    /// vault is open — useful for audit trails or read-only exports.
    /// Resolved via `config_bool` (env override > latest config row).
    pub(crate) fn secret_auto_decrypt(&self) -> bool {
        self.config_bool("red.config.secret.auto_decrypt", true)
    }
2804
2805    /// Walk every record in `result` and swap `Value::Secret(bytes)`
2806    /// for the decrypted plaintext when the runtime has the vault
2807    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
2808    /// key is missing, the vault is sealed, or auto_decrypt is off,
2809    /// secrets are left as `Value::Secret` which every formatter
2810    /// (Display, JSON) already masks as `***`.
2811    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
2812        if !self.secret_auto_decrypt() {
2813            return;
2814        }
2815        let Some(key) = self.secret_aes_key() else {
2816            return;
2817        };
2818        for record in result.result.records.iter_mut() {
2819            for value in record.values_mut() {
2820                if let Value::Secret(ref bytes) = value {
2821                    if let Some(plain) =
2822                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
2823                    {
2824                        if let Ok(text) = String::from_utf8(plain) {
2825                            *value = Value::text(text);
2826                        }
2827                    }
2828                }
2829            }
2830        }
2831    }
2832
    /// Create a `MutationEngine` bound to this runtime.
    ///
    /// (Removed a stale leading doc line — "Emit a CDC change event
    /// and replicate to WAL buffer" — that described a different
    /// function, not this constructor.)
    ///
    /// The engine is cheap to construct (no allocation) and should be
    /// dropped after `apply` returns. Use this from application-layer
    /// `create_row` / `create_rows_batch` instead of calling
    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
        crate::runtime::mutation::MutationEngine::new(self)
    }
2843
    /// Public-mutation gate snapshot (PLAN.md W1).
    ///
    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
    /// maintenance, serverless lifecycle) call `check_write` before
    /// dispatching to storage.
    ///
    /// # Errors
    /// Returns `RedDBError::ReadOnly` on any instance running as a
    /// replica or with `options.read_only = true`. The replica's
    /// internal logical-WAL apply path reaches into the store directly
    /// and never calls this method, so legitimate replica catch-up
    /// still works.
    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
        self.inner.write_gate.check(kind)
    }
2857
    /// Read-only handle to the write gate. Useful for transports that
    /// want to surface the policy in health/status output without
    /// taking a dependency on the concrete enum; use `check_write`
    /// to actually enforce it.
    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
        &self.inner.write_gate
    }
2864
    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
    /// admin/shutdown, and signal handlers all consult this single
    /// state machine; boot marks it Ready once every startup stage
    /// has completed.
    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
        &self.inner.lifecycle
    }
2871
    /// Operator-imposed resource limits (PLAN.md Phase 4.1). Borrowed
    /// view; `check_batch_size` / `check_db_size` consult the same
    /// struct to enforce the limits.
    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
        &self.inner.resource_limits
    }
2876
    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
    /// Borrowed handle; see `audit_log_arc` for an owned clone.
    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
        &self.inner.audit_log
    }
2881
2882    /// Shared `Arc` to the audit logger — used by collaborators (the
2883    /// lease lifecycle, future request-context plumbing) that need to
2884    /// keep the logger alive past the runtime's stack frame.
2885    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
2886        Arc::clone(&self.inner.audit_log)
2887    }
2888
2889    /// Shared `Arc` to the write gate. Same rationale as
2890    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
2891    /// thread) need a clone-cheap handle they can move into a
2892    /// background thread.
2893    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
2894        Arc::clone(&self.inner.write_gate)
2895    }
2896
    /// Serverless writer-lease state machine. `None` when the operator
    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset) or
    /// when `set_lease_lifecycle` has not been called yet.
    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
        self.inner.lease_lifecycle.get()
    }
2902
    /// Install the lease lifecycle. First call wins: once a value has
    /// been stored, subsequent calls leave it untouched and hand the
    /// *rejected* `lifecycle` argument back inside `Err` (OnceLock-style
    /// `set` semantics, as the `get`/`set` pair here suggests).
    pub fn set_lease_lifecycle(
        &self,
        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
        self.inner.lease_lifecycle.set(lifecycle)
    }
2911
2912    /// Reject the call when the requested batch size exceeds
2913    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
2914    /// shaped so the HTTP layer can map it to 413 Payload Too
2915    /// Large (PLAN.md Phase 4.1).
2916    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
2917        if self.inner.resource_limits.batch_size_exceeded(requested) {
2918            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
2919            return Err(RedDBError::QuotaExceeded(format!(
2920                "max_batch_size:{requested}:{max}"
2921            )));
2922        }
2923        Ok(())
2924    }
2925
2926    /// Reject the call when the local DB file exceeds
2927    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
2928    /// the cost is a single `stat()` syscall, negligible against the
2929    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
2930    /// for HTTP 507 Insufficient Storage.
2931    pub fn check_db_size(&self) -> RedDBResult<()> {
2932        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
2933            return Ok(());
2934        };
2935        if limit == 0 {
2936            return Ok(());
2937        }
2938        let Some(path) = self.inner.db.path() else {
2939            return Ok(());
2940        };
2941        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
2942        if current > limit {
2943            return Err(RedDBError::QuotaExceeded(format!(
2944                "max_db_size_bytes:{current}:{limit}"
2945            )));
2946        }
2947        Ok(())
2948    }
2949
    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
    ///
    /// Steps, in order, all idempotent across re-entrant calls:
    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
    ///      observe `Stopped` after first finishes).
    ///   2. Flush WAL + run final checkpoint via
    ///      `db.flush_local_only()` so every acked write is durable
    ///      on disk; the remote upload is gated on the write lease.
    ///   3. If `backup_on_shutdown == true` and a remote backend is
    ///      configured, run a synchronous `trigger_backup()` so the
    ///      remote head reflects the final state.
    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
    ///      return the cached report without re-running anything.
    ///
    /// On any error, the runtime is still marked `Stopped` so the
    /// process can exit; the caller logs the error context but does
    /// not retry the same shutdown — the operator can inspect the
    /// report fields to see which step failed.
    pub fn graceful_shutdown(
        &self,
        backup_on_shutdown: bool,
    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
        if !self.inner.lifecycle.begin_shutdown() {
            // Someone else already shut down (or is in flight). Return
            // the cached report so the HTTP caller and SIGTERM handler
            // get the same idempotent answer.
            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
        }

        // Epoch-millis start stamp; 0 only if the system clock reads
        // before the UNIX epoch.
        let started_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_millis() as u64)
            .unwrap_or(0);
        let mut report = crate::runtime::lifecycle::ShutdownReport {
            started_at_ms: started_ms,
            ..Default::default()
        };

        // Flush WAL + run any pending checkpoint. Local fsync is
        // unconditional — even a lease-lost replica needs its WAL on
        // disk before exit so a future restore has the latest tail.
        // The remote upload is gated separately so a lost-lease writer
        // doesn't clobber the new holder's state on its way out.
        let flush_res = self.inner.db.flush_local_only();
        report.flushed_wal = flush_res.is_ok();
        report.final_checkpoint = flush_res.is_ok();
        if let Err(err) = &flush_res {
            tracing::error!(
                target: "reddb::lifecycle",
                error = %err,
                "graceful_shutdown: local flush failed"
            );
        } else if let Err(lease_err) =
            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
        {
            tracing::warn!(
                target: "reddb::serverless::lease",
                error = %lease_err,
                "graceful_shutdown: remote upload skipped — lease not held"
            );
        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
            tracing::error!(
                target: "reddb::lifecycle",
                error = %err,
                "graceful_shutdown: remote upload failed"
            );
        }

        // Optional final backup. Skipped silently when no remote
        // backend is configured — `trigger_backup()` returns Err
        // anyway in that case, but logging it as a shutdown failure
        // would be misleading on a standalone (no-backend) runtime.
        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
            // The trigger_backup gate now reads `WriteKind::Backup`,
            // which a replica/read_only instance refuses. That's
            // intentional — replicas don't drive backups; only the
            // primary does. We still want shutdown to flush its WAL
            // even if the backup branch is gated off.
            match self.trigger_backup() {
                Ok(result) => {
                    report.backup_uploaded = result.uploaded;
                }
                Err(err) => {
                    tracing::warn!(
                        target: "reddb::lifecycle",
                        error = %err,
                        "graceful_shutdown: final backup skipped"
                    );
                }
            }
        }

        // Falling back to `started_ms` (plus `saturating_sub`) keeps
        // `duration_ms` at 0 instead of underflowing if the clock
        // goes backwards mid-shutdown.
        let completed_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_millis() as u64)
            .unwrap_or(started_ms);
        report.completed_at_ms = completed_ms;
        report.duration_ms = completed_ms.saturating_sub(started_ms);

        // Publish the report; re-entrant callers from here on take
        // the cached-report fast path at the top of this function.
        self.inner.lifecycle.finish_shutdown(report.clone());
        Ok(report)
    }
3051
3052    /// Emit a CDC record without invalidating the result cache.
3053    ///
3054    /// Used by `MutationEngine::append_batch` which calls
3055    /// `invalidate_result_cache` once for the whole batch before this
3056    /// loop, avoiding N write-lock acquisitions.
3057    pub(crate) fn cdc_emit_no_cache_invalidate(
3058        &self,
3059        operation: crate::replication::cdc::ChangeOperation,
3060        collection: &str,
3061        entity_id: u64,
3062        entity_kind: &str,
3063    ) -> u64 {
3064        let lsn = self
3065            .inner
3066            .cdc
3067            .emit(operation, collection, entity_id, entity_kind);
3068
3069        // Append to logical WAL replication buffer (if primary mode)
3070        if let Some(ref primary) = self.inner.db.replication {
3071            let store = self.inner.db.store();
3072            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3073                None
3074            } else {
3075                store.get(collection, EntityId::new(entity_id))
3076            };
3077            let record = ChangeRecord {
3078                lsn,
3079                timestamp: SystemTime::now()
3080                    .duration_since(UNIX_EPOCH)
3081                    .unwrap_or_default()
3082                    .as_millis() as u64,
3083                operation,
3084                collection: collection.to_string(),
3085                entity_id,
3086                entity_kind: entity_kind.to_string(),
3087                entity_bytes: entity
3088                    .as_ref()
3089                    .map(|e| UnifiedStore::serialize_entity(e, store.format_version())),
3090                metadata: self.latest_metadata_for(collection, entity_id),
3091            };
3092            let encoded = record.encode();
3093            primary.wal_buffer.append(record.lsn, encoded.clone());
3094            if let Some(spool) = &primary.logical_wal_spool {
3095                let _ = spool.append(record.lsn, &encoded);
3096            }
3097        }
3098        lsn
3099    }
3100
3101    pub(crate) fn cdc_emit_insert_batch_no_cache_invalidate(
3102        &self,
3103        collection: &str,
3104        ids: &[EntityId],
3105        entity_kind: &str,
3106    ) -> Vec<u64> {
3107        if ids.is_empty() {
3108            return Vec::new();
3109        }
3110
3111        // Without logical replication, CDC only needs the in-memory event
3112        // ring. Reserve all LSNs and push the batch under one mutex instead
3113        // of taking the ring lock once per inserted row.
3114        if self.inner.db.replication.is_none() {
3115            return self.inner.cdc.emit_batch_same_collection(
3116                crate::replication::cdc::ChangeOperation::Insert,
3117                collection,
3118                entity_kind,
3119                ids.iter().map(|id| id.raw()),
3120            );
3121        }
3122
3123        // Replication needs one logical-WAL record per entity with the
3124        // serialized entity bytes, so keep the existing per-row path.
3125        ids.iter()
3126            .map(|id| {
3127                self.cdc_emit_no_cache_invalidate(
3128                    crate::replication::cdc::ChangeOperation::Insert,
3129                    collection,
3130                    id.raw(),
3131                    entity_kind,
3132                )
3133            })
3134            .collect()
3135    }
3136
3137    pub fn cdc_emit(
3138        &self,
3139        operation: crate::replication::cdc::ChangeOperation,
3140        collection: &str,
3141        entity_id: u64,
3142        entity_kind: &str,
3143    ) -> u64 {
3144        let lsn = self
3145            .inner
3146            .cdc
3147            .emit(operation, collection, entity_id, entity_kind);
3148        // Perf: prior to this we called `invalidate_result_cache()`
3149        // which wipes EVERY cached query, across every table, under
3150        // a write lock — turning each INSERT into a serialisation
3151        // point for all readers. Swap to the per-table variant so
3152        // unrelated query caches survive.
3153        self.invalidate_result_cache_for_table(collection);
3154
3155        // Append to logical WAL replication buffer (if primary mode)
3156        if let Some(ref primary) = self.inner.db.replication {
3157            let store = self.inner.db.store();
3158            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3159                None
3160            } else {
3161                store.get(collection, EntityId::new(entity_id))
3162            };
3163            let record = ChangeRecord {
3164                lsn,
3165                timestamp: SystemTime::now()
3166                    .duration_since(UNIX_EPOCH)
3167                    .unwrap_or_default()
3168                    .as_millis() as u64,
3169                operation,
3170                collection: collection.to_string(),
3171                entity_id,
3172                entity_kind: entity_kind.to_string(),
3173                entity_bytes: entity
3174                    .as_ref()
3175                    .map(|entity| UnifiedStore::serialize_entity(entity, store.format_version())),
3176                metadata: self.latest_metadata_for(collection, entity_id),
3177            };
3178            let encoded = record.encode();
3179            primary.wal_buffer.append(record.lsn, encoded.clone());
3180            if let Some(spool) = &primary.logical_wal_spool {
3181                let _ = spool.append(record.lsn, &encoded);
3182            }
3183        }
3184        lsn
3185    }
3186
3187    pub(crate) fn cdc_emit_kv(
3188        &self,
3189        operation: crate::replication::cdc::ChangeOperation,
3190        collection: &str,
3191        key: &str,
3192        entity_id: u64,
3193        before: Option<crate::json::Value>,
3194        after: Option<crate::json::Value>,
3195    ) -> u64 {
3196        let lsn = self
3197            .inner
3198            .cdc
3199            .emit_kv(operation, collection, key, entity_id, before, after);
3200        self.inner.kv_stats.incr_watch_events_emitted();
3201        self.invalidate_result_cache_for_table(collection);
3202        lsn
3203    }
3204
3205    pub(crate) fn record_kv_watch_event(
3206        &self,
3207        operation: crate::replication::cdc::ChangeOperation,
3208        collection: &str,
3209        key: &str,
3210        entity_id: u64,
3211        before: Option<crate::json::Value>,
3212        after: Option<crate::json::Value>,
3213    ) {
3214        if self.current_xid().is_some() {
3215            let conn_id = current_connection_id();
3216            let event = crate::replication::cdc::KvWatchEvent {
3217                collection: collection.to_string(),
3218                key: key.to_string(),
3219                op: operation,
3220                before,
3221                after,
3222                lsn: 0,
3223                committed_at: 0,
3224                dropped_event_count: 0,
3225            };
3226            self.inner
3227                .pending_kv_watch_events
3228                .write()
3229                .entry(conn_id)
3230                .or_default()
3231                .push(event);
3232            return;
3233        }
3234
3235        self.cdc_emit_kv(operation, collection, key, entity_id, before, after);
3236    }
3237
    /// Emit a CDC record for an entity the caller already holds in
    /// memory, avoiding a store re-read. Thin wrapper over
    /// `cdc_emit_prebuilt_with_columns` with no changed-column list.
    pub(crate) fn cdc_emit_prebuilt(
        &self,
        operation: crate::replication::cdc::ChangeOperation,
        collection: &str,
        entity: &UnifiedEntity,
        entity_kind: &str,
        metadata: Option<&crate::storage::Metadata>,
        invalidate_cache: bool,
    ) -> u64 {
        self.cdc_emit_prebuilt_with_columns(
            operation,
            collection,
            entity,
            entity_kind,
            metadata,
            invalidate_cache,
            None,
        )
    }
3257
3258    /// `cdc_emit_prebuilt` plus the list of column names whose values
3259    /// changed on this update. Callers that have already computed a
3260    /// `RowDamageVector` pass it here so downstream CDC consumers can
3261    /// filter events by touched column without re-diffing.
3262    /// `changed_columns` is only meaningful for `Update` operations —
3263    /// insert and delete events ignore it.
3264    pub(crate) fn cdc_emit_prebuilt_with_columns(
3265        &self,
3266        operation: crate::replication::cdc::ChangeOperation,
3267        collection: &str,
3268        entity: &UnifiedEntity,
3269        entity_kind: &str,
3270        metadata: Option<&crate::storage::Metadata>,
3271        invalidate_cache: bool,
3272        changed_columns: Option<Vec<String>>,
3273    ) -> u64 {
3274        if invalidate_cache {
3275            self.invalidate_result_cache();
3276        }
3277
3278        let lsn = self.inner.cdc.emit_with_columns(
3279            operation,
3280            collection,
3281            entity.id.raw(),
3282            entity_kind,
3283            changed_columns,
3284        );
3285
3286        if let Some(ref primary) = self.inner.db.replication {
3287            let store = self.inner.db.store();
3288            let record = ChangeRecord {
3289                lsn,
3290                timestamp: SystemTime::now()
3291                    .duration_since(UNIX_EPOCH)
3292                    .unwrap_or_default()
3293                    .as_millis() as u64,
3294                operation,
3295                collection: collection.to_string(),
3296                entity_id: entity.id.raw(),
3297                entity_kind: entity_kind.to_string(),
3298                entity_bytes: Some(UnifiedStore::serialize_entity(
3299                    entity,
3300                    store.format_version(),
3301                )),
3302                metadata: metadata
3303                    .map(metadata_to_json)
3304                    .or_else(|| self.latest_metadata_for(collection, entity.id.raw())),
3305            };
3306            let encoded = record.encode();
3307            primary.wal_buffer.append(record.lsn, encoded.clone());
3308            if let Some(spool) = &primary.logical_wal_spool {
3309                let _ = spool.append(record.lsn, &encoded);
3310            }
3311        }
3312
3313        lsn
3314    }
3315
3316    pub(crate) fn cdc_emit_prebuilt_batch<'a, I>(
3317        &self,
3318        operation: crate::replication::cdc::ChangeOperation,
3319        entity_kind: &str,
3320        items: I,
3321        invalidate_cache: bool,
3322    ) where
3323        I: IntoIterator<
3324            Item = (
3325                &'a str,
3326                &'a UnifiedEntity,
3327                Option<&'a crate::storage::Metadata>,
3328            ),
3329        >,
3330    {
3331        let items: Vec<(&str, &UnifiedEntity, Option<&crate::storage::Metadata>)> =
3332            items.into_iter().collect();
3333        if items.is_empty() {
3334            return;
3335        }
3336
3337        if invalidate_cache {
3338            self.invalidate_result_cache();
3339        }
3340
3341        for (collection, entity, metadata) in items {
3342            self.cdc_emit_prebuilt(operation, collection, entity, entity_kind, metadata, false);
3343        }
3344    }
3345
3346    fn run_replica_loop(&self, primary_addr: String) {
3347        let endpoint = if primary_addr.starts_with("http") {
3348            primary_addr
3349        } else {
3350            format!("http://{primary_addr}")
3351        };
3352        let poll_ms = self.inner.db.options().replication.poll_interval_ms;
3353        let max_count = self.inner.db.options().replication.max_batch_size;
3354        let mut since_lsn = self.config_u64("red.replication.last_applied_lsn", 0);
3355
3356        let runtime = match tokio::runtime::Builder::new_current_thread()
3357            .enable_all()
3358            .build()
3359        {
3360            Ok(runtime) => runtime,
3361            Err(_) => return,
3362        };
3363
3364        runtime.block_on(async move {
3365            use crate::grpc::proto::red_db_client::RedDbClient;
3366            use crate::grpc::proto::JsonPayloadRequest;
3367
3368            let mut client = loop {
3369                match RedDbClient::connect(endpoint.clone()).await {
3370                    Ok(client) => {
3371                        self.persist_replication_health("connecting", "", None, None);
3372                        break client;
3373                    }
3374                    Err(_) => {
3375                        self.persist_replication_health(
3376                            "connecting",
3377                            "waiting for primary connection",
3378                            None,
3379                            None,
3380                        );
3381                        std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)))
3382                    }
3383                }
3384            };
3385
3386            // PLAN.md Phase 11.5 — stateful applier guards LSN
3387            // monotonicity across pulls. Seed with the persisted
3388            // `last_applied_lsn` so reboots don't lose the chain
3389            // pointer.
3390            let applier = crate::replication::logical::LogicalChangeApplier::new(since_lsn);
3391
3392            loop {
3393                let payload = crate::json!({
3394                    "since_lsn": since_lsn,
3395                    "max_count": max_count
3396                });
3397                let request = tonic::Request::new(JsonPayloadRequest {
3398                    payload_json: crate::json::to_string(&payload)
3399                        .unwrap_or_else(|_| "{}".to_string()),
3400                });
3401
3402                if let Ok(response) = client.pull_wal_records(request).await {
3403                    if let Ok(value) =
3404                        crate::json::from_str::<crate::json::Value>(&response.into_inner().payload)
3405                    {
3406                        let current_lsn =
3407                            value.get("current_lsn").and_then(crate::json::Value::as_u64);
3408                        let oldest_available_lsn = value
3409                            .get("oldest_available_lsn")
3410                            .and_then(crate::json::Value::as_u64);
3411                        if since_lsn > 0
3412                            && oldest_available_lsn
3413                                .map(|oldest| oldest > since_lsn.saturating_add(1))
3414                                .unwrap_or(false)
3415                        {
3416                            self.persist_replication_health(
3417                                "stalled_gap",
3418                                "replica is behind the oldest logical WAL available on primary; re-bootstrap required",
3419                                current_lsn,
3420                                oldest_available_lsn,
3421                            );
3422                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
3423                            continue;
3424                        }
3425                        if let Some(records) =
3426                            value.get("records").and_then(crate::json::Value::as_array)
3427                        {
3428                            for record in records {
3429                                let Some(data_hex) =
3430                                    record.get("data").and_then(crate::json::Value::as_str)
3431                                else {
3432                                    continue;
3433                                };
3434                                let Ok(data) = hex::decode(data_hex) else {
3435                                    self.inner.replica_apply_metrics.record(
3436                                        crate::replication::logical::ApplyErrorKind::Decode,
3437                                    );
3438                                    self.persist_replication_health(
3439                                        "apply_error",
3440                                        "failed to decode WAL record hex payload",
3441                                        current_lsn,
3442                                        oldest_available_lsn,
3443                                    );
3444                                    continue;
3445                                };
3446                                let Ok(change) = ChangeRecord::decode(&data) else {
3447                                    self.inner.replica_apply_metrics.record(
3448                                        crate::replication::logical::ApplyErrorKind::Decode,
3449                                    );
3450                                    self.persist_replication_health(
3451                                        "apply_error",
3452                                        "failed to decode logical WAL record",
3453                                        current_lsn,
3454                                        oldest_available_lsn,
3455                                    );
3456                                    continue;
3457                                };
3458                                match applier.apply(
3459                                    self.inner.db.as_ref(),
3460                                    &change,
3461                                    ApplyMode::Replica,
3462                                ) {
3463                                    Ok(crate::replication::logical::ApplyOutcome::Applied) => {
3464                                        since_lsn = since_lsn.max(change.lsn);
3465                                        self.persist_replica_lsn(since_lsn);
3466                                    }
3467                                    Ok(_) => {
3468                                        // Idempotent / Skipped: no advance, no error.
3469                                    }
3470                                    Err(err) => {
3471                                        self.inner.replica_apply_metrics.record(err.kind());
3472                                        // Issue #205 — emit operator-grade event
3473                                        // for the two replication-fatal kinds. `Gap`
3474                                        // / `Apply` / `Decode` already persist via
3475                                        // `persist_replication_health`; the
3476                                        // OperatorEvent variants only cover the
3477                                        // two "stream is broken" / "follower
3478                                        // diverged" conditions an operator must act
3479                                        // on out-of-band.
3480                                        match &err {
3481                                            crate::replication::logical::LogicalApplyError::Divergence { lsn, expected: _, got: _ } => {
3482                                                crate::telemetry::operator_event::OperatorEvent::Divergence {
3483                                                    peer: "primary".to_string(),
3484                                                    leader_lsn: *lsn,
3485                                                    follower_lsn: since_lsn,
3486                                                }
3487                                                .emit_global();
3488                                            }
3489                                            crate::replication::logical::LogicalApplyError::Gap { last, next } => {
3490                                                crate::telemetry::operator_event::OperatorEvent::ReplicationBroken {
3491                                                    peer: "primary".to_string(),
3492                                                    reason: format!("stalled gap last={last} next={next}"),
3493                                                }
3494                                                .emit_global();
3495                                            }
3496                                            _ => {}
3497                                        }
3498                                        let kind = match &err {
3499                                            crate::replication::logical::LogicalApplyError::Gap { .. } => "stalled_gap",
3500                                            crate::replication::logical::LogicalApplyError::Divergence { .. } => "divergence",
3501                                            _ => "apply_error",
3502                                        };
3503                                        self.persist_replication_health(
3504                                            kind,
3505                                            &format!("replica apply rejected: {err}"),
3506                                            current_lsn,
3507                                            oldest_available_lsn,
3508                                        );
3509                                        // Stop applying this batch. The
3510                                        // outer loop will retry on next
3511                                        // pull, which on a real Gap will
3512                                        // not magically heal — operator
3513                                        // must rebootstrap. For
3514                                        // Divergence, we explicitly do
3515                                        // not advance; this keeps the
3516                                        // replica visibly unhealthy
3517                                        // instead of silently swallowing
3518                                        // corruption.
3519                                        break;
3520                                    }
3521                                }
3522                            }
3523                        }
3524                        self.persist_replication_health(
3525                            "healthy",
3526                            "",
3527                            current_lsn,
3528                            oldest_available_lsn,
3529                        );
3530                    } else {
3531                        self.persist_replication_health(
3532                            "apply_error",
3533                            "failed to parse pull_wal_records response",
3534                            None,
3535                            None,
3536                        );
3537                    }
3538                } else {
3539                    self.persist_replication_health(
3540                        "connecting",
3541                        "primary pull_wal_records request failed",
3542                        None,
3543                        None,
3544                    );
3545                }
3546
3547                std::thread::sleep(std::time::Duration::from_millis(poll_ms));
3548            }
3549        });
3550    }
3551
    /// Poll CDC events since a given LSN, capped at `max_count`.
    /// Delegates straight to the in-memory CDC ring —
    /// NOTE(review): whether `since_lsn` is strict or inclusive
    /// depends on `cdc.poll`'s contract; confirm against the ring.
    pub fn cdc_poll(
        &self,
        since_lsn: u64,
        max_count: usize,
    ) -> Vec<crate::replication::cdc::ChangeEvent> {
        self.inner.cdc.poll(since_lsn, max_count)
    }
3560
    /// PLAN.md Phase 11.4 — current CDC LSN. Public mutation
    /// surfaces (HTTP query, gRPC entity ops) call this immediately
    /// after a successful write to feed `enforce_commit_policy`.
    pub fn cdc_current_lsn(&self) -> u64 {
        // Cheap read; delegates straight to the CDC ring.
        self.inner.cdc.current_lsn()
    }
3567
3568    pub fn kv_watch_events_since(
3569        &self,
3570        collection: &str,
3571        key: &str,
3572        since_lsn: u64,
3573        max_count: usize,
3574    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3575        self.inner
3576            .cdc
3577            .poll(since_lsn, max_count)
3578            .into_iter()
3579            .filter_map(|event| event.kv)
3580            .filter(|event| event.collection == collection && event.key == key)
3581            .collect()
3582    }
3583
3584    pub fn kv_watch_events_since_prefix(
3585        &self,
3586        collection: &str,
3587        prefix: &str,
3588        since_lsn: u64,
3589        max_count: usize,
3590    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3591        self.inner
3592            .cdc
3593            .poll(since_lsn, max_count)
3594            .into_iter()
3595            .filter_map(|event| event.kv)
3596            .filter(|event| event.collection == collection && event.key.starts_with(prefix))
3597            .collect()
3598    }
3599
    /// Open a watch stream over a single `(collection, key)` pair.
    ///
    /// `from_lsn = None` lets the stream choose its starting point
    /// (see `KvWatchStream::subscribe` for the exact semantics); the
    /// idle timeout comes from `kv_watch_idle_timeout_ms`.
    pub(crate) fn kv_watch_subscribe<'a>(
        &'a self,
        collection: impl Into<String>,
        key: impl Into<String>,
        from_lsn: Option<u64>,
    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
        crate::runtime::kv_watch::KvWatchStream::subscribe(
            &self.inner.cdc,
            &self.inner.kv_stats,
            collection,
            key,
            from_lsn,
            self.kv_watch_idle_timeout_ms(),
        )
    }
3615
    /// Open a watch stream over every key starting with `prefix` in
    /// `collection` — prefix counterpart of `kv_watch_subscribe`,
    /// sharing the same `from_lsn` and idle-timeout semantics.
    pub(crate) fn kv_watch_subscribe_prefix<'a>(
        &'a self,
        collection: impl Into<String>,
        prefix: impl Into<String>,
        from_lsn: Option<u64>,
    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
        crate::runtime::kv_watch::KvWatchStream::subscribe_prefix(
            &self.inner.cdc,
            &self.inner.kv_stats,
            collection,
            prefix,
            from_lsn,
            self.kv_watch_idle_timeout_ms(),
        )
    }
3631
    /// Idle timeout (milliseconds) for KV watch streams, read from
    /// `red.config.kv.watch.idle_timeout_ms`; defaults to 60s.
    pub(crate) fn kv_watch_idle_timeout_ms(&self) -> u64 {
        self.config_u64("red.config.kv.watch.idle_timeout_ms", 60_000)
    }
3635
    /// Get backup scheduler status.
    ///
    /// Pure delegation to the scheduler owned by the runtime.
    pub fn backup_status(&self) -> crate::replication::scheduler::BackupStatus {
        self.inner.backup_scheduler.status()
    }
3640
    /// Borrow the runtime's result Blob Cache.
    ///
    /// Wired for the `/admin/blob_cache/sweep` and
    /// `/admin/blob_cache/flush_namespace` HTTP handlers (issue #148
    /// follow-up): both delegate to
    /// `crate::storage::cache::sweeper::BlobCacheSweeper`, which takes a
    /// `&BlobCache`. Also used by `trigger_backup` when
    /// `red.config.backup.include_blob_cache=true` to locate the L2
    /// directory for archival.
    ///
    /// Shared-reference borrow only — the cache is never handed out
    /// mutably through this accessor.
    pub fn result_blob_cache(&self) -> &crate::storage::cache::BlobCache {
        &self.inner.result_blob_cache
    }
3653
3654    /// PLAN.md Phase 11.4 — owned snapshot of every registered
3655    /// replica's state on this primary. Returns empty vec on
3656    /// non-primary instances or when no replicas are registered yet.
3657    pub fn primary_replica_snapshots(&self) -> Vec<crate::replication::primary::ReplicaState> {
3658        self.inner
3659            .db
3660            .replication
3661            .as_ref()
3662            .map(|repl| repl.replica_snapshots())
3663            .unwrap_or_default()
3664    }
3665
    /// PLAN.md Phase 11.4 — active commit policy.
    ///
    /// Delegates to `CommitPolicy::from_env()` on every call.
    /// NOTE(review): an earlier comment claimed `RED_PRIMARY_COMMIT_POLICY`
    /// is read "once at runtime construction", but this method invokes
    /// `from_env()` per call — confirm whether `from_env` caches
    /// internally before relying on either behavior. Default is
    /// `Local` — current behavior, no replica blocking.
    pub fn commit_policy(&self) -> crate::replication::CommitPolicy {
        crate::replication::CommitPolicy::from_env()
    }
3673
    /// PLAN.md Phase 11.5 — accessor for replica-side apply error
    /// counters (gap / divergence / apply / decode). Returned
    /// snapshot is consistent across the four counters; the labels
    /// match `reddb_replica_apply_errors_total{kind}`.
    pub fn replica_apply_error_counts(
        &self,
    ) -> [(crate::replication::logical::ApplyErrorKind, u64); 4] {
        // Single call into the shared metrics struct produces all four
        // counters in one snapshot.
        self.inner.replica_apply_metrics.snapshot()
    }
3683
    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
    /// returned; `is_configured()` lets callers short-circuit.
    ///
    /// Borrow-only accessor; the bucket lives as long as the runtime.
    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
        &self.inner.quota_bucket
    }
3689
3690    /// PLAN.md Phase 11.4 — observability snapshot of every
3691    /// replica's durable LSN as known to the commit waiter. Empty
3692    /// vec on non-primary instances or when no replica has acked.
3693    pub fn commit_waiter_snapshot(&self) -> Vec<(String, u64)> {
3694        self.inner
3695            .db
3696            .replication
3697            .as_ref()
3698            .map(|repl| repl.commit_waiter.snapshot())
3699            .unwrap_or_default()
3700    }
3701
3702    /// PLAN.md Phase 11.4 — `(reached, timed_out, not_required, last_micros)`
3703    /// counters for /metrics. Always-zero on non-primary instances.
3704    pub fn commit_waiter_metrics_snapshot(&self) -> (u64, u64, u64, u64) {
3705        self.inner
3706            .db
3707            .replication
3708            .as_ref()
3709            .map(|repl| repl.commit_waiter.metrics_snapshot())
3710            .unwrap_or((0, 0, 0, 0))
3711    }
3712
3713    /// PLAN.md Phase 11.4 — block until at least `count` replicas
3714    /// have durably applied through `target_lsn`, or `timeout`
3715    /// elapses. Returns the `AwaitOutcome` so the caller can decide
3716    /// whether to surface a timeout error to the client or continue
3717    /// (the policy mapping lives in the commit dispatcher).
3718    ///
3719    /// Foundation only — the write commit path doesn't yet call
3720    /// this. Wiring it is a per-surface task gated on the operator
3721    /// flipping `RED_PRIMARY_COMMIT_POLICY` away from `local`.
3722    pub fn await_replica_acks(
3723        &self,
3724        target_lsn: u64,
3725        count: u32,
3726        timeout: std::time::Duration,
3727    ) -> crate::replication::AwaitOutcome {
3728        match &self.inner.db.replication {
3729            Some(repl) => repl.commit_waiter.await_acks(target_lsn, count, timeout),
3730            None => {
3731                // No replication configured: policy must be `Local`.
3732                // Treat as immediate `NotRequired` so callers don't
3733                // block on a degenerate setup.
3734                crate::replication::AwaitOutcome::NotRequired
3735            }
3736        }
3737    }
3738
3739    /// PLAN.md Phase 11.4 — enforce the configured commit policy
3740    /// against `post_lsn` (the LSN of the just-completed write).
3741    /// Returns `Ok(AwaitOutcome)` on every successful enforcement
3742    /// (including `Reached` and `TimedOut` when fail-on-timeout is
3743    /// off). Returns `Err(ReadOnly)` only when:
3744    ///   * policy is `AckN(n)` with `n > 0`
3745    ///   * the wait timed out
3746    ///   * `RED_COMMIT_FAIL_ON_TIMEOUT=true` is set
3747    ///
3748    /// The HTTP / gRPC / wire surfaces map the error to 504 / wire
3749    /// backoff. Default behaviour (env unset) logs warn and returns
3750    /// success — matches PLAN.md "default v1 stays local" semantics
3751    /// while still letting the operator opt into hard-blocking.
3752    pub fn enforce_commit_policy(
3753        &self,
3754        post_lsn: u64,
3755    ) -> RedDBResult<crate::replication::AwaitOutcome> {
3756        let n = match self.commit_policy() {
3757            crate::replication::CommitPolicy::AckN(n) if n > 0 => n,
3758            _ => return Ok(crate::replication::AwaitOutcome::NotRequired),
3759        };
3760        let timeout_ms = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
3761            .ok()
3762            .and_then(|v| v.parse::<u64>().ok())
3763            .unwrap_or(5_000);
3764        let outcome =
3765            self.await_replica_acks(post_lsn, n, std::time::Duration::from_millis(timeout_ms));
3766        if let crate::replication::AwaitOutcome::TimedOut { observed, required } = &outcome {
3767            tracing::warn!(
3768                target: "reddb::commit",
3769                post_lsn,
3770                observed = *observed,
3771                required = *required,
3772                timeout_ms,
3773                "ack_n: timed out waiting for replicas"
3774            );
3775            let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
3776                .ok()
3777                .map(|v| {
3778                    let t = v.trim();
3779                    t.eq_ignore_ascii_case("true") || t == "1" || t.eq_ignore_ascii_case("yes")
3780                })
3781                .unwrap_or(false);
3782            if fail {
3783                return Err(RedDBError::ReadOnly(format!(
3784                    "commit policy timed out at lsn {post_lsn}: observed={observed} required={required} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
3785                )));
3786            }
3787        }
3788        Ok(outcome)
3789    }
3790
    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
    /// when the operator set the env but it doesn't parse, and
    /// `("disabled", None)` when no key is configured. The pager
    /// hookup is deferred — this accessor surfaces the operator's
    /// intent for /admin/status without yet using the key in writes.
    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
        match crate::crypto::page_encryption::key_from_env() {
            // Key present and parseable — the material itself is
            // deliberately discarded here.
            Ok(Some(_)) => ("enabled", None),
            // No key configured at all.
            Ok(None) => ("disabled", None),
            // Env set but unusable — surface the parse error verbatim.
            Err(err) => ("error", Some(err)),
        }
    }
3805
3806    /// PLAN.md Phase 11.5 — current replica apply health label
3807    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
3808    /// `stalled_gap`). Read from the persisted `red.replication.state`
3809    /// config key updated by the replica loop. Returns `None` on
3810    /// non-replica instances or when no apply has run yet.
3811    pub fn replica_apply_health(&self) -> Option<String> {
3812        let state = self.config_string("red.replication.state", "");
3813        if state.is_empty() {
3814            None
3815        } else {
3816            Some(state)
3817        }
3818    }
3819
3820    /// Current local LSN paired with the LSN of the most recently
3821    /// archived WAL segment. The difference is the replication /
3822    /// archive lag operators alert on (PLAN.md Phase 5.1). Returns
3823    /// `(0, 0)` when neither replication nor archiving is configured.
3824    pub fn wal_archive_progress(&self) -> (u64, u64) {
3825        let current_lsn = self
3826            .inner
3827            .db
3828            .replication
3829            .as_ref()
3830            .map(|repl| {
3831                repl.logical_wal_spool
3832                    .as_ref()
3833                    .map(|spool| spool.current_lsn())
3834                    .unwrap_or_else(|| repl.wal_buffer.current_lsn())
3835            })
3836            .unwrap_or_else(|| self.inner.cdc.current_lsn());
3837        let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
3838        (current_lsn, last_archived_lsn)
3839    }
3840
    /// Trigger an immediate backup.
    ///
    /// Flow: write-gate + lease checks, local snapshot creation, then —
    /// only when both a remote backend and a local path are configured —
    /// snapshot upload, logical-WAL segment archival with hash
    /// chaining, backup-head publication, unified manifest refresh, an
    /// optional commit-policy ack wait, and opt-in Blob Cache L2
    /// archival. Returns a `BackupResult` whose `uploaded` flag is
    /// `false` when no remote backend exists (the local snapshot still
    /// succeeded in that case).
    pub fn trigger_backup(&self) -> RedDBResult<crate::replication::scheduler::BackupResult> {
        self.check_write(crate::runtime::write_gate::WriteKind::Backup)?;
        // Defense in depth — check_write above already rejects when
        // the lease is NotHeld, but log + audit the lease angle here
        // explicitly so dashboards distinguish "lease lost" from a
        // generic read-only refusal.
        self.assert_remote_write_allowed("admin/backup")?;
        let started = std::time::Instant::now();
        let snapshot = self.create_snapshot()?;
        let mut uploaded = false;

        if let (Some(backend), Some(path)) = (&self.inner.db.remote_backend, self.inner.db.path()) {
            // Remote layout keys: operator config overrides the
            // Database-options defaults.
            let default_snapshot_prefix = self.inner.db.options().default_snapshot_prefix();
            let default_wal_prefix = self.inner.db.options().default_wal_archive_prefix();
            let default_head_key = self.inner.db.options().default_backup_head_key();
            let snapshot_prefix = self.config_string(
                "red.config.backup.snapshot_prefix",
                &default_snapshot_prefix,
            );
            let wal_prefix =
                self.config_string("red.config.wal.archive.prefix", &default_wal_prefix);
            let head_key = self.config_string("red.config.backup.head_key", &default_head_key);
            let timeline_id = self.config_string("red.config.timeline.id", "main");
            let snapshot_key = crate::storage::wal::archive_snapshot(
                backend.as_ref(),
                path,
                snapshot.snapshot_id,
                &snapshot_prefix,
            )
            .map_err(|err| RedDBError::Internal(err.to_string()))?;
            // Tail LSN for the manifest's `base_lsn`: prefer the
            // logical WAL spool, else the replication WAL buffer,
            // else the CDC counter (same order as
            // `wal_archive_progress`).
            let current_lsn = self
                .inner
                .db
                .replication
                .as_ref()
                .map(|repl| {
                    repl.logical_wal_spool
                        .as_ref()
                        .map(|spool| spool.current_lsn())
                        .unwrap_or_else(|| repl.wal_buffer.current_lsn())
                })
                .unwrap_or_else(|| self.inner.cdc.current_lsn());
            let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
            // Hash the local snapshot bytes so the manifest can carry
            // the digest for restore-side verification (PLAN.md
            // Phase 4). Failure to hash is non-fatal — we still
            // publish the manifest, just without a checksum, so a
            // future fix can backfill rather than losing the backup.
            let snapshot_sha256 =
                crate::storage::wal::SnapshotManifest::compute_snapshot_sha256(path)
                    .map_err(|err| {
                        tracing::warn!(
                            target: "reddb::backup",
                            error = %err,
                            snapshot_id = snapshot.snapshot_id,
                            "snapshot hash failed; manifest will lack checksum"
                        );
                    })
                    .ok();
            let manifest = crate::storage::wal::SnapshotManifest {
                timeline_id: timeline_id.clone(),
                snapshot_key: snapshot_key.clone(),
                snapshot_id: snapshot.snapshot_id,
                snapshot_time: snapshot.created_at_unix_ms as u64,
                base_lsn: current_lsn,
                schema_version: crate::api::REDDB_FORMAT_VERSION,
                format_version: crate::api::REDDB_FORMAT_VERSION,
                snapshot_sha256,
            };
            crate::storage::wal::publish_snapshot_manifest(backend.as_ref(), &manifest)
                .map_err(|err| RedDBError::Internal(err.to_string()))?;

            // PLAN.md Phase 11.3 — read the head of the WAL hash chain
            // so the new segment can link back. `None` means we're
            // starting a fresh timeline (after a clean restore or on
            // first archive ever); the segment's `prev_hash` will be
            // `None` and restore-side validation accepts that only for
            // the first segment in `plan.wal_segments`.
            let prev_segment_hash = self.config_string("red.config.timeline.last_segment_hash", "");
            let prev_hash_arg = if prev_segment_hash.is_empty() {
                None
            } else {
                Some(prev_segment_hash)
            };

            // Archive the logical change records produced since the
            // last archived LSN; a detected gap aborts the backup
            // rather than publishing an incomplete timeline.
            let archived_lsn = if let Some(primary) = &self.inner.db.replication {
                let oldest = primary
                    .logical_wal_spool
                    .as_ref()
                    .and_then(|spool| spool.oldest_lsn().ok().flatten())
                    .or_else(|| primary.wal_buffer.oldest_lsn())
                    .unwrap_or(last_archived_lsn);
                if last_archived_lsn > 0 && last_archived_lsn < oldest.saturating_sub(1) {
                    return Err(RedDBError::Internal(format!(
                        "logical WAL gap detected: last_archived_lsn={last_archived_lsn}, oldest_available_lsn={oldest}"
                    )));
                }
                let records = if let Some(spool) = &primary.logical_wal_spool {
                    spool
                        .read_since(last_archived_lsn, usize::MAX)
                        .map_err(|err| RedDBError::Internal(err.to_string()))?
                } else {
                    primary.wal_buffer.read_since(last_archived_lsn, usize::MAX)
                };
                if let Some(meta) = crate::storage::wal::archive_change_records(
                    backend.as_ref(),
                    &wal_prefix,
                    &records,
                    prev_hash_arg,
                )
                .map_err(|err| RedDBError::Internal(err.to_string()))?
                {
                    if let Some(spool) = &primary.logical_wal_spool {
                        let _ = spool.prune_through(meta.lsn_end);
                    }
                    // Advance the chain head so the next archive call
                    // links to this segment's hash. If the segment has
                    // no sha256 (legacy / hashing failed) we leave the
                    // head as-is — the next segment then carries the
                    // prior chain head, preserving continuity.
                    if let Some(sha) = &meta.sha256 {
                        self.inner.db.store().set_config_tree(
                            "red.config.timeline",
                            &crate::json!({ "last_segment_hash": sha }),
                        );
                    }
                    meta.lsn_end
                } else {
                    last_archived_lsn
                }
            } else {
                last_archived_lsn
            };

            // Publish the refreshed backup head, then persist the new
            // archive watermark + timeline id locally.
            let head = crate::storage::wal::BackupHead {
                timeline_id,
                snapshot_key,
                snapshot_id: snapshot.snapshot_id,
                snapshot_time: snapshot.created_at_unix_ms as u64,
                current_lsn,
                last_archived_lsn: archived_lsn,
                wal_prefix,
            };
            crate::storage::wal::publish_backup_head(backend.as_ref(), &head_key, &head)
                .map_err(|err| RedDBError::Internal(err.to_string()))?;
            self.inner.db.store().set_config_tree(
                "red.config.timeline",
                &crate::json!({
                    "last_archived_lsn": archived_lsn,
                    "id": head.timeline_id
                }),
            );

            // PLAN.md Phase 2.4 — refresh the unified `MANIFEST.json`
            // at the prefix root so external tooling sees a single
            // catalog of every snapshot + WAL segment with their
            // checksums. Best-effort: a manifest publish failure
            // doesn't fail the backup (the per-artifact sidecars
            // already give restore-side integrity), but it does log
            // so dashboards can flag stale catalogs.
            if let Err(err) = crate::storage::wal::publish_unified_manifest_for_prefix(
                backend.as_ref(),
                &snapshot_prefix,
            ) {
                tracing::warn!(
                    target: "reddb::backup",
                    error = %err,
                    snapshot_prefix = %snapshot_prefix,
                    "unified MANIFEST.json refresh failed; per-artifact sidecars unaffected"
                );
            }

            // PLAN.md Phase 11.4 — when the operator picked a
            // commit policy that demands replica durability, block
            // until the configured count of replicas has acked the
            // archived LSN (or the timeout fires). For backup the
            // policy decides the *DR posture* — `local` returns
            // immediately, `ack_n` ensures at least N replicas saw
            // the new tail before we report success to the
            // operator. A `TimedOut` is logged but does NOT fail
            // the backup: the local WAL + remote upload are durable
            // regardless; the missing acks are reported via
            // /metrics and /admin/status so the operator can decide.
            match self.commit_policy() {
                crate::replication::CommitPolicy::AckN(n) if n > 0 => {
                    let timeout = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
                        .ok()
                        .and_then(|v| v.parse::<u64>().ok())
                        .unwrap_or(5_000);
                    let outcome = self.await_replica_acks(
                        archived_lsn,
                        n,
                        std::time::Duration::from_millis(timeout),
                    );
                    match outcome {
                        crate::replication::AwaitOutcome::Reached(count) => {
                            tracing::debug!(
                                target: "reddb::backup",
                                archived_lsn,
                                n,
                                count,
                                "ack_n: replicas synced before backup return"
                            );
                        }
                        crate::replication::AwaitOutcome::TimedOut { observed, required } => {
                            tracing::warn!(
                                target: "reddb::backup",
                                archived_lsn,
                                observed,
                                required,
                                timeout_ms = timeout,
                                "ack_n: timed out waiting for replicas; backup uploaded but DR posture degraded"
                            );
                        }
                        crate::replication::AwaitOutcome::NotRequired => {}
                    }
                }
                _ => {} // Local / RemoteWal / Quorum: no blocking yet
            }

            // Issue #148 follow-up — opt-in archive of the L2 Blob Cache
            // directory tree. Default off so a standard backup stays
            // small; flip via `red.config.backup.include_blob_cache=true`
            // when warm-cache restore is required (per
            // docs/operations/blob-cache-backup-restore.md §1).
            //
            // The L2 tree is *derived* state (ADR 0006) — its absence
            // never causes data loss; it only affects post-restore
            // p99 latency until the cache re-warms. We therefore log
            // (not fail) on per-file upload errors so a partial L2
            // upload never aborts a healthy snapshot+WAL backup.
            if self.config_bool("red.config.backup.include_blob_cache", false) {
                let blob_cache_prefix = self.config_string(
                    "red.config.backup.blob_cache_prefix",
                    &format!("{snapshot_prefix}blob_cache/"),
                );
                if let Some(l2_path) = self.inner.result_blob_cache.l2_path() {
                    match crate::storage::cache::archive_blob_cache_l2(
                        backend.as_ref(),
                        l2_path,
                        &blob_cache_prefix,
                    ) {
                        Ok(count) => {
                            tracing::info!(
                                target: "reddb::backup",
                                files_uploaded = count,
                                blob_cache_prefix = %blob_cache_prefix,
                                "include_blob_cache: archived L2 directory"
                            );
                        }
                        Err(err) => {
                            tracing::warn!(
                                target: "reddb::backup",
                                error = %err,
                                blob_cache_prefix = %blob_cache_prefix,
                                "include_blob_cache: L2 archive failed; backup proceeding (cache is derived state)"
                            );
                        }
                    }
                } else {
                    tracing::debug!(
                        target: "reddb::backup",
                        "include_blob_cache=true but no L2 path configured; nothing to archive"
                    );
                }
            }

            uploaded = true;
        }

        Ok(crate::replication::scheduler::BackupResult {
            snapshot_id: snapshot.snapshot_id,
            uploaded,
            duration_ms: started.elapsed().as_millis() as u64,
            timestamp: snapshot.created_at_unix_ms as u64,
        })
    }
4119
4120    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
4121        let mut pool = self
4122            .inner
4123            .pool
4124            .lock()
4125            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
4126        if pool.active >= self.inner.pool_config.max_connections {
4127            return Err(RedDBError::Internal(
4128                "connection pool exhausted".to_string(),
4129            ));
4130        }
4131
4132        let id = if let Some(id) = pool.idle.pop() {
4133            id
4134        } else {
4135            let id = pool.next_id;
4136            pool.next_id += 1;
4137            id
4138        };
4139        pool.active += 1;
4140        pool.total_checkouts += 1;
4141        drop(pool);
4142
4143        Ok(RuntimeConnection {
4144            id,
4145            inner: Arc::clone(&self.inner),
4146        })
4147    }
4148
4149    pub fn checkpoint(&self) -> RedDBResult<()> {
4150        // Local fsync always allowed — losing the lease shouldn't
4151        // prevent us from durably persisting what's already in memory.
4152        // The remote upload is the side-effect that risks clobbering a
4153        // peer's state, so it's behind the lease gate.
4154        self.inner.db.flush_local_only().map_err(|err| {
4155            // Issue #205 — local flush failure is a CheckpointFailed
4156            // operator-grade event. The local-flush path also covers
4157            // the WAL fsync we depend on, so a failure here doubles as
4158            // the WalFsyncFailed signal for the runtime entry point.
4159            let msg = err.to_string();
4160            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
4161                lsn: 0,
4162                error: msg.clone(),
4163            }
4164            .emit_global();
4165            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
4166                path: "<flush_local_only>".to_string(),
4167                error: msg.clone(),
4168            }
4169            .emit_global();
4170            RedDBError::Engine(msg)
4171        })?;
4172        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
4173            tracing::warn!(
4174                target: "reddb::serverless::lease",
4175                error = %err,
4176                "checkpoint: skipping remote upload — lease not held"
4177            );
4178            return Ok(());
4179        }
4180        self.inner
4181            .db
4182            .upload_to_remote_backend()
4183            .map_err(|err| RedDBError::Engine(err.to_string()))
4184    }
4185
4186    /// Guard remote-mutating operations on the writer lease.
4187    /// Returns `Ok(())` when no remote backend is configured (the
4188    /// lease is irrelevant) or the lease state is `NotRequired` /
4189    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
4190    /// `NotHeld`, with an audit-friendly action label so the caller
4191    /// can record the rejection.
4192    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
4193        if self.inner.db.remote_backend.is_none() {
4194            return Ok(());
4195        }
4196        match self.inner.write_gate.lease_state() {
4197            crate::runtime::write_gate::LeaseGateState::NotHeld => {
4198                self.inner.audit_log.record(
4199                    action,
4200                    "system",
4201                    "remote_backend",
4202                    "err: writer lease not held",
4203                    crate::json::Value::Null,
4204                );
4205                Err(RedDBError::ReadOnly(format!(
4206                    "writer lease not held — {action} blocked (serverless fence)"
4207                )))
4208            }
4209            _ => Ok(()),
4210        }
4211    }
4212
    /// Run the store's maintenance pass, mapping any engine error to
    /// `RedDBError::Internal`.
    pub fn run_maintenance(&self) -> RedDBResult<()> {
        self.inner
            .db
            .run_maintenance()
            .map_err(|err| RedDBError::Internal(err.to_string()))
    }
4219
4220    pub fn scan_collection(
4221        &self,
4222        collection: &str,
4223        cursor: Option<ScanCursor>,
4224        limit: usize,
4225    ) -> RedDBResult<ScanPage> {
4226        let store = self.inner.db.store();
4227        let manager = store
4228            .get_collection(collection)
4229            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
4230
4231        let mut entities = manager.query_all(|_| true);
4232        entities.sort_by_key(|entity| entity.id.raw());
4233
4234        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
4235        let total = entities.len();
4236        let end = total.min(offset.saturating_add(limit.max(1)));
4237        let items = if offset >= total {
4238            Vec::new()
4239        } else {
4240            entities[offset..end].to_vec()
4241        };
4242        let next = (end < total).then_some(ScanCursor { offset: end });
4243
4244        Ok(ScanPage {
4245            collection: collection.to_string(),
4246            items,
4247            next,
4248            total,
4249        })
4250    }
4251
    /// Owned snapshot of the catalog model, as produced by the
    /// underlying database; input to the attention helpers.
    pub fn catalog(&self) -> CatalogModelSnapshot {
        self.inner.db.catalog_model_snapshot()
    }
4255
    /// Consistency report for the catalog, delegated to the database.
    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
        self.inner.db.catalog_consistency_report()
    }
4259
    /// Attention summary computed over a fresh catalog snapshot.
    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
        crate::catalog::attention_summary(&self.catalog())
    }
4263
    /// Collections flagged for attention in a fresh catalog snapshot.
    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
        crate::catalog::collection_attention(&self.catalog())
    }
4267
    /// Indexes flagged for attention in a fresh catalog snapshot.
    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
        crate::catalog::index_attention(&self.catalog())
    }
4271
    /// Graph projections flagged for attention in a fresh catalog
    /// snapshot.
    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
        crate::catalog::graph_projection_attention(&self.catalog())
    }
4275
    /// Analytics jobs flagged for attention in a fresh catalog
    /// snapshot.
    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
        crate::catalog::analytics_job_attention(&self.catalog())
    }
4279
4280    pub fn stats(&self) -> RuntimeStats {
4281        let pool = runtime_pool_lock(self);
4282        RuntimeStats {
4283            active_connections: pool.active,
4284            idle_connections: pool.idle.len(),
4285            total_checkouts: pool.total_checkouts,
4286            paged_mode: self.inner.db.is_paged(),
4287            started_at_unix_ms: self.inner.started_at_unix_ms,
4288            store: self.inner.db.stats(),
4289            system: SystemInfo::collect(),
4290            result_blob_cache: self.inner.result_blob_cache.stats(),
4291            kv: self.inner.kv_stats.snapshot(),
4292        }
4293    }
4294
4295    /// Execute a query under a typed scope override without embedding
4296    /// the tenant / user / role values into the SQL string. Use this
4297    /// from transport middleware (HTTP / gRPC / worker loops) where the
4298    /// scope is resolved from auth claims and the SQL is a parameterised
4299    /// template — avoids the string-concat injection risk of building
4300    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4301    /// prepared statements that didn't know about tenancy.
4302    ///
4303    /// Precedence matches the `WITHIN` clause: the passed `scope`
4304    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4305    /// The override is pushed on the thread-local scope stack for the
4306    /// duration of the call and popped on return — pool-shared
4307    /// connections cannot leak it across requests.
4308    pub fn execute_query_with_scope(
4309        &self,
4310        query: &str,
4311        scope: crate::runtime::within_clause::ScopeOverride,
4312    ) -> RedDBResult<RuntimeQueryResult> {
4313        if scope.is_empty() {
4314            return self.execute_query(query);
4315        }
4316        let _scope_guard = ScopeOverrideGuard::install(scope);
4317        self.execute_query(query)
4318    }
4319
4320    /// Issue #205 — single lifecycle exit for slow-query logging.
4321    ///
4322    /// `execute_query_inner` does the real work; this wrapper times it
4323    /// and, if elapsed exceeds the configured threshold, hands the
4324    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
4325    /// SlowQueryLogger. The threshold + sample_pct were captured at
4326    /// SlowQueryLogger construction (runtime startup), so the per-call
4327    /// cost on below-threshold paths is one relaxed atomic load.
4328    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4329        let started = std::time::Instant::now();
4330        let result = self.execute_query_inner(query);
4331        let elapsed_ms = started.elapsed().as_millis() as u64;
4332
4333        // Build EffectiveScope from the same thread-locals frame-build
4334        // consults — keeps the slow-log row consistent with the audit /
4335        // RLS view of "this statement". `ai_scope()` is the canonical
4336        // builder.
4337        let scope = self.ai_scope();
4338        let kind = match result
4339            .as_ref()
4340            .map(|r| r.statement_type)
4341            .unwrap_or("select")
4342        {
4343            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4344            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4345            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4346            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4347            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4348        };
4349        // SQL redaction: pass the raw query through. The slow-query
4350        // logger writes structured JSON so embedded literals stay
4351        // escape-safe at the JSON boundary (proven by
4352        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4353        // PII redaction (e.g. literal masking) is a follow-up.
4354        self.inner
4355            .slow_query_logger
4356            .record(kind, elapsed_ms, query.to_string(), &scope);
4357
4358        result
4359    }
4360
4361    #[inline(never)]
4362    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4363        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4364        //
4365        // Moved above every boot-cost the normal path pays (WITHIN
4366        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
4367        // guard, tracing span, tx_contexts read) because the bench's
4368        // `select_point` scenario was observed at 28× vs PostgreSQL —
4369        // the dominant cost wasn't the entity fetch but the ceremony
4370        // before it. Only fires when there's no ambient transaction
4371        // context or WITHIN override, so the snapshot install we skip
4372        // truly is a no-op for this query.
4373        if !has_scope_override_active()
4374            && !query.trim_start().starts_with("WITHIN")
4375            && !query.trim_start().starts_with("within")
4376            && !self
4377                .inner
4378                .tx_contexts
4379                .read()
4380                .contains_key(&current_connection_id())
4381        {
4382            if let Some(result) = self.try_fast_entity_lookup(query) {
4383                return result;
4384            }
4385        }
4386
4387        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4388        // strip the prefix, push a stack-scoped override, recurse on
4389        // the inner statement, pop on return. Stack lives in a
4390        // thread-local but is balanced by the RAII guard, so a
4391        // pool-shared connection cannot leak the override across
4392        // requests and an early `?` return still pops cleanly.
4393        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4394            Ok(Some((scope, inner))) => {
4395                let _scope_guard = ScopeOverrideGuard::install(scope);
4396                // Re-enter the inner path, NOT `execute_query`, so the
4397                // slow-query lifecycle hook records exactly one row per
4398                // top-level statement (the WITHIN-stripped form would
4399                // double-record).
4400                return self.execute_query_inner(inner);
4401            }
4402            Ok(None) => {}
4403            Err(msg) => return Err(RedDBError::Query(msg)),
4404        }
4405
4406        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4407        // inner statement (WITHOUT executing it) and returns the
4408        // CanonicalLogicalNode tree as rows so the caller can see the
4409        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4410        // is a distinct schema-diff command and continues down the
4411        // regular SQL path.
4412        if let Some(inner) = strip_explain_prefix(query) {
4413            return self.explain_as_rows(query, inner);
4414        }
4415
4416        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4417        // override and return. Outside a transaction the statement is
4418        // an error (matches PG semantics: SET LOCAL only takes effect
4419        // within an active transaction).
4420        if let Some(value) = parse_set_local_tenant(query)? {
4421            let conn_id = current_connection_id();
4422            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4423                return Err(RedDBError::Query(
4424                    "SET LOCAL TENANT requires an active transaction".to_string(),
4425                ));
4426            }
4427            self.inner
4428                .tx_local_tenants
4429                .write()
4430                .insert(conn_id, value.clone());
4431            return Ok(RuntimeQueryResult::ok_message(
4432                query.to_string(),
4433                &match &value {
4434                    Some(id) => format!("local tenant set: {id}"),
4435                    None => "local tenant cleared".to_string(),
4436                },
4437                "set_local_tenant",
4438            ));
4439        }
4440
4441        if super::red_schema::is_system_schema_write(query) {
4442            return Err(RedDBError::Query(
4443                super::red_schema::READ_ONLY_ERROR.to_string(),
4444            ));
4445        }
4446
4447        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4448        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4449
4450        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4451        let _frame_guards = frame.install(self);
4452
4453        // Phase 6 logging: enter a span stamped with conn_id / tenant
4454        // / query_len. Every downstream tracing::info!/warn!/error!
4455        // inherits these fields — no need to thread them manually
4456        // through storage/scan layers. Entered AFTER the WITHIN /
4457        // SET LOCAL TENANT resolution above so the span reflects the
4458        // effective scope for this statement.
4459        let _log_span = crate::telemetry::span::query_span(query).entered();
4460
4461        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4462        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4463            return self.execute_query_expr(rewritten);
4464        }
4465
4466        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4467        if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4468            return result;
4469        }
4470
4471        // ── Result cache: return cached result if still fresh (30s TTL) ──
4472        if let Some(result) = frame.read_result_cache(self) {
4473            return Ok(result);
4474        }
4475
4476        let prepared = frame.prepare_statement(self, execution_query)?;
4477        let mode = prepared.mode;
4478        let expr = prepared.expr;
4479
4480        let statement = query_expr_name(&expr);
4481        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4482
4483        let _lock_guard = frame.prepare_dispatch(self, &expr)?;
4484        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4485
4486        let query_result = match expr {
4487            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4488                // Apply MVCC visibility + RLS gate while materialising the
4489                // graph: every node entity is screened against the source
4490                // collection's policy chain (basic and `Nodes`-targeted)
4491                // and dropped when the caller's tenant / role doesn't
4492                // admit it. Edges are pruned automatically because the
4493                // graph builder skips edges whose endpoints aren't in
4494                // `allowed_nodes`.
4495                let (graph, node_properties) = self.materialize_graph_with_rls()?;
4496                let result =
4497                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_node_properties(
4498                        &graph,
4499                        &expr,
4500                        node_properties,
4501                    )
4502                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4503
4504                Ok(RuntimeQueryResult {
4505                    query: query.to_string(),
4506                    mode,
4507                    statement,
4508                    engine: "materialized-graph",
4509                    result,
4510                    affected_rows: 0,
4511                    statement_type: "select",
4512                })
4513            }
4514            QueryExpr::Table(table) => {
4515                if super::red_schema::is_virtual_table(&table.table) {
4516                    return Ok(RuntimeQueryResult {
4517                        query: query.to_string(),
4518                        mode,
4519                        statement,
4520                        engine: "runtime-red-schema",
4521                        result: super::red_schema::red_query(
4522                            self,
4523                            &table.table,
4524                            &table,
4525                            &frame as &dyn super::statement_frame::ReadFrame,
4526                        )?,
4527                        affected_rows: 0,
4528                        statement_type: "select",
4529                    });
4530                }
4531
4532                // Foreign-table intercept (Phase 3.2.2 PG parity).
4533                //
4534                // When the referenced table matches a `CREATE FOREIGN TABLE`
4535                // registration, short-circuit into the FDW scan. Phase 3.2
4536                // wrappers don't yet support pushdown, so filters/projections
4537                // apply post-scan via `apply_foreign_table_filters` — good
4538                // enough for correctness; perf work lands in 3.2.3.
4539                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4540                    let records = self
4541                        .inner
4542                        .foreign_tables
4543                        .scan(&table.table)
4544                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4545                    let result = apply_foreign_table_filters(records, &table);
4546                    return Ok(RuntimeQueryResult {
4547                        query: query.to_string(),
4548                        mode,
4549                        statement,
4550                        engine: "runtime-fdw",
4551                        result,
4552                        affected_rows: 0,
4553                        statement_type: "select",
4554                    });
4555                }
4556
4557                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4558                //
4559                // When RLS is enabled on this table, fetch every policy
4560                // that applies to the current (role, SELECT) pair and
4561                // fold them into the query's WHERE clause: policies
4562                // OR-combine (any of them admitting the row is enough),
4563                // then AND into the caller's existing filter.
4564                //
4565                // Anonymous callers (no thread-local identity) pass
4566                // `role = None`; policies with a specific `TO role`
4567                // clause skip, but `TO PUBLIC` policies still apply.
4568                //
4569                // When `inject_rls_filters` returns `None` the table has
4570                // RLS enabled but no policy admits the caller's role —
4571                // short-circuit with an empty result set instead of
4572                // synthesising a contradiction filter.
4573                let Some(table_with_rls) = self.authorize_relational_table_select(
4574                    table,
4575                    &frame as &dyn super::statement_frame::ReadFrame,
4576                )?
4577                else {
4578                    let empty = crate::storage::query::unified::UnifiedResult::empty();
4579                    return Ok(RuntimeQueryResult {
4580                        query: query.to_string(),
4581                        mode,
4582                        statement,
4583                        engine: "runtime-table-rls",
4584                        result: empty,
4585                        affected_rows: 0,
4586                        statement_type: "select",
4587                    });
4588                };
4589                Ok(RuntimeQueryResult {
4590                    query: query.to_string(),
4591                    mode,
4592                    statement,
4593                    engine: "runtime-table",
4594                    result: execute_runtime_table_query(
4595                        &self.inner.db,
4596                        &table_with_rls,
4597                        Some(&self.inner.index_store),
4598                    )?,
4599                    affected_rows: 0,
4600                    statement_type: "select",
4601                })
4602            }
4603            QueryExpr::Join(join) => {
4604                // Fold per-table RLS filters into each `QueryExpr::Table`
4605                // leaf of the join tree before executing. Without this
4606                // the join executor scans both tables raw and ignores
4607                // policies — a `WITHIN TENANT 'x'` against a join of
4608                // two tenant-scoped tables would leak cross-tenant rows.
4609                // When any leaf has RLS enabled and zero matching policy,
4610                // short-circuit to an empty join result instead of
4611                // emitting a contradiction filter.
4612                let join_with_rls = match self.authorize_relational_join_select(
4613                    join,
4614                    &frame as &dyn super::statement_frame::ReadFrame,
4615                )? {
4616                    Some(j) => j,
4617                    None => {
4618                        return Ok(RuntimeQueryResult {
4619                            query: query.to_string(),
4620                            mode,
4621                            statement,
4622                            engine: "runtime-join-rls",
4623                            result: crate::storage::query::unified::UnifiedResult::empty(),
4624                            affected_rows: 0,
4625                            statement_type: "select",
4626                        });
4627                    }
4628                };
4629                Ok(RuntimeQueryResult {
4630                    query: query.to_string(),
4631                    mode,
4632                    statement,
4633                    engine: "runtime-join",
4634                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
4635                    affected_rows: 0,
4636                    statement_type: "select",
4637                })
4638            }
4639            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
4640                query: query.to_string(),
4641                mode,
4642                statement,
4643                engine: "runtime-vector",
4644                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
4645                affected_rows: 0,
4646                statement_type: "select",
4647            }),
4648            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
4649                query: query.to_string(),
4650                mode,
4651                statement,
4652                engine: "runtime-hybrid",
4653                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
4654                affected_rows: 0,
4655                statement_type: "select",
4656            }),
4657            // DML execution
4658            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
4659                Err(RedDBError::Query(
4660                    super::red_schema::READ_ONLY_ERROR.to_string(),
4661                ))
4662            }
4663            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
4664                Err(RedDBError::Query(
4665                    super::red_schema::READ_ONLY_ERROR.to_string(),
4666                ))
4667            }
4668            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
4669                Err(RedDBError::Query(
4670                    super::red_schema::READ_ONLY_ERROR.to_string(),
4671                ))
4672            }
4673            QueryExpr::Insert(ref insert) => self.execute_insert(query, insert),
4674            QueryExpr::Update(ref update) => self.execute_update(query, update),
4675            QueryExpr::Delete(ref delete) => self.execute_delete(query, delete),
4676            // DDL execution
4677            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
4678            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
4679            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
4680            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
4681            QueryExpr::DropDocument(ref drop_document) => {
4682                self.execute_drop_document(query, drop_document)
4683            }
4684            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
4685            QueryExpr::DropCollection(ref drop_collection) => {
4686                self.execute_drop_collection(query, drop_collection)
4687            }
4688            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
4689            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
4690            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
4691            // Graph analytics commands
4692            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
4693            // Search commands
4694            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
4695            // ASK: RAG query with LLM synthesis
4696            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
4697            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
4698            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
4699            QueryExpr::ProbabilisticCommand(ref cmd) => {
4700                self.execute_probabilistic_command(query, cmd)
4701            }
4702            // Time-series DDL
4703            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
4704            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
4705            // Queue DDL and commands
4706            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
4707            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
4708            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
4709            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
4710            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
4711            QueryExpr::EventsBackfill(ref backfill) => {
4712                self.execute_events_backfill(query, backfill)
4713            }
4714            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
4715                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
4716            ))),
4717            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
4718            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
4719            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
4720            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
4721            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
4722            // SET CONFIG key = value
4723            QueryExpr::SetConfig { ref key, ref value } => {
4724                if key.starts_with("red.secret.") {
4725                    return Err(RedDBError::Query(
4726                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
4727                    ));
4728                }
4729                let store = self.inner.db.store();
4730                let json_val = match value {
4731                    Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
4732                    Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
4733                    Value::Float(n) => crate::serde_json::Value::Number(*n),
4734                    Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
4735                    _ => crate::serde_json::Value::String(value.to_string()),
4736                };
4737                store.set_config_tree(key, &json_val);
4738                update_current_config_value(key, value.clone());
4739                // Config changes can flip runtime behavior mid-session
4740                // (auto_decrypt, auto_encrypt, etc.) — invalidate the
4741                // result cache so subsequent reads re-execute against
4742                // the new config.
4743                self.invalidate_result_cache();
4744                Ok(RuntimeQueryResult::ok_message(
4745                    query.to_string(),
4746                    &format!("config set: {key}"),
4747                    "set",
4748                ))
4749            }
4750            // SET SECRET key = value
4751            QueryExpr::SetSecret { ref key, ref value } => {
4752                if key.starts_with("red.config.") {
4753                    return Err(RedDBError::Query(
4754                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
4755                    ));
4756                }
4757                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4758                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
4759                })?;
4760                if matches!(value, Value::Null) {
4761                    auth_store
4762                        .vault_kv_try_delete(key)
4763                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4764                    update_current_secret_value(key, None);
4765                    self.invalidate_result_cache();
4766                    return Ok(RuntimeQueryResult::ok_message(
4767                        query.to_string(),
4768                        &format!("secret deleted: {key}"),
4769                        "delete_secret",
4770                    ));
4771                }
4772                let value = secret_sql_value_to_string(value)?;
4773                auth_store
4774                    .vault_kv_try_set(key.clone(), value.clone())
4775                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4776                update_current_secret_value(key, Some(value));
4777                self.invalidate_result_cache();
4778                Ok(RuntimeQueryResult::ok_message(
4779                    query.to_string(),
4780                    &format!("secret set: {key}"),
4781                    "set_secret",
4782                ))
4783            }
4784            // DELETE SECRET key
4785            QueryExpr::DeleteSecret { ref key } => {
4786                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4787                    RedDBError::Query(
4788                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
4789                    )
4790                })?;
4791                let deleted = auth_store
4792                    .vault_kv_try_delete(key)
4793                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4794                if deleted {
4795                    update_current_secret_value(key, None);
4796                }
4797                self.invalidate_result_cache();
4798                Ok(RuntimeQueryResult::ok_message(
4799                    query.to_string(),
4800                    &format!("secret deleted: {key}"),
4801                    if deleted {
4802                        "delete_secret"
4803                    } else {
4804                        "delete_secret_not_found"
4805                    },
4806                ))
4807            }
4808            // SHOW SECRET[S] [prefix]
4809            QueryExpr::ShowSecrets { ref prefix } => {
4810                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4811                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
4812                })?;
4813                if !auth_store.is_vault_backed() {
4814                    return Err(RedDBError::Query(
4815                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
4816                    ));
4817                }
4818                let mut keys = auth_store.vault_kv_keys();
4819                keys.sort();
4820                let mut result = UnifiedResult::with_columns(vec![
4821                    "key".into(),
4822                    "value".into(),
4823                    "status".into(),
4824                ]);
4825                for key in keys {
4826                    if let Some(ref pfx) = prefix {
4827                        if !key.starts_with(pfx) {
4828                            continue;
4829                        }
4830                    }
4831                    let mut record = UnifiedRecord::new();
4832                    record.set("key", Value::text(key));
4833                    record.set("value", Value::text("***"));
4834                    record.set("status", Value::text("active"));
4835                    result.push(record);
4836                }
4837                Ok(RuntimeQueryResult {
4838                    query: query.to_string(),
4839                    mode,
4840                    statement: "show_secrets",
4841                    engine: "runtime-secret",
4842                    result,
4843                    affected_rows: 0,
4844                    statement_type: "select",
4845                })
4846            }
4847            // SHOW CONFIG [prefix]
4848            QueryExpr::ShowConfig { ref prefix } => {
4849                let store = self.inner.db.store();
4850                let all_collections = store.list_collections();
4851                if !all_collections.contains(&"red_config".to_string()) {
4852                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
4853                    return Ok(RuntimeQueryResult {
4854                        query: query.to_string(),
4855                        mode,
4856                        statement: "show_config",
4857                        engine: "runtime-config",
4858                        result,
4859                        affected_rows: 0,
4860                        statement_type: "select",
4861                    });
4862                }
4863                let manager = store
4864                    .get_collection("red_config")
4865                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
4866                let entities = manager.query_all(|_| true);
4867                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
4868                for entity in entities {
4869                    if let EntityData::Row(ref row) = entity.data {
4870                        if let Some(ref named) = row.named {
4871                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
4872                            let val = named.get("value").cloned().unwrap_or(Value::Null);
4873                            let key_str = match &key_val {
4874                                Value::Text(s) => s.as_ref(),
4875                                _ => continue,
4876                            };
4877                            if let Some(ref pfx) = prefix {
4878                                if !key_str.starts_with(pfx.as_str()) {
4879                                    continue;
4880                                }
4881                            }
4882                            let entity_id = entity.id.raw();
4883                            match latest.get(key_str) {
4884                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
4885                                _ => {
4886                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
4887                                }
4888                            }
4889                        }
4890                    }
4891                }
4892                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
4893                for (_, key_val, val) in latest.into_values() {
4894                    let mut record = UnifiedRecord::new();
4895                    record.set("key", key_val);
4896                    record.set("value", val);
4897                    result.push(record);
4898                }
4899                Ok(RuntimeQueryResult {
4900                    query: query.to_string(),
4901                    mode,
4902                    statement: "show_config",
4903                    engine: "runtime-config",
4904                    result,
4905                    affected_rows: 0,
4906                    statement_type: "select",
4907                })
4908            }
4909            // Session-local multi-tenancy handle (Phase 2.5.3).
4910            //
4911            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
4912            // the thread-local; SHOW TENANT returns it. Paired with the
4913            // CURRENT_TENANT() scalar for use in RLS policies.
4914            QueryExpr::SetTenant(ref value) => {
4915                match value {
4916                    Some(id) => set_current_tenant(id.clone()),
4917                    None => clear_current_tenant(),
4918                }
4919                Ok(RuntimeQueryResult::ok_message(
4920                    query.to_string(),
4921                    &match value {
4922                        Some(id) => format!("tenant set: {id}"),
4923                        None => "tenant cleared".to_string(),
4924                    },
4925                    "set_tenant",
4926                ))
4927            }
4928            QueryExpr::ShowTenant => {
4929                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
4930                let mut record = UnifiedRecord::new();
4931                record.set(
4932                    "tenant",
4933                    current_tenant().map(Value::text).unwrap_or(Value::Null),
4934                );
4935                result.push(record);
4936                Ok(RuntimeQueryResult {
4937                    query: query.to_string(),
4938                    mode,
4939                    statement: "show_tenant",
4940                    engine: "runtime-tenant",
4941                    result,
4942                    affected_rows: 0,
4943                    statement_type: "select",
4944                })
4945            }
4946            // Transaction control (Phase 2.3 PG parity).
4947            //
4948            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
4949            // the current connection's id. COMMIT/ROLLBACK release it through
4950            // the `SnapshotManager` so future snapshots see the correct set of
4951            // active/aborted transactions.
4952            //
4953            // Tuple stamping (xmin/xmax) and read-path visibility filtering
4954            // land in Phase 2.3.2 — this dispatch only manages the snapshot
4955            // registry. Statements running outside a TxnContext still behave
4956            // as autocommit (xid=0 → visible to every snapshot).
            QueryExpr::TransactionControl(ref ctl) => {
                use crate::storage::query::ast::TxnControl;
                use crate::storage::transaction::snapshot::{TxnContext, Xid};
                use crate::storage::transaction::IsolationLevel;

                // Phase 2.3 keys transactions by a thread-local connection id.
                // The stdio/gRPC paths wire a real per-connection id later;
                // for embedded use (one RedDBRuntime per process-ish caller)
                // we fall back to a deterministic placeholder.
                let conn_id = current_connection_id();

                // Each control arm resolves to (statement kind, human-readable
                // message); the shared ok_message return at the bottom wraps them.
                let (kind, msg) = match ctl {
                    TxnControl::Begin => {
                        let mgr = Arc::clone(&self.inner.snapshot_manager);
                        let xid = mgr.begin();
                        // Snapshot captured at BEGIN and stored on the context.
                        let snapshot = mgr.snapshot(xid);
                        let ctx = TxnContext {
                            xid,
                            isolation: IsolationLevel::SnapshotIsolation,
                            snapshot,
                            savepoints: Vec::new(),
                            released_sub_xids: Vec::new(),
                        };
                        // NOTE(review): a nested BEGIN silently replaces any
                        // existing context for this connection — the old xid is
                        // never committed or rolled back. Confirm intended.
                        self.inner.tx_contexts.write().insert(conn_id, ctx);
                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
                    }
                    TxnControl::Commit => {
                        // SET LOCAL TENANT ends with the transaction.
                        self.inner.tx_local_tenants.write().remove(&conn_id);
                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
                        match ctx {
                            Some(ctx) => {
                                // Phase 2.3.2e: commit every open sub-xid
                                // so they also become visible. Their
                                // work is promoted to the parent txn's
                                // result exactly like a RELEASE would
                                // have done.
                                for (_, sub) in &ctx.savepoints {
                                    self.inner.snapshot_manager.commit(*sub);
                                }
                                for sub in &ctx.released_sub_xids {
                                    self.inner.snapshot_manager.commit(*sub);
                                }
                                // Parent xid commits last, after all sub-xids.
                                self.inner.snapshot_manager.commit(ctx.xid);
                                // Phase 2.3.2b: physically remove tuples the txn
                                // marked for deletion. Before commit the rows
                                // only had their xmax stamped — now the
                                // deletion is durable.
                                self.finalize_pending_tombstones(conn_id);
                                self.finalize_pending_kv_watch_events(conn_id);
                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
                            }
                            None => (
                                "commit",
                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
                            ),
                        }
                    }
                    TxnControl::Rollback => {
                        // SET LOCAL TENANT also ends on rollback.
                        self.inner.tx_local_tenants.write().remove(&conn_id);
                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
                        match ctx {
                            Some(ctx) => {
                                // Phase 2.3.2e: abort every open sub-xid
                                // too so their writes stay hidden.
                                for (_, sub) in &ctx.savepoints {
                                    self.inner.snapshot_manager.rollback(*sub);
                                }
                                for sub in &ctx.released_sub_xids {
                                    self.inner.snapshot_manager.rollback(*sub);
                                }
                                self.inner.snapshot_manager.rollback(ctx.xid);
                                // Phase 2.3.2b: tuples that the txn had
                                // xmax-stamped become live again — wipe xmax
                                // back to 0 so later snapshots see them.
                                self.revive_pending_tombstones(conn_id);
                                self.discard_pending_kv_watch_events(conn_id);
                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
                            }
                            None => (
                                "rollback",
                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
                            ),
                        }
                    }
                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
                    // SAVEPOINT allocates a fresh xid and pushes it
                    // onto the per-txn stack so subsequent writes can
                    // be selectively rolled back. RELEASE pops without
                    // aborting; ROLLBACK TO aborts the sub-xid (and
                    // any nested ones) + revives their tombstones.
                    TxnControl::Savepoint(name) => {
                        let mgr = Arc::clone(&self.inner.snapshot_manager);
                        let mut guard = self.inner.tx_contexts.write();
                        match guard.get_mut(&conn_id) {
                            Some(ctx) => {
                                // NOTE(review): the sub-xid gets no snapshot of
                                // its own — presumably reads keep using the
                                // parent's BEGIN snapshot; confirm.
                                let sub = mgr.begin();
                                ctx.savepoints.push((name.clone(), sub));
                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
                            }
                            None => (
                                "savepoint",
                                "SAVEPOINT outside transaction — no-op".to_string(),
                            ),
                        }
                    }
                    TxnControl::ReleaseSavepoint(name) => {
                        let mut guard = self.inner.tx_contexts.write();
                        match guard.get_mut(&conn_id) {
                            Some(ctx) => {
                                // Unknown savepoint name is an error (PG parity).
                                let pos = ctx
                                    .savepoints
                                    .iter()
                                    .position(|(n, _)| n == name)
                                    .ok_or_else(|| {
                                        RedDBError::Internal(format!(
                                            "savepoint {name} does not exist"
                                        ))
                                    })?;
                                // RELEASE pops the named savepoint and
                                // any nested ones. Their sub-xids move
                                // to `released_sub_xids` so they commit
                                // (or roll back) alongside the parent
                                // xid — PG semantics: released
                                // savepoints still contribute their
                                // work, but their names are gone.
                                let released = ctx.savepoints.len() - pos;
                                let popped: Vec<Xid> = ctx
                                    .savepoints
                                    .split_off(pos)
                                    .into_iter()
                                    .map(|(_, x)| x)
                                    .collect();
                                ctx.released_sub_xids.extend(popped);
                                (
                                    "release_savepoint",
                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
                                )
                            }
                            None => (
                                "release_savepoint",
                                "RELEASE outside transaction — no-op".to_string(),
                            ),
                        }
                    }
                    TxnControl::RollbackToSavepoint(name) => {
                        let mgr = Arc::clone(&self.inner.snapshot_manager);
                        // Splice out the savepoint + nested ones under
                        // a narrow lock, then run the snapshot-manager
                        // + tombstone side-effects without the tx map
                        // held so nothing re-enters.
                        let drop_result: Option<(Xid, Vec<Xid>)> = {
                            let mut guard = self.inner.tx_contexts.write();
                            if let Some(ctx) = guard.get_mut(&conn_id) {
                                let pos = ctx
                                    .savepoints
                                    .iter()
                                    .position(|(n, _)| n == name)
                                    .ok_or_else(|| {
                                        RedDBError::Internal(format!(
                                            "savepoint {name} does not exist"
                                        ))
                                    })?;
                                let savepoint_xid = ctx.savepoints[pos].1;
                                // Abort the named savepoint and everything
                                // nested under it (all entries from pos on).
                                let aborted: Vec<Xid> = ctx
                                    .savepoints
                                    .split_off(pos)
                                    .into_iter()
                                    .map(|(_, x)| x)
                                    .collect();
                                Some((savepoint_xid, aborted))
                            } else {
                                None
                            }
                        };

                        match drop_result {
                            Some((savepoint_xid, aborted)) => {
                                for x in &aborted {
                                    mgr.rollback(*x);
                                }
                                // Undo xmax stamps made at/after the savepoint.
                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
                                (
                                    "rollback_to_savepoint",
                                    format!(
                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), revived {revived} tombstone(s)",
                                        aborted.len()
                                    ),
                                )
                            }
                            None => (
                                "rollback_to_savepoint",
                                "ROLLBACK TO outside transaction — no-op".to_string(),
                            ),
                        }
                    }
                };
                Ok(RuntimeQueryResult::ok_message(
                    query.to_string(),
                    &msg,
                    kind,
                ))
            }
5160            // Schema + Sequence DDL (Phase 1.3 PG parity).
5161            //
5162            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5163            // just registers the name in `red_config` under `schema.{name}`.
5164            // Table lookups still happen by collection name; clients using
5165            // `schema.table` qualified names collapse to collection `schema.table`.
5166            //
5167            // Sequences persist a 64-bit counter + metadata (start, increment)
5168            // in `red_config` under `sequence.{name}.*`. Scalar callers
5169            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
5170            // once we have a proper mutating-function dispatch path; for now the
5171            // DDL just establishes the catalog entry so clients don't error.
5172            QueryExpr::CreateSchema(ref q) => {
5173                let store = self.inner.db.store();
5174                let key = format!("schema.{}", q.name);
5175                if store.get_config(&key).is_some() {
5176                    if q.if_not_exists {
5177                        return Ok(RuntimeQueryResult::ok_message(
5178                            query.to_string(),
5179                            &format!("schema {} already exists — skipped", q.name),
5180                            "create_schema",
5181                        ));
5182                    }
5183                    return Err(RedDBError::Internal(format!(
5184                        "schema {} already exists",
5185                        q.name
5186                    )));
5187                }
5188                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5189                Ok(RuntimeQueryResult::ok_message(
5190                    query.to_string(),
5191                    &format!("schema {} created", q.name),
5192                    "create_schema",
5193                ))
5194            }
5195            QueryExpr::DropSchema(ref q) => {
5196                let store = self.inner.db.store();
5197                let key = format!("schema.{}", q.name);
5198                let existed = store.get_config(&key).is_some();
5199                if !existed && !q.if_exists {
5200                    return Err(RedDBError::Internal(format!(
5201                        "schema {} does not exist",
5202                        q.name
5203                    )));
5204                }
5205                // Remove marker from red_config via set to null.
5206                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5207                let suffix = if q.cascade {
5208                    " (CASCADE accepted — tables untouched)"
5209                } else {
5210                    ""
5211                };
5212                Ok(RuntimeQueryResult::ok_message(
5213                    query.to_string(),
5214                    &format!("schema {} dropped{}", q.name, suffix),
5215                    "drop_schema",
5216                ))
5217            }
            QueryExpr::CreateSequence(ref q) => {
                let store = self.inner.db.store();
                // Catalog layout: sequence.{name}.{start,increment,current}.
                let base = format!("sequence.{}", q.name);
                let start_key = format!("{base}.start");
                let incr_key = format!("{base}.increment");
                let curr_key = format!("{base}.current");
                // Presence of `.start` is the existence marker for a sequence.
                if store.get_config(&start_key).is_some() {
                    if q.if_not_exists {
                        return Ok(RuntimeQueryResult::ok_message(
                            query.to_string(),
                            &format!("sequence {} already exists — skipped", q.name),
                            "create_sequence",
                        ));
                    }
                    return Err(RedDBError::Internal(format!(
                        "sequence {} already exists",
                        q.name
                    )));
                }
                // Persist start + increment, and set current so the first
                // nextval returns `start`.
                // NOTE(review): `start - increment` can overflow for extreme
                // values (panics in debug builds), and the f64 catalog encoding
                // loses integer precision above 2^53 — confirm acceptable ranges.
                let initial_current = q.start - q.increment;
                store.set_config_tree(
                    &start_key,
                    &crate::serde_json::Value::Number(q.start as f64),
                );
                store.set_config_tree(
                    &incr_key,
                    &crate::serde_json::Value::Number(q.increment as f64),
                );
                store.set_config_tree(
                    &curr_key,
                    &crate::serde_json::Value::Number(initial_current as f64),
                );
                Ok(RuntimeQueryResult::ok_message(
                    query.to_string(),
                    &format!(
                        "sequence {} created (start={}, increment={})",
                        q.name, q.start, q.increment
                    ),
                    "create_sequence",
                ))
            }
5261            QueryExpr::DropSequence(ref q) => {
5262                let store = self.inner.db.store();
5263                let base = format!("sequence.{}", q.name);
5264                let existed = store.get_config(&format!("{base}.start")).is_some();
5265                if !existed && !q.if_exists {
5266                    return Err(RedDBError::Internal(format!(
5267                        "sequence {} does not exist",
5268                        q.name
5269                    )));
5270                }
5271                for k in ["start", "increment", "current"] {
5272                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5273                }
5274                Ok(RuntimeQueryResult::ok_message(
5275                    query.to_string(),
5276                    &format!("sequence {} dropped", q.name),
5277                    "drop_sequence",
5278                ))
5279            }
5280            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5281            //
5282            // The view definition is stored in-memory on RuntimeInner (not
5283            // persisted). SELECTs that reference the view name will substitute
5284            // the stored `QueryExpr` via `resolve_view_reference` during
5285            // planning (same entry point used by table-name resolution).
5286            //
5287            // Materialized views additionally allocate a slot in
5288            // `MaterializedViewCache`; a REFRESH repopulates that slot.
5289            QueryExpr::CreateView(ref q) => {
5290                let mut views = self.inner.views.write();
5291                if views.contains_key(&q.name) && !q.or_replace {
5292                    if q.if_not_exists {
5293                        return Ok(RuntimeQueryResult::ok_message(
5294                            query.to_string(),
5295                            &format!("view {} already exists — skipped", q.name),
5296                            "create_view",
5297                        ));
5298                    }
5299                    return Err(RedDBError::Internal(format!(
5300                        "view {} already exists",
5301                        q.name
5302                    )));
5303                }
5304                views.insert(q.name.clone(), Arc::new(q.clone()));
5305                drop(views);
5306
5307                // Materialized view: register cache slot (data is empty until REFRESH).
5308                if q.materialized {
5309                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5310                    let def = MaterializedViewDef {
5311                        name: q.name.clone(),
5312                        query: format!("<parsed view {}>", q.name),
5313                        dependencies: collect_table_refs(&q.query),
5314                        refresh: RefreshPolicy::Manual,
5315                    };
5316                    self.inner.materialized_views.write().register(def);
5317                }
5318                // Plan cache may have cached a plan that didn't know about this
5319                // view — invalidate so future references pick up the new binding.
5320                // Result cache gets flushed too: OR REPLACE must not serve a
5321                // prior execution of the obsolete body.
5322                self.invalidate_plan_cache();
5323                self.invalidate_result_cache();
5324
5325                Ok(RuntimeQueryResult::ok_message(
5326                    query.to_string(),
5327                    &format!(
5328                        "{}view {} created",
5329                        if q.materialized { "materialized " } else { "" },
5330                        q.name
5331                    ),
5332                    "create_view",
5333                ))
5334            }
5335            QueryExpr::DropView(ref q) => {
5336                let mut views = self.inner.views.write();
5337                let existed = views.remove(&q.name).is_some();
5338                drop(views);
5339                if q.materialized || existed {
5340                    // Try the materialised cache too — silent if absent.
5341                    self.inner.materialized_views.write().remove(&q.name);
5342                }
5343                // Drop any plan / result cache entries that baked the
5344                // view body into their QueryExpr.
5345                self.invalidate_plan_cache();
5346                self.invalidate_result_cache();
5347                if !existed && !q.if_exists {
5348                    return Err(RedDBError::Internal(format!(
5349                        "view {} does not exist",
5350                        q.name
5351                    )));
5352                }
5353                self.invalidate_plan_cache();
5354                Ok(RuntimeQueryResult::ok_message(
5355                    query.to_string(),
5356                    &format!("view {} dropped", q.name),
5357                    "drop_view",
5358                ))
5359            }
            QueryExpr::RefreshMaterializedView(ref q) => {
                // Look up the view definition, execute its underlying query,
                // and stash the serialized result in the materialised cache.
                let view = {
                    let views = self.inner.views.read();
                    views.get(&q.name).cloned()
                };
                let view = match view {
                    Some(v) => v,
                    None => {
                        return Err(RedDBError::Internal(format!(
                            "view {} does not exist",
                            q.name
                        )))
                    }
                };
                // REFRESH only applies to materialized views; plain views are
                // recomputed on every read.
                if !view.materialized {
                    return Err(RedDBError::Internal(format!(
                        "view {} is not materialized — REFRESH requires \
                         CREATE MATERIALIZED VIEW",
                        q.name
                    )));
                }
                // Execute the underlying query fresh.
                let inner_result = self.execute_query_expr((*view.query).clone())?;
                // Cache data = JSON-serialised result (opaque blob; read path
                // returns it verbatim for now).
                // NOTE(review): this is the Debug formatting of the result, not
                // actual JSON — confirm the read path expects that shape.
                let serialized = format!("{:?}", inner_result.result);
                self.inner
                    .materialized_views
                    .write()
                    .refresh(&q.name, serialized.into_bytes());
                Ok(RuntimeQueryResult::ok_message(
                    query.to_string(),
                    &format!("materialized view {} refreshed", q.name),
                    "refresh_materialized_view",
                ))
            }
5398            // Row Level Security (Phase 2.5 PG parity).
5399            //
5400            // Policies live in an in-memory registry keyed by (table, name).
5401            // Enforcement (AND-ing the policy's USING clause into every
5402            // query's WHERE for the table) arrives in Phase 2.5.2 via the
5403            // filter compiler; this dispatch only manages the catalog.
5404            QueryExpr::CreatePolicy(ref q) => {
5405                let key = (q.table.clone(), q.name.clone());
5406                self.inner
5407                    .rls_policies
5408                    .write()
5409                    .insert(key, Arc::new(q.clone()));
5410                self.invalidate_plan_cache();
5411                // Issue #120 — surface policy names in the
5412                // schema-vocabulary so AskPipeline (#121) can resolve
5413                // a policy reference back to its table.
5414                self.schema_vocabulary_apply(
5415                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
5416                        collection: q.table.clone(),
5417                        policy: q.name.clone(),
5418                    },
5419                );
5420                Ok(RuntimeQueryResult::ok_message(
5421                    query.to_string(),
5422                    &format!("policy {} on {} created", q.name, q.table),
5423                    "create_policy",
5424                ))
5425            }
5426            QueryExpr::DropPolicy(ref q) => {
5427                let removed = self
5428                    .inner
5429                    .rls_policies
5430                    .write()
5431                    .remove(&(q.table.clone(), q.name.clone()))
5432                    .is_some();
5433                if !removed && !q.if_exists {
5434                    return Err(RedDBError::Internal(format!(
5435                        "policy {} on {} does not exist",
5436                        q.name, q.table
5437                    )));
5438                }
5439                self.invalidate_plan_cache();
5440                // Issue #120 — keep the schema-vocabulary policy
5441                // entry in sync.
5442                self.schema_vocabulary_apply(
5443                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
5444                        collection: q.table.clone(),
5445                        policy: q.name.clone(),
5446                    },
5447                );
5448                Ok(RuntimeQueryResult::ok_message(
5449                    query.to_string(),
5450                    &format!("policy {} on {} dropped", q.name, q.table),
5451                    "drop_policy",
5452                ))
5453            }
5454            // Foreign Data Wrappers (Phase 3.2 PG parity).
5455            //
5456            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
5457            // `ForeignTableRegistry`. The read path consults that registry
5458            // before dispatching a SELECT — when the table name matches a
5459            // registered foreign table, we forward the scan to the wrapper
5460            // and skip the normal collection lookup.
5461            //
5462            // Phase 3.2 is in-memory only; persistence across restarts is a
5463            // 3.2.2 follow-up that mirrors the view registry pattern.
5464            QueryExpr::CreateServer(ref q) => {
5465                use crate::storage::fdw::FdwOptions;
5466                let registry = Arc::clone(&self.inner.foreign_tables);
5467                if registry.server(&q.name).is_some() {
5468                    if q.if_not_exists {
5469                        return Ok(RuntimeQueryResult::ok_message(
5470                            query.to_string(),
5471                            &format!("server {} already exists — skipped", q.name),
5472                            "create_server",
5473                        ));
5474                    }
5475                    return Err(RedDBError::Internal(format!(
5476                        "server {} already exists",
5477                        q.name
5478                    )));
5479                }
5480                let mut opts = FdwOptions::new();
5481                for (k, v) in &q.options {
5482                    opts.values.insert(k.clone(), v.clone());
5483                }
5484                registry
5485                    .create_server(&q.name, &q.wrapper, opts)
5486                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5487                Ok(RuntimeQueryResult::ok_message(
5488                    query.to_string(),
5489                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
5490                    "create_server",
5491                ))
5492            }
5493            QueryExpr::DropServer(ref q) => {
5494                let existed = self.inner.foreign_tables.drop_server(&q.name);
5495                if !existed && !q.if_exists {
5496                    return Err(RedDBError::Internal(format!(
5497                        "server {} does not exist",
5498                        q.name
5499                    )));
5500                }
5501                Ok(RuntimeQueryResult::ok_message(
5502                    query.to_string(),
5503                    &format!(
5504                        "server {} dropped{}",
5505                        q.name,
5506                        if q.cascade { " (cascade)" } else { "" }
5507                    ),
5508                    "drop_server",
5509                ))
5510            }
5511            QueryExpr::CreateForeignTable(ref q) => {
5512                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
5513                let registry = Arc::clone(&self.inner.foreign_tables);
5514                if registry.foreign_table(&q.name).is_some() {
5515                    if q.if_not_exists {
5516                        return Ok(RuntimeQueryResult::ok_message(
5517                            query.to_string(),
5518                            &format!("foreign table {} already exists — skipped", q.name),
5519                            "create_foreign_table",
5520                        ));
5521                    }
5522                    return Err(RedDBError::Internal(format!(
5523                        "foreign table {} already exists",
5524                        q.name
5525                    )));
5526                }
5527                let mut opts = FdwOptions::new();
5528                for (k, v) in &q.options {
5529                    opts.values.insert(k.clone(), v.clone());
5530                }
5531                let columns: Vec<ForeignColumn> = q
5532                    .columns
5533                    .iter()
5534                    .map(|c| ForeignColumn {
5535                        name: c.name.clone(),
5536                        data_type: c.data_type.clone(),
5537                        not_null: c.not_null,
5538                    })
5539                    .collect();
5540                registry
5541                    .create_foreign_table(ForeignTable {
5542                        name: q.name.clone(),
5543                        server_name: q.server.clone(),
5544                        columns,
5545                        options: opts,
5546                    })
5547                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5548                self.invalidate_plan_cache();
5549                Ok(RuntimeQueryResult::ok_message(
5550                    query.to_string(),
5551                    &format!("foreign table {} created (server {})", q.name, q.server),
5552                    "create_foreign_table",
5553                ))
5554            }
5555            QueryExpr::DropForeignTable(ref q) => {
5556                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
5557                if !existed && !q.if_exists {
5558                    return Err(RedDBError::Internal(format!(
5559                        "foreign table {} does not exist",
5560                        q.name
5561                    )));
5562                }
5563                self.invalidate_plan_cache();
5564                Ok(RuntimeQueryResult::ok_message(
5565                    query.to_string(),
5566                    &format!("foreign table {} dropped", q.name),
5567                    "drop_foreign_table",
5568                ))
5569            }
5570            // COPY table FROM 'path' (Phase 1.5 PG parity).
5571            //
5572            // Stream CSV rows through the shared `CsvImporter`. The collection
5573            // is auto-created on first insert (via `insert_auto`-style path);
5574            // VACUUM/ANALYZE afterwards is up to the caller.
5575            QueryExpr::CopyFrom(ref q) => {
5576                use crate::storage::import::{CsvConfig, CsvImporter};
5577                let store = self.inner.db.store();
5578                let cfg = CsvConfig {
5579                    collection: q.table.clone(),
5580                    has_header: q.has_header,
5581                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
5582                    ..CsvConfig::default()
5583                };
5584                let importer = CsvImporter::new(cfg);
5585                let stats = importer
5586                    .import_file(&q.path, store.as_ref())
5587                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
5588                // Tables are written → invalidate cached plans / result cache.
5589                self.note_table_write(&q.table);
5590                Ok(RuntimeQueryResult::ok_message(
5591                    query.to_string(),
5592                    &format!(
5593                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
5594                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
5595                    ),
5596                    "copy_from",
5597                ))
5598            }
5599            // Maintenance commands (Phase 1.2 PG parity).
5600            //
5601            // - VACUUM [FULL] [table]: refreshes planner stats for the target
5602            //   collection(s) and — when FULL — triggers a full pager persist
5603            //   (flushes dirty pages + fsync). Also invalidates the result cache
5604            //   so subsequent reads re-execute against the freshly compacted
5605            //   storage. RedDB's segment/btree GC runs continuously via the
5606            //   background lifecycle; explicit space reclamation for sealed
5607            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
5608            // - ANALYZE [table]: reruns `analyze_collection` +
5609            //   `persist_table_stats` via `refresh_table_planner_stats` so the
5610            //   planner has fresh histograms, distinct estimates, null counts.
5611            //
5612            // Both commands accept an optional target; omitting the target
5613            // iterates every collection in the store.
5614            QueryExpr::MaintenanceCommand(ref cmd) => {
5615                use crate::storage::query::ast::MaintenanceCommand as Mc;
5616                let store = self.inner.db.store();
5617                let (kind, msg) = match cmd {
5618                    Mc::Analyze { target } => {
5619                        let targets: Vec<String> = match target {
5620                            Some(t) => vec![t.clone()],
5621                            None => store.list_collections(),
5622                        };
5623                        for t in &targets {
5624                            self.refresh_table_planner_stats(t);
5625                        }
5626                        (
5627                            "analyze",
5628                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
5629                        )
5630                    }
5631                    Mc::Vacuum { target, full } => {
5632                        let targets: Vec<String> = match target {
5633                            Some(t) => vec![t.clone()],
5634                            None => store.list_collections(),
5635                        };
5636                        // Stats refresh covers every target (same as ANALYZE).
5637                        for t in &targets {
5638                            self.refresh_table_planner_stats(t);
5639                        }
5640                        // FULL forces a pager persist (dirty-page flush + fsync).
5641                        // Regular VACUUM relies on the background writer / segment
5642                        // lifecycle so the command is non-blocking.
5643                        let persisted = if *full {
5644                            match store.persist() {
5645                                Ok(()) => true,
5646                                Err(e) => {
5647                                    return Err(RedDBError::Internal(format!(
5648                                        "VACUUM FULL persist failed: {e:?}"
5649                                    )));
5650                                }
5651                            }
5652                        } else {
5653                            false
5654                        };
5655                        // Result cache depended on pre-vacuum state.
5656                        self.invalidate_result_cache();
5657                        (
5658                            "vacuum",
5659                            format!(
5660                                "VACUUM{} processed {} table(s){}",
5661                                if *full { " FULL" } else { "" },
5662                                targets.len(),
5663                                if persisted {
5664                                    " (pages flushed to disk)"
5665                                } else {
5666                                    ""
5667                                }
5668                            ),
5669                        )
5670                    }
5671                };
5672                Ok(RuntimeQueryResult::ok_message(
5673                    query.to_string(),
5674                    &msg,
5675                    kind,
5676                ))
5677            }
5678            // GRANT / REVOKE / ALTER USER (RBAC milestone).
5679            //
5680            // These hit the AuthStore directly. The privilege-check
5681            // gate at the top of `execute_query_expr` already decided
5682            // whether the caller may even run the statement; here we
5683            // just translate the AST into AuthStore calls.
5684            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
5685            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
5686            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
5687            QueryExpr::CreateIamPolicy { ref id, ref json } => {
5688                self.execute_create_iam_policy(query, id, json)
5689            }
5690            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
5691            QueryExpr::AttachPolicy {
5692                ref policy_id,
5693                ref principal,
5694            } => self.execute_attach_policy(query, policy_id, principal),
5695            QueryExpr::DetachPolicy {
5696                ref policy_id,
5697                ref principal,
5698            } => self.execute_detach_policy(query, policy_id, principal),
5699            QueryExpr::ShowPolicies { ref filter } => {
5700                self.execute_show_policies(query, filter.as_ref())
5701            }
5702            QueryExpr::ShowEffectivePermissions {
5703                ref user,
5704                ref resource,
5705            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
5706            QueryExpr::SimulatePolicy {
5707                ref user,
5708                ref action,
5709                ref resource,
5710            } => self.execute_simulate_policy(query, user, action, resource),
5711            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
5712            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
5713            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
5714            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
5715        };
5716
5717        // Decrypt Value::Secret columns in-place before caching, so
5718        // cached results match the post-decrypt shape and repeat
5719        // queries skip the per-row AES-GCM pass.
5720        let mut query_result = query_result;
5721        if let Ok(ref mut result) = query_result {
5722            if result.statement_type == "select" {
5723                self.apply_secret_decryption(result);
5724            }
5725        }
5726
5727        // Cache SELECT results for 30s.
5728        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
5729        // Large multi-row results (range scans, filtered scans) are rarely
5730        // repeated with the same literal values so the cache hit rate is near
5731        // zero while the clone cost (100 records × ~16 fields each) is high.
5732        // Aggregations (1 row) and point lookups (1 row) still benefit.
5733        if let Ok(ref result) = query_result {
5734            frame.write_result_cache(self, result, result_cache_scopes);
5735        }
5736
5737        query_result
5738    }
5739
5740    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
5741    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
5742    /// calls pay zero parse + cache overhead.
5743    ///
5744    /// Applies secret decryption on SELECT results, identical to `execute_query`.
5745    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
5746        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
5747        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
5748        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
5749        // whose `tq.table` matches a registered view with the view's
5750        // underlying query. Safe to call even when no views are registered.
5751        let expr = self.rewrite_view_refs(expr);
5752
5753        self.validate_model_operations_before_auth(&expr)?;
5754        // Granular RBAC privilege check. Runs before dispatch so a
5755        // denied caller never reaches storage. Fail-closed: any error
5756        // resolving the action / resource produces PermissionDenied.
5757        if let Err(err) = self.check_query_privilege(&expr) {
5758            return Err(RedDBError::Query(format!("permission denied: {err}")));
5759        }
5760
5761        let statement = query_expr_name(&expr);
5762        let mode = detect_mode(statement);
5763        let query_str = statement;
5764
5765        let result = self.dispatch_expr(expr, query_str, mode)?;
5766        let mut r = result;
5767        if r.statement_type == "select" {
5768            self.apply_secret_decryption(&mut r);
5769        }
5770        Ok(r)
5771    }
5772
    /// Pre-authorization validation for statements that carry a collection
    /// model expectation.
    ///
    /// Runs before the RBAC privilege gate and enforces two invariants:
    /// 1. Destructive DDL (the `Drop*` family and `TRUNCATE`) may not target
    ///    a system-schema collection — the system schema is read-only.
    /// 2. When the statement declares a model (e.g. `DROP GRAPH g`, a typed
    ///    KV command), that model must match the catalog's recorded model
    ///    for the target collection.
    ///
    /// Statements with no model expectation, and targets unknown to the
    /// catalog, pass through with `Ok(())`.
    pub(super) fn validate_model_operations_before_auth(
        &self,
        expr: &QueryExpr,
    ) -> RedDBResult<()> {
        use crate::catalog::CollectionModel;
        use crate::runtime::ddl::polymorphic_resolver;
        use crate::storage::query::ast::KvCommand;

        // Check 1: extract the target name of any destructive statement so
        // the system-schema guard below can reject it up front.
        let system_schema_target = match expr {
            QueryExpr::DropTable(q) => Some(q.name.as_str()),
            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
            QueryExpr::DropVector(q) => Some(q.name.as_str()),
            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
            QueryExpr::DropKv(q) => Some(q.name.as_str()),
            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
            QueryExpr::Truncate(q) => Some(q.name.as_str()),
            _ => None,
        };
        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
            return Err(RedDBError::Query("system schema is read-only".to_string()));
        }

        // Check 2: map the statement to (target name, expected model).
        // `None` means the statement makes no model claim at all.
        let expected = match expr {
            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
            // DROP KV carries its own declared model on the AST node.
            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
            // TRUNCATE only constrains the model when one was spelled out.
            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
            QueryExpr::KvCommand(cmd) => {
                let (collection, model) = match cmd {
                    // Typed data commands carry an explicit model field.
                    KvCommand::Put {
                        collection, model, ..
                    }
                    | KvCommand::Get {
                        collection, model, ..
                    }
                    | KvCommand::Incr {
                        collection, model, ..
                    }
                    | KvCommand::Cas {
                        collection, model, ..
                    }
                    | KvCommand::Delete {
                        collection, model, ..
                    } => (collection.as_str(), *model),
                    // Secret-lifecycle commands are Vault-only by definition.
                    KvCommand::Rotate { collection, .. }
                    | KvCommand::History { collection, .. }
                    | KvCommand::List { collection, .. }
                    | KvCommand::Purge { collection, .. } => {
                        (collection.as_str(), CollectionModel::Vault)
                    }
                    // Tag invalidation is a KV-cache operation.
                    KvCommand::InvalidateTags { collection, .. } => {
                        (collection.as_str(), CollectionModel::Kv)
                    }
                    KvCommand::Watch {
                        collection, model, ..
                    } => (collection.as_str(), *model),
                    // Unsealing only applies to Vault collections.
                    KvCommand::Unseal { collection, .. } => {
                        (collection.as_str(), CollectionModel::Vault)
                    }
                };
                Some((collection, model))
            }
            // Config commands get their own validation; no model claim here.
            QueryExpr::ConfigCommand(cmd) => {
                self.validate_config_command_before_auth(cmd)?;
                None
            }
            _ => None,
        };

        let Some((name, expected_model)) = expected else {
            // Statement makes no model claim — nothing to verify.
            return Ok(());
        };
        // Resolve the collection's effective model: the declared model wins
        // over the inferred one when both are present.
        let snapshot = self.inner.db.catalog_model_snapshot();
        let Some(actual_model) = snapshot
            .collections
            .iter()
            .find(|collection| collection.name == name)
            .map(|collection| collection.declared_model.unwrap_or(collection.model))
        else {
            // Unknown collection — let downstream "not found" handling speak.
            return Ok(());
        };
        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
    }
5858
5859    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
5860    /// `tq.table` matches a registered view name with the view's stored
5861    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
5862    /// resolves correctly. Pure operation — no side effects.
5863    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
5864        // Fast path: no views registered → return original expression.
5865        if self.inner.views.read().is_empty() {
5866            return expr;
5867        }
5868        self.rewrite_view_refs_inner(expr)
5869    }
5870
5871    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
5872        use crate::storage::query::ast::{Filter, TableSource};
5873        match expr {
5874            QueryExpr::Table(mut tq) => {
5875                // 1. If the TableSource is a subquery, recurse into it so
5876                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
5877                //    The legacy `table` field (set to a synthetic
5878                //    "__subq_NNNN" sentinel) stays as-is so callers that
5879                //    read it keep compiling.
5880                if let Some(TableSource::Subquery(body)) = tq.source.take() {
5881                    tq.source = Some(TableSource::Subquery(Box::new(
5882                        self.rewrite_view_refs_inner(*body),
5883                    )));
5884                    return QueryExpr::Table(tq);
5885                }
5886
5887                // 2. Restore the source field (took it above for match).
5888                // When the source was `None` or `TableSource::Name(_)`, the
5889                // real lookup key is `tq.table` — check the view registry.
5890                let maybe_view = {
5891                    let views = self.inner.views.read();
5892                    views.get(&tq.table).cloned()
5893                };
5894                let Some(view) = maybe_view else {
5895                    return QueryExpr::Table(tq);
5896                };
5897
5898                // Recurse into the view body — views may reference other
5899                // views. The recursion yields the final QueryExpr we need
5900                // to merge the outer's filter / limit / offset into.
5901                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
5902
5903                // Phase 5: when the body is a Table we merge the outer
5904                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
5905                // views filter recursively. Non-table bodies (Search,
5906                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
5907                // with an outer Table query today — return the body
5908                // verbatim; outer predicates are lost. Full projection
5909                // merge lands in Phase 5.2.
5910                match inner_expr {
5911                    QueryExpr::Table(mut inner_tq) => {
5912                        if let Some(outer_filter) = tq.filter.take() {
5913                            inner_tq.filter = Some(match inner_tq.filter.take() {
5914                                Some(existing) => {
5915                                    Filter::And(Box::new(existing), Box::new(outer_filter))
5916                                }
5917                                None => outer_filter,
5918                            });
5919                        }
5920                        if let Some(outer_limit) = tq.limit {
5921                            inner_tq.limit = Some(match inner_tq.limit {
5922                                Some(existing) => existing.min(outer_limit),
5923                                None => outer_limit,
5924                            });
5925                        }
5926                        if let Some(outer_offset) = tq.offset {
5927                            inner_tq.offset = Some(match inner_tq.offset {
5928                                Some(existing) => existing + outer_offset,
5929                                None => outer_offset,
5930                            });
5931                        }
5932                        QueryExpr::Table(inner_tq)
5933                    }
5934                    other => other,
5935                }
5936            }
5937            QueryExpr::Join(mut jq) => {
5938                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
5939                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
5940                QueryExpr::Join(jq)
5941            }
5942            // Other variants don't carry nested QueryExpr that can reference
5943            // a view by table name. Return as-is.
5944            other => other,
5945        }
5946    }
5947
5948    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
5949    /// Shared by `execute_query` (after parse/cache) and `execute_query_expr`
5950    /// (direct call from prepared-statement handler).
5951    fn authorize_relational_table_select(
5952        &self,
5953        mut table: TableQuery,
5954        frame: &dyn super::statement_frame::ReadFrame,
5955    ) -> RedDBResult<Option<TableQuery>> {
5956        if let Some(TableSource::Subquery(inner)) = table.source.take() {
5957            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
5958            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
5959            return Ok(Some(table));
5960        }
5961
5962        self.check_table_column_projection_authz(&table, frame)?;
5963
5964        if self.inner.rls_enabled_tables.read().contains(&table.table) {
5965            return Ok(inject_rls_filters(self, frame, table));
5966        }
5967
5968        Ok(Some(table))
5969    }
5970
5971    fn authorize_relational_join_select(
5972        &self,
5973        mut join: JoinQuery,
5974        frame: &dyn super::statement_frame::ReadFrame,
5975    ) -> RedDBResult<Option<JoinQuery>> {
5976        self.check_join_column_projection_authz(&join, frame)?;
5977        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
5978        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
5979        Ok(inject_rls_into_join(self, frame, join))
5980    }
5981
5982    fn authorize_relational_join_child(
5983        &self,
5984        expr: QueryExpr,
5985        frame: &dyn super::statement_frame::ReadFrame,
5986    ) -> RedDBResult<QueryExpr> {
5987        match expr {
5988            QueryExpr::Table(mut table) => {
5989                if let Some(TableSource::Subquery(inner)) = table.source.take() {
5990                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
5991                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
5992                }
5993                Ok(QueryExpr::Table(table))
5994            }
5995            QueryExpr::Join(join) => self
5996                .authorize_relational_join_select(join, frame)?
5997                .map(QueryExpr::Join)
5998                .ok_or_else(|| {
5999                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6000                }),
6001            other => Ok(other),
6002        }
6003    }
6004
6005    fn authorize_relational_select_expr(
6006        &self,
6007        expr: QueryExpr,
6008        frame: &dyn super::statement_frame::ReadFrame,
6009    ) -> RedDBResult<QueryExpr> {
6010        match expr {
6011            QueryExpr::Table(table) => self
6012                .authorize_relational_table_select(table, frame)?
6013                .map(QueryExpr::Table)
6014                .ok_or_else(|| {
6015                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6016                }),
6017            QueryExpr::Join(join) => self
6018                .authorize_relational_join_select(join, frame)?
6019                .map(QueryExpr::Join)
6020                .ok_or_else(|| {
6021                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6022                }),
6023            other => Ok(other),
6024        }
6025    }
6026
6027    fn check_table_column_projection_authz(
6028        &self,
6029        table: &TableQuery,
6030        frame: &dyn super::statement_frame::ReadFrame,
6031    ) -> RedDBResult<()> {
6032        let Some((username, role)) = frame.identity() else {
6033            return Ok(());
6034        };
6035        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6036            return Ok(());
6037        };
6038
6039        let columns = self.resolved_table_projection_columns(table)?;
6040        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6041        let principal = UserId::from_parts(frame.effective_scope(), username);
6042        let ctx = runtime_iam_context(role, frame.effective_scope());
6043        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6044        if outcome.allowed() {
6045            return Ok(());
6046        }
6047
6048        if let Some(denied) = outcome.first_denied_column() {
6049            return Err(RedDBError::Query(format!(
6050                "permission denied: principal=`{username}` cannot select column `{}`",
6051                denied.resource.name
6052            )));
6053        }
6054        Err(RedDBError::Query(format!(
6055            "permission denied: principal=`{username}` cannot select table `{}`",
6056            table.table
6057        )))
6058    }
6059
6060    fn check_join_column_projection_authz(
6061        &self,
6062        join: &JoinQuery,
6063        frame: &dyn super::statement_frame::ReadFrame,
6064    ) -> RedDBResult<()> {
6065        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6066        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6067        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6068
6069        for (table, columns) in by_table {
6070            let query = TableQuery {
6071                table,
6072                source: None,
6073                alias: None,
6074                select_items: Vec::new(),
6075                columns: columns.into_iter().map(Projection::Column).collect(),
6076                where_expr: None,
6077                filter: None,
6078                group_by_exprs: Vec::new(),
6079                group_by: Vec::new(),
6080                having_expr: None,
6081                having: None,
6082                order_by: Vec::new(),
6083                limit: None,
6084                limit_param: None,
6085                offset: None,
6086                offset_param: None,
6087                expand: None,
6088                as_of: None,
6089            };
6090            self.check_table_column_projection_authz(&query, frame)?;
6091        }
6092        Ok(())
6093    }
6094
6095    fn collect_join_projection_columns(
6096        &self,
6097        join: &JoinQuery,
6098        projections: &[Projection],
6099        out: &mut HashMap<String, BTreeSet<String>>,
6100    ) -> RedDBResult<()> {
6101        let left = table_side_context(join.left.as_ref());
6102        let right = table_side_context(join.right.as_ref());
6103
6104        if projections
6105            .iter()
6106            .any(|projection| matches!(projection, Projection::All))
6107        {
6108            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6109                out.entry(side.table.clone())
6110                    .or_default()
6111                    .extend(self.table_all_projection_columns(&side.table)?);
6112            }
6113            return Ok(());
6114        }
6115
6116        for projection in projections {
6117            collect_projection_columns_for_join_side(
6118                projection,
6119                left.as_ref(),
6120                right.as_ref(),
6121                out,
6122            )?;
6123        }
6124        Ok(())
6125    }
6126
6127    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6128        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6129        if projections
6130            .iter()
6131            .any(|projection| matches!(projection, Projection::All))
6132        {
6133            return self.table_all_projection_columns(&table.table);
6134        }
6135
6136        let mut columns = BTreeSet::new();
6137        for projection in &projections {
6138            collect_projection_columns_for_table(
6139                projection,
6140                &table.table,
6141                table.alias.as_deref(),
6142                &mut columns,
6143            );
6144        }
6145        Ok(columns.into_iter().collect())
6146    }
6147
6148    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
6149        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
6150            let columns: Vec<String> = contract
6151                .declared_columns
6152                .iter()
6153                .map(|column| column.name.clone())
6154                .collect();
6155            if !columns.is_empty() {
6156                return Ok(columns);
6157            }
6158        }
6159
6160        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
6161        Ok(records
6162            .first()
6163            .map(|record| {
6164                record
6165                    .column_names()
6166                    .into_iter()
6167                    .map(|column| column.to_string())
6168                    .collect()
6169            })
6170            .unwrap_or_default())
6171    }
6172
    /// Dispatch an already-parsed `QueryExpr` for prepared-statement
    /// execution. Supported shapes: relational table scans, joins,
    /// vector and hybrid searches — all read-only. Graph/path queries
    /// and every non-SELECT expression return a `Query` error.
    ///
    /// Table and join paths run through the RLS authorization gate; a
    /// denial yields an empty result (engine `runtime-*-rls`) rather
    /// than an error.
    fn dispatch_expr(
        &self,
        expr: QueryExpr,
        query_str: &str,
        mode: QueryMode,
    ) -> RedDBResult<RuntimeQueryResult> {
        let statement = query_expr_name(&expr);
        match expr {
            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
                // Graph queries are not cacheable as prepared statements.
                Err(RedDBError::Query(
                    "graph queries cannot be used as prepared statements".to_string(),
                ))
            }
            QueryExpr::Table(table) => {
                let scope = self.ai_scope();
                // Virtual red-schema tables are served by the built-in
                // reader and bypass the relational RLS gate below.
                if super::red_schema::is_virtual_table(&table.table) {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-red-schema",
                        result: super::red_schema::red_query(
                            self,
                            &table.table,
                            &table,
                            &scope as &dyn super::statement_frame::ReadFrame,
                        )?,
                        affected_rows: 0,
                        statement_type: "select",
                    });
                }
                // `None` from the authorizer means RLS denied the read
                // outright — surface an empty result, not an error.
                let Some(table_with_rls) = self.authorize_relational_table_select(
                    table,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?
                else {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-table-rls",
                        result: crate::storage::query::unified::UnifiedResult::empty(),
                        affected_rows: 0,
                        statement_type: "select",
                    });
                };
                Ok(RuntimeQueryResult {
                    query: query_str.to_string(),
                    mode,
                    statement,
                    engine: "runtime-table",
                    result: execute_runtime_table_query(
                        &self.inner.db,
                        &table_with_rls,
                        Some(&self.inner.index_store),
                    )?,
                    affected_rows: 0,
                    statement_type: "select",
                })
            }
            QueryExpr::Join(join) => {
                let scope = self.ai_scope();
                // Same RLS-denial convention as the table arm above.
                let Some(join_with_rls) = self.authorize_relational_join_select(
                    join,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?
                else {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-join-rls",
                        result: crate::storage::query::unified::UnifiedResult::empty(),
                        affected_rows: 0,
                        statement_type: "select",
                    });
                };
                Ok(RuntimeQueryResult {
                    query: query_str.to_string(),
                    mode,
                    statement,
                    engine: "runtime-join",
                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
                    affected_rows: 0,
                    statement_type: "select",
                })
            }
            // NOTE(review): the vector/hybrid arms call no RLS gate here —
            // presumably enforced elsewhere on these paths; confirm.
            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
                query: query_str.to_string(),
                mode,
                statement,
                engine: "runtime-vector",
                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
                affected_rows: 0,
                statement_type: "select",
            }),
            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
                query: query_str.to_string(),
                mode,
                statement,
                engine: "runtime-hybrid",
                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
                affected_rows: 0,
                statement_type: "select",
            }),
            _ => Err(RedDBError::Query(format!(
                "prepared-statement execution does not support {statement} statements"
            ))),
        }
    }
6284
6285    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
6286    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
6287    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
6288        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
6289        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
6290        let q = query.trim();
6291        if !q.starts_with("SELECT") && !q.starts_with("select") {
6292            return None;
6293        }
6294
6295        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
6296        let where_pos = q
6297            .find("WHERE _entity_id")
6298            .or_else(|| q.find("where _entity_id"))?;
6299        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
6300        let after_eq = after_field.strip_prefix('=')?.trim_start();
6301
6302        // Parse the entity ID number
6303        let id_str = after_eq.trim();
6304        let entity_id: u64 = id_str.parse().ok()?;
6305
6306        // Extract table name: between "FROM " and " WHERE"
6307        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
6308        let table = q[from_pos..where_pos].trim();
6309        if table.is_empty()
6310            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
6311        {
6312            return None; // complex query, fall through
6313        }
6314        let table_name = table.split_whitespace().next()?;
6315
6316        // Direct entity lookup — skips SQL parse, plan cache, result
6317        // cache, view rewriter, RLS gate. Safe because the gating in
6318        // `execute_query` guarantees no scope override / no
6319        // transaction context is active. MVCC visibility is still
6320        // honoured against the current snapshot.
6321        let store = self.inner.db.store();
6322        let entity = store
6323            .get(
6324                table_name,
6325                crate::storage::unified::EntityId::new(entity_id),
6326            )
6327            .filter(entity_visible_under_current_snapshot);
6328
6329        let count = if entity.is_some() { 1u64 } else { 0 };
6330
6331        // Materialize a record so downstream consumers that walk
6332        // `result.records` (embedded runtime API, decrypt pass, CLI)
6333        // see the row. Previously only `pre_serialized_json` was
6334        // filled, which caused those consumers to see zero rows and
6335        // skewed benchmarks.
6336        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
6337            .as_ref()
6338            .and_then(|e| runtime_table_record_from_entity(e.clone()))
6339            .into_iter()
6340            .collect();
6341
6342        let json = match entity {
6343            Some(ref e) => execute_runtime_serialize_single_entity(e),
6344            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
6345                .to_string(),
6346        };
6347
6348        Some(Ok(RuntimeQueryResult {
6349            query: query.to_string(),
6350            mode: crate::storage::query::modes::QueryMode::Sql,
6351            statement: "select",
6352            engine: "fast-entity-lookup",
6353            result: crate::storage::query::unified::UnifiedResult {
6354                columns: Vec::new(),
6355                records,
6356                stats: crate::storage::query::unified::QueryStats {
6357                    rows_scanned: count,
6358                    ..Default::default()
6359                },
6360                pre_serialized_json: Some(json),
6361            },
6362            affected_rows: 0,
6363            statement_type: "select",
6364        }))
6365    }
6366
6367    fn result_cache_backend(&self) -> RuntimeResultCacheBackend {
6368        match self
6369            .config_string(RESULT_CACHE_BACKEND_KEY, RESULT_CACHE_DEFAULT_BACKEND)
6370            .as_str()
6371        {
6372            "blob_cache" => RuntimeResultCacheBackend::BlobCache,
6373            "shadow" => RuntimeResultCacheBackend::Shadow,
6374            _ => RuntimeResultCacheBackend::Legacy,
6375        }
6376    }
6377
6378    pub(super) fn get_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
6379        match self.result_cache_backend() {
6380            RuntimeResultCacheBackend::Legacy => self.get_legacy_result_cache_entry(key),
6381            RuntimeResultCacheBackend::BlobCache => self.get_blob_result_cache_entry(key),
6382            RuntimeResultCacheBackend::Shadow => {
6383                let legacy = self.get_legacy_result_cache_entry(key);
6384                let blob = self.get_blob_result_cache_entry(key);
6385                if let (Some(ref legacy), Some(ref blob)) = (&legacy, &blob) {
6386                    if result_cache_fingerprint(legacy) != result_cache_fingerprint(blob) {
6387                        self.inner
6388                            .result_cache_shadow_divergences
6389                            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
6390                        tracing::warn!(
6391                            key,
6392                            metric = crate::runtime::METRIC_CACHE_SHADOW_DIVERGENCE_TOTAL,
6393                            "result cache shadow backend diverged from legacy"
6394                        );
6395                    }
6396                }
6397                legacy
6398            }
6399        }
6400    }
6401
6402    fn get_legacy_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
6403        let cache = self.inner.result_cache.read();
6404        cache.0.get(key).and_then(|entry| {
6405            if entry.cached_at.elapsed().as_secs() < RESULT_CACHE_TTL_SECS {
6406                Some(entry.result.clone())
6407            } else {
6408                None
6409            }
6410        })
6411    }
6412
6413    fn get_blob_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
6414        let hit = self
6415            .inner
6416            .result_blob_cache
6417            .get(RESULT_CACHE_BLOB_NAMESPACE, key)?;
6418        {
6419            let cache = self.inner.result_blob_entries.read();
6420            if let Some(entry) = cache.0.get(key) {
6421                return Some(entry.result.clone());
6422            }
6423        }
6424
6425        let (result, scopes) = decode_result_cache_payload(hit.value())?;
6426        let mut cache = self.inner.result_blob_entries.write();
6427        let (ref mut map, ref mut order) = *cache;
6428        if !map.contains_key(key) {
6429            order.push_back(key.to_string());
6430        }
6431        map.insert(
6432            key.to_string(),
6433            RuntimeResultCacheEntry {
6434                result: result.clone(),
6435                cached_at: std::time::Instant::now(),
6436                scopes,
6437            },
6438        );
6439        trim_result_cache(map, order);
6440        Some(result)
6441    }
6442
6443    pub(super) fn put_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
6444        match self.result_cache_backend() {
6445            RuntimeResultCacheBackend::Legacy => self.put_legacy_result_cache_entry(key, entry),
6446            RuntimeResultCacheBackend::BlobCache => self.put_blob_result_cache_entry(key, entry),
6447            RuntimeResultCacheBackend::Shadow => {
6448                self.put_legacy_result_cache_entry(key, entry.clone());
6449                self.put_blob_result_cache_entry(key, entry);
6450            }
6451        }
6452    }
6453
6454    fn put_legacy_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
6455        let mut cache = self.inner.result_cache.write();
6456        let (ref mut map, ref mut order) = *cache;
6457        if !map.contains_key(key) {
6458            order.push_back(key.to_string());
6459        }
6460        map.insert(key.to_string(), entry);
6461        trim_result_cache(map, order);
6462    }
6463
6464    fn put_blob_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
6465        let policy = crate::storage::cache::BlobCachePolicy::default()
6466            .ttl_ms(RESULT_CACHE_TTL_SECS * 1000)
6467            .priority(200);
6468        let dependencies = entry.scopes.iter().cloned().collect::<Vec<_>>();
6469        let bytes = encode_result_cache_payload(&entry)
6470            .unwrap_or_else(|| result_cache_fingerprint(&entry.result).into_bytes());
6471        let put = crate::storage::cache::BlobCachePut::new(bytes)
6472            .with_dependencies(dependencies)
6473            .with_policy(policy);
6474        if self
6475            .inner
6476            .result_blob_cache
6477            .put(RESULT_CACHE_BLOB_NAMESPACE, key, put)
6478            .is_err()
6479        {
6480            return;
6481        }
6482
6483        let mut cache = self.inner.result_blob_entries.write();
6484        let (ref mut map, ref mut order) = *cache;
6485        if !map.contains_key(key) {
6486            order.push_back(key.to_string());
6487        }
6488        map.insert(key.to_string(), entry);
6489        trim_result_cache(map, order);
6490    }
6491
6492    pub fn result_cache_shadow_divergences(&self) -> u64 {
6493        self.inner
6494            .result_cache_shadow_divergences
6495            .load(std::sync::atomic::Ordering::Relaxed)
6496    }
6497
6498    /// Invalidate the result cache (call after any write operation).
6499    /// Full clear — use for DDL (DROP TABLE, schema changes) or when table is unknown.
6500    pub fn invalidate_result_cache(&self) {
6501        let mut cache = self.inner.result_cache.write();
6502        cache.0.clear();
6503        cache.1.clear();
6504        let mut blob_entries = self.inner.result_blob_entries.write();
6505        blob_entries.0.clear();
6506        blob_entries.1.clear();
6507        self.inner
6508            .result_blob_cache
6509            .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
6510    }
6511
    /// Invalidate only result cache entries that declared a dependency on `table`.
    /// Cheaper than a full clear: unrelated tables keep their cached results.
    ///
    /// NOTE(review): an entry inserted between the read-lock probe and
    /// the write pass can be missed until the next invalidation —
    /// presumably acceptable for a best-effort cache; confirm.
    pub(crate) fn invalidate_result_cache_for_table(&self, table: &str) {
        // Hot-path probe both backends before taking write locks. The blob
        // backend is node-local, same as the legacy result cache.
        let legacy_has_match = {
            let cache = self.inner.result_cache.read();
            let (ref map, _) = *cache;
            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
        };
        let blob_has_match = {
            let cache = self.inner.result_blob_entries.read();
            let (ref map, _) = *cache;
            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
        };
        if legacy_has_match {
            let mut cache = self.inner.result_cache.write();
            let (ref mut map, ref mut order) = *cache;
            // Drop matching entries, then keep the insertion-order list in sync.
            map.retain(|_, entry| !entry.scopes.contains(table));
            order.retain(|key| map.contains_key(key));
        }

        if matches!(
            self.result_cache_backend(),
            RuntimeResultCacheBackend::BlobCache | RuntimeResultCacheBackend::Shadow
        ) {
            // Blob-capable backends wipe the whole namespace rather than
            // retaining per-table, and the in-memory mirror must follow
            // the blob store so the two never disagree.
            let mut blob_entries = self.inner.result_blob_entries.write();
            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
            blob_map.clear();
            blob_order.clear();
            self.inner
                .result_blob_cache
                .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
        } else if blob_has_match {
            // Legacy backend: only the in-memory blob mirror is trimmed
            // per-table; the blob store itself is left untouched.
            let mut blob_entries = self.inner.result_blob_entries.write();
            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
            blob_map.retain(|_, entry| !entry.scopes.contains(table));
            blob_order.retain(|key| blob_map.contains_key(key));
        }
    }
6552
6553    pub(crate) fn invalidate_plan_cache(&self) {
6554        self.inner.query_cache.write().clear();
6555        self.inner
6556            .ddl_epoch
6557            .fetch_add(1, std::sync::atomic::Ordering::Release);
6558    }
6559
6560    /// Read the monotonic DDL epoch counter. Bumped by every
6561    /// `invalidate_plan_cache` call so prepared-statement holders can
6562    /// detect schema drift between PREPARE and EXECUTE.
6563    pub fn ddl_epoch(&self) -> u64 {
6564        self.inner
6565            .ddl_epoch
6566            .load(std::sync::atomic::Ordering::Acquire)
6567    }
6568
6569    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
6570        let store = self.inner.db.store();
6571        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
6572        self.invalidate_plan_cache();
6573    }
6574
    /// Replay `tenant_tables.*.column` keys from red_config at boot so
    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
    /// collection, picks the keys matching the tenant-marker shape,
    /// and calls `register_tenant_table` for each.
    ///
    /// Malformed marker rows emit a `SchemaCorruption` operator event
    /// and are skipped; the rest of the replay continues.
    ///
    /// Safe no-op when `red_config` doesn't exist (first boot on a
    /// fresh datadir).
    pub(crate) fn rehydrate_tenant_tables(&self) {
        let store = self.inner.db.store();
        let Some(manager) = store.get_collection("red_config") else {
            return;
        };
        // Replay in insertion order (SegmentManager iteration). Multiple
        // toggles on the same table leave several rows behind — the
        // last one processed wins because each register/unregister
        // call overwrites the in-memory state.
        for entity in manager.query_all(|_| true) {
            // Only named-row entries can carry a text `key`; anything
            // else in the collection is ignored.
            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
                continue;
            };
            let Some(named) = &row.named else { continue };
            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
                continue;
            };
            // Shape: tenant_tables.{table}.column
            let Some(rest) = key.strip_prefix("tenant_tables.") else {
                continue;
            };
            let Some((table, suffix)) = rest.rsplit_once('.') else {
                // Issue #205 — a `tenant_tables.*` row that doesn't
                // split cleanly is a schema-shape regression: the
                // metadata writer must always emit the `.column`
                // suffix, so reaching this branch means an upgrade
                // with incompatible state or external tampering.
                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
                    collection: "red_config".to_string(),
                    detail: format!("malformed tenant_tables key: {key}"),
                }
                .emit_global();
                continue;
            };
            if suffix != "column" {
                // Same corruption class: report it and skip the row
                // rather than aborting the whole rehydrate.
                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
                    collection: "red_config".to_string(),
                    detail: format!("unexpected tenant_tables suffix: {key}"),
                }
                .emit_global();
                continue;
            }
            match named.get("value") {
                Some(crate::storage::schema::Value::Text(column)) => {
                    self.register_tenant_table(table, column);
                }
                // Null / missing value = DISABLE TENANCY marker.
                Some(crate::storage::schema::Value::Null) | None => {
                    self.unregister_tenant_table(table);
                }
                // Non-text, non-null values are silently ignored.
                _ => {}
            }
        }
    }
6637
6638    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
6639    /// in-memory column mapping, the implicit RLS policy, and enables
6640    /// row-level security on the table. Idempotent — re-registering
6641    /// the same `(table, column)` replaces the prior auto-policy.
6642    pub fn register_tenant_table(&self, table: &str, column: &str) {
6643        use crate::storage::query::ast::{
6644            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
6645        };
6646        self.inner
6647            .tenant_tables
6648            .write()
6649            .insert(table.to_string(), column.to_string());
6650
6651        // Build the policy: col = CURRENT_TENANT()
6652        // Uses CompareExpr so the comparison happens at runtime against
6653        // the thread-local tenant value read by the CURRENT_TENANT
6654        // scalar. Spans are synthetic — there's no source location for
6655        // an auto-generated policy.
6656        let lhs = Expr::Column {
6657            field: FieldRef::TableColumn {
6658                table: table.to_string(),
6659                column: column.to_string(),
6660            },
6661            span: Span::synthetic(),
6662        };
6663        let rhs = Expr::FunctionCall {
6664            name: "CURRENT_TENANT".to_string(),
6665            args: Vec::new(),
6666            span: Span::synthetic(),
6667        };
6668        let policy_filter = Filter::CompareExpr {
6669            lhs,
6670            op: CompareOp::Eq,
6671            rhs,
6672        };
6673
6674        let policy = CreatePolicyQuery {
6675            name: "__tenant_iso".to_string(),
6676            table: table.to_string(),
6677            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
6678            role: None,   // None = every role
6679            using: Box::new(policy_filter),
6680            // Auto-tenancy defaults to Table targets. Collections of
6681            // other kinds (graph / vector / queue / timeseries) that
6682            // opt in via `ALTER ... ENABLE TENANCY` should use the
6683            // matching kind — but for now we keep the auto-policy
6684            // kind-agnostic so the evaluator can apply it to any
6685            // entity living in the collection.
6686            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
6687        };
6688
6689        // Replace any prior auto-policy for this table (column rename).
6690        self.inner.rls_policies.write().insert(
6691            (table.to_string(), "__tenant_iso".to_string()),
6692            Arc::new(policy),
6693        );
6694        self.inner
6695            .rls_enabled_tables
6696            .write()
6697            .insert(table.to_string());
6698
6699        // Auto-build a hash index on the tenant column. Every read/write
6700        // against a tenant-scoped table carries an implicit
6701        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
6702        // index on that column is on the hot path of every query. Without
6703        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
6704        self.ensure_tenant_index(table, column);
6705    }
6706
    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
    /// Skipped when:
    ///   * the column is dotted (nested path — flat secondary indices
    ///     don't cover those today; RLS still works via the policy)
    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
    ///   * the user already registered an index whose first column matches
    ///     (avoids redundant duplicates of a user-defined composite)
    ///   * the collection doesn't exist, or the index build fails
    ///     (best-effort — no error is surfaced)
    fn ensure_tenant_index(&self, table: &str, column: &str) {
        if column.contains('.') {
            return;
        }
        let index_name = format!("__tenant_idx_{table}");
        let registry = self.inner.index_store.list_indices(table);
        if registry.iter().any(|idx| idx.name == index_name) {
            return;
        }
        if registry
            .iter()
            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
        {
            return;
        }

        let store = self.inner.db.store();
        let Some(manager) = store.get_collection(table) else {
            return;
        };
        // Snapshot every existing row's (id, field list) to seed the
        // index. Rows use the named map when present, else zip schema
        // names with positional columns; graph nodes use their property
        // map; other entity kinds contribute no fields.
        // NOTE(review): rows written while this snapshot is indexed are
        // presumably covered by normal index maintenance — confirm.
        let entities = manager.query_all(|_| true);
        let entity_fields: Vec<(
            crate::storage::unified::EntityId,
            Vec<(String, crate::storage::schema::Value)>,
        )> = entities
            .iter()
            .map(|e| {
                let fields = match &e.data {
                    crate::storage::EntityData::Row(row) => {
                        if let Some(ref named) = row.named {
                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
                        } else if let Some(ref schema) = row.schema {
                            schema
                                .iter()
                                .zip(row.columns.iter())
                                .map(|(k, v)| (k.clone(), v.clone()))
                                .collect()
                        } else {
                            Vec::new()
                        }
                    }
                    crate::storage::EntityData::Node(node) => node
                        .properties
                        .iter()
                        .map(|(k, v)| (k.clone(), v.clone()))
                        .collect(),
                    _ => Vec::new(),
                };
                (e.id, fields)
            })
            .collect();

        let columns = vec![column.to_string()];
        // Build first, register second — a failed build leaves no
        // registry entry behind.
        if self
            .inner
            .index_store
            .create_index(
                &index_name,
                table,
                &columns,
                super::index_store::IndexMethodKind::Hash,
                false,
                &entity_fields,
            )
            .is_err()
        {
            return;
        }
        self.inner
            .index_store
            .register(super::index_store::RegisteredIndex {
                name: index_name,
                collection: table.to_string(),
                columns,
                method: super::index_store::IndexMethodKind::Hash,
                unique: false,
            });
        // A new index changes plan choices — drop cached plans.
        self.invalidate_plan_cache();
    }
6793
6794    /// Drop the auto-generated tenant index, if one exists. Called from
6795    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
6796    fn drop_tenant_index(&self, table: &str) {
6797        let index_name = format!("__tenant_idx_{table}");
6798        self.inner.index_store.drop_index(&index_name, table);
6799    }
6800
6801    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
6802    /// Used by the INSERT auto-fill path to know which column to
6803    /// populate with `current_tenant()` when the user didn't name it.
6804    pub fn tenant_column(&self, table: &str) -> Option<String> {
6805        self.inner.tenant_tables.read().get(table).cloned()
6806    }
6807
6808    /// Remove a table's tenant registration (Phase 2.5.4). Called by
6809    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
6810    /// but leaves any user-installed explicit policies intact.
6811    pub fn unregister_tenant_table(&self, table: &str) {
6812        self.inner.tenant_tables.write().remove(table);
6813        self.inner
6814            .rls_policies
6815            .write()
6816            .remove(&(table.to_string(), "__tenant_iso".to_string()));
6817        self.drop_tenant_index(table);
6818        // Only clear RLS enablement if no other policies remain.
6819        let has_other_policies = self
6820            .inner
6821            .rls_policies
6822            .read()
6823            .keys()
6824            .any(|(t, _)| t == table);
6825        if !has_other_policies {
6826            self.inner.rls_enabled_tables.write().remove(table);
6827        }
6828    }
6829
6830    /// Record that the running transaction has marked `id` in `collection`
6831    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
6832    /// xid that was written into `xmax` — either the parent txn xid or
6833    /// the innermost savepoint sub-xid. Savepoint rollback filters by
6834    /// this xid to revive only its own tombstones.
6835    pub(crate) fn record_pending_tombstone(
6836        &self,
6837        conn_id: u64,
6838        collection: &str,
6839        id: crate::storage::unified::entity::EntityId,
6840        stamper_xid: crate::storage::transaction::snapshot::Xid,
6841    ) {
6842        self.inner
6843            .pending_tombstones
6844            .write()
6845            .entry(conn_id)
6846            .or_default()
6847            .push((collection.to_string(), id, stamper_xid));
6848    }
6849
6850    /// Flush tombstones on COMMIT — tuples are physically removed from
6851    /// storage. Safe to call with an empty list (no-op).
6852    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
6853        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
6854            return;
6855        };
6856        if pending.is_empty() {
6857            return;
6858        }
6859
6860        // Group by collection so every batch issues a single `delete_batch`.
6861        let mut grouped: HashMap<String, Vec<crate::storage::unified::entity::EntityId>> =
6862            HashMap::new();
6863        for (collection, id, _xid) in pending {
6864            grouped.entry(collection).or_default().push(id);
6865        }
6866
6867        let store = self.inner.db.store();
6868        for (collection, ids) in grouped {
6869            if let Err(err) = store.delete_batch(&collection, &ids) {
6870                // Best-effort: COMMIT already succeeded at the MVCC level
6871                // (xmax keeps the row hidden), so log and move on. A
6872                // later VACUUM will reclaim the storage.
6873                eprintln!(
6874                    "pending tombstone delete_batch failed for {collection}: {err}; \
6875                     rows stay xmax-stamped (reader-invisible) until VACUUM"
6876                );
6877                continue;
6878            }
6879            for id in &ids {
6880                store.context_index().remove_entity(*id);
6881                self.cdc_emit(
6882                    crate::replication::cdc::ChangeOperation::Delete,
6883                    &collection,
6884                    id.raw(),
6885                    "entity",
6886                );
6887            }
6888        }
6889    }
6890
6891    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
6892    /// become visible again to future snapshots. Best-effort: a row
6893    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
6894    /// never reclaims tuples whose xmax is still referenced by any
6895    /// active snapshot, so this case is only reachable via external
6896    /// storage corruption.
6897    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
6898        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
6899            return;
6900        };
6901
6902        let store = self.inner.db.store();
6903        for (collection, id, _xid) in pending {
6904            let Some(manager) = store.get_collection(&collection) else {
6905                continue;
6906            };
6907            if let Some(mut entity) = manager.get(id) {
6908                entity.set_xmax(0);
6909                let _ = manager.update(entity);
6910            }
6911        }
6912    }
6913
6914    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
6915        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
6916            return;
6917        };
6918        for event in pending {
6919            self.cdc_emit_kv(
6920                event.op,
6921                &event.collection,
6922                &event.key,
6923                0,
6924                event.before,
6925                event.after,
6926            );
6927        }
6928    }
6929
6930    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
6931        self.inner.pending_kv_watch_events.write().remove(&conn_id);
6932    }
6933
6934    /// Materialise the entire graph store while applying MVCC visibility
6935    /// AND per-collection RLS to each candidate node and edge. Mirrors
6936    /// `materialize_graph` but routes every entity through the same
6937    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
6938    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
6939    /// edges). Returns the filtered `GraphStore` plus the
6940    /// `node_id → properties` map the executor needs for `RETURN n.*`
6941    /// projections.
6942    fn materialize_graph_with_rls(
6943        &self,
6944    ) -> RedDBResult<(
6945        crate::storage::engine::GraphStore,
6946        std::collections::HashMap<
6947            String,
6948            std::collections::HashMap<String, crate::storage::schema::Value>,
6949        >,
6950    )> {
6951        use crate::storage::engine::GraphStore;
6952        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
6953        use crate::storage::unified::entity::{EntityData, EntityKind};
6954        use std::collections::{HashMap, HashSet};
6955
6956        let store = self.inner.db.store();
6957        let snap_ctx = capture_current_snapshot();
6958        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
6959
6960        let graph = GraphStore::new();
6961        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
6962            HashMap::new();
6963        let mut allowed_nodes: HashSet<String> = HashSet::new();
6964
6965        // Per-collection cached compiled filters — Nodes-kind for
6966        // first pass, Edges-kind for the second. None entries mean
6967        // "RLS enabled, zero matching policy → deny all of this kind".
6968        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
6969            HashMap::new();
6970        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
6971            HashMap::new();
6972
6973        let collections = store.list_collections();
6974
6975        // First pass — gather nodes.
6976        for collection in &collections {
6977            let Some(manager) = store.get_collection(collection) else {
6978                continue;
6979            };
6980            let entities = manager.query_all(|_| true);
6981            for entity in entities {
6982                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
6983                    continue;
6984                }
6985                let EntityKind::GraphNode(ref node) = entity.kind else {
6986                    continue;
6987                };
6988                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
6989                    continue;
6990                }
6991                let id_str = entity.id.raw().to_string();
6992                graph
6993                    .add_node_with_label(
6994                        &id_str,
6995                        &node.label,
6996                        &super::graph_node_label(&node.node_type),
6997                    )
6998                    .map_err(|err| RedDBError::Query(err.to_string()))?;
6999                allowed_nodes.insert(id_str.clone());
7000                if let EntityData::Node(node_data) = &entity.data {
7001                    node_properties.insert(id_str, node_data.properties.clone());
7002                }
7003            }
7004        }
7005
7006        // Second pass — gather edges. An edge appears only when both
7007        // endpoint nodes survived the RLS pass AND the edge itself
7008        // passes its own RLS gate.
7009        for collection in &collections {
7010            let Some(manager) = store.get_collection(collection) else {
7011                continue;
7012            };
7013            let entities = manager.query_all(|_| true);
7014            for entity in entities {
7015                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
7016                    continue;
7017                }
7018                let EntityKind::GraphEdge(ref edge) = entity.kind else {
7019                    continue;
7020                };
7021                if !allowed_nodes.contains(&edge.from_node)
7022                    || !allowed_nodes.contains(&edge.to_node)
7023                {
7024                    continue;
7025                }
7026                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
7027                    continue;
7028                }
7029                let weight = match &entity.data {
7030                    EntityData::Edge(e) => e.weight,
7031                    _ => edge.weight as f32 / 1000.0,
7032                };
7033                graph
7034                    .add_edge_with_label(
7035                        &edge.from_node,
7036                        &edge.to_node,
7037                        &super::graph_edge_label(&edge.label),
7038                        weight,
7039                    )
7040                    .map_err(|err| RedDBError::Query(err.to_string()))?;
7041            }
7042        }
7043
7044        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
7045        // are used inside the helper closures via the per-kind helpers
7046        // declared at the bottom of this file.
7047        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
7048
7049        Ok((graph, node_properties))
7050    }
7051
7052    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
7053    /// freshly-inserted entity when the current connection holds an
7054    /// open transaction. Used by graph / vector / queue / timeseries
7055    /// write paths that go through the DevX builder API (`db.node(...)
7056    /// .save()` and friends) — those live in the storage crate and
7057    /// can't reach `current_xid()` without crossing layers, so the
7058    /// application layer calls this helper right after `save()` to
7059    /// finalise the MVCC stamp.
7060    ///
7061    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
7062    /// write, so the non-transactional hot path stays untouched.
7063    ///
7064    /// Best-effort: if the collection or entity disappears between
7065    /// the save and the stamp (concurrent DROP), we silently skip.
7066    pub(crate) fn stamp_xmin_if_in_txn(
7067        &self,
7068        collection: &str,
7069        id: crate::storage::unified::entity::EntityId,
7070    ) {
7071        let Some(xid) = self.current_xid() else {
7072            return;
7073        };
7074        let store = self.inner.db.store();
7075        let Some(manager) = store.get_collection(collection) else {
7076            return;
7077        };
7078        if let Some(mut entity) = manager.get(id) {
7079            entity.set_xmin(xid);
7080            let _ = manager.update(entity);
7081        }
7082    }
7083
7084    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
7085    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
7086    /// pending entries with `xid < stamper_xid` stay queued because
7087    /// they belong to the enclosing scope — they'll either flush on
7088    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
7089    ///
7090    /// Returns the number of tuples whose `xmax` was wiped back to 0.
7091    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
7092        let mut guard = self.inner.pending_tombstones.write();
7093        let Some(pending) = guard.get_mut(&conn_id) else {
7094            return 0;
7095        };
7096
7097        let store = self.inner.db.store();
7098        let mut revived = 0usize;
7099        pending.retain(|(collection, id, xid)| {
7100            if *xid < stamper_xid {
7101                // Stamped before the savepoint — keep in queue.
7102                return true;
7103            }
7104            if let Some(manager) = store.get_collection(collection) {
7105                if let Some(mut entity) = manager.get(*id) {
7106                    entity.set_xmax(0);
7107                    let _ = manager.update(entity);
7108                    revived += 1;
7109                }
7110            }
7111            false
7112        });
7113        if pending.is_empty() {
7114            guard.remove(&conn_id);
7115        }
7116        revived
7117    }
7118
7119    /// Return the snapshot the current connection should use for visibility
7120    /// checks (Phase 2.3 PG parity).
7121    ///
7122    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
7123    ///   the snapshot stored in its `TxnContext`.
7124    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
7125    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
7126    ///   visible so this degrades to "see everything committed".
7127    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
7128        let conn_id = current_connection_id();
7129        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
7130            return ctx.snapshot;
7131        }
7132        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
7133        // every already-committed xid (which is strictly less) passes the
7134        // `xmin <= snap.xid` gate, while concurrently-active xids land in
7135        // the `in_progress` set and stay hidden until they commit. Using
7136        // xid=0 would incorrectly hide every MVCC-stamped tuple.
7137        let high_water = self.inner.snapshot_manager.peek_next_xid();
7138        self.inner.snapshot_manager.snapshot(high_water)
7139    }
7140
7141    /// Xid of the current connection's active transaction, or `None` when
7142    /// running outside a BEGIN/COMMIT block. Write paths call this to
7143    /// decide whether to stamp `xmin`/`xmax` on tuples.
7144    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
7145    /// sub-xid so new writes can be selectively rolled back. Otherwise
7146    /// the parent txn's xid is returned, matching pre-savepoint
7147    /// behaviour. Callers that need the enclosing *transaction* xid
7148    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
7149    /// directly.
7150    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
7151        let conn_id = current_connection_id();
7152        self.inner
7153            .tx_contexts
7154            .read()
7155            .get(&conn_id)
7156            .map(|ctx| ctx.writer_xid())
7157    }
7158
7159    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
7160    /// the oldest-active xid when reclaiming dead tuples.
7161    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
7162        Arc::clone(&self.inner.snapshot_manager)
7163    }
7164
7165    /// Own-tx xids (parent + open savepoints) for the current
7166    /// connection. Transports + tests that build a `SnapshotContext`
7167    /// manually (outside the `execute_query` scope) need this set so
7168    /// the writer's own uncommitted tuples stay visible to self.
7169    pub fn current_txn_own_xids(
7170        &self,
7171    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
7172        let mut set = std::collections::HashSet::new();
7173        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
7174            set.insert(ctx.xid);
7175            for (_, sub) in &ctx.savepoints {
7176                set.insert(*sub);
7177            }
7178        }
7179        set
7180    }
7181
7182    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
7183    ///
7184    /// Callers use this to check whether a table name is a registered
7185    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
7186    /// scan it (`registry.scan(name)`). The read-path rewriter consults
7187    /// this before dispatching into native-collection lookup.
7188    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
7189        Arc::clone(&self.inner.foreign_tables)
7190    }
7191
7192    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
7193    pub fn is_rls_enabled(&self, table: &str) -> bool {
7194        self.inner.rls_enabled_tables.read().contains(table)
7195    }
7196
7197    /// Collect the USING predicates that apply to this `(table, role, action)`.
7198    ///
7199    /// Returned filters should be OR-combined (a row passes RLS when *any*
7200    /// matching policy accepts it) and then AND-ed into the query's WHERE.
7201    /// When the table has RLS disabled this returns an empty Vec — callers
7202    /// can fast-path back to the unfiltered read.
7203    pub fn matching_rls_policies(
7204        &self,
7205        table: &str,
7206        role: Option<&str>,
7207        action: crate::storage::query::ast::PolicyAction,
7208    ) -> Vec<crate::storage::query::ast::Filter> {
7209        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
7210        // callers that don't name a kind only see Table-scoped
7211        // policies (which is what execute SELECT / UPDATE / DELETE
7212        // expect).
7213        self.matching_rls_policies_for_kind(
7214            table,
7215            role,
7216            action,
7217            crate::storage::query::ast::PolicyTargetKind::Table,
7218        )
7219    }
7220
7221    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
7222    ///
7223    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
7224    /// `Vectors`, queue consumers request `Messages`, and timeseries
7225    /// range scans request `Points`. Policies tagged with a
7226    /// different kind are skipped so a graph-scoped policy doesn't
7227    /// accidentally gate a table SELECT on the same collection.
7228    pub fn matching_rls_policies_for_kind(
7229        &self,
7230        table: &str,
7231        role: Option<&str>,
7232        action: crate::storage::query::ast::PolicyAction,
7233        kind: crate::storage::query::ast::PolicyTargetKind,
7234    ) -> Vec<crate::storage::query::ast::Filter> {
7235        if !self.is_rls_enabled(table) {
7236            return Vec::new();
7237        }
7238        let policies = self.inner.rls_policies.read();
7239        policies
7240            .iter()
7241            .filter_map(|((t, _), p)| {
7242                if t != table {
7243                    return None;
7244                }
7245                // Kind gate — Table policies also apply to every
7246                // other kind *iff* the policy predicate evaluates
7247                // against entity fields that exist uniformly; the
7248                // caller's kind filter is the stricter check, so
7249                // match literally. Auto-tenancy policies stamp
7250                // Table and the caller passes the concrete kind —
7251                // we allow Table policies to apply cross-kind for
7252                // backwards compat.
7253                if p.target_kind != kind
7254                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
7255                {
7256                    return None;
7257                }
7258                // Action gate — `None` means "ALL" actions.
7259                if let Some(a) = p.action {
7260                    if a != action {
7261                        return None;
7262                    }
7263                }
7264                // Role gate — `None` means "any role".
7265                if let Some(p_role) = p.role.as_deref() {
7266                    match role {
7267                        Some(r) if r == p_role => {}
7268                        _ => return None,
7269                    }
7270                }
7271                Some((*p.using).clone())
7272            })
7273            .collect()
7274    }
7275
7276    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
7277        let store = self.inner.db.store();
7278        if let Some(stats) =
7279            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
7280        {
7281            crate::storage::query::planner::stats_catalog::persist_table_stats(
7282                store.as_ref(),
7283                &stats,
7284            );
7285        } else {
7286            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7287        }
7288        self.invalidate_plan_cache();
7289    }
7290
7291    pub(crate) fn note_table_write(&self, table: &str) {
7292        // Skip the write lock when the table is already marked
7293        // dirty. With single-row UPDATEs in a loop this used to
7294        // grab the planner_dirty_tables write lock N times even
7295        // though the first call already flipped the flag.
7296        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
7297        if !already_dirty {
7298            self.inner
7299                .planner_dirty_tables
7300                .write()
7301                .insert(table.to_string());
7302        }
7303        self.invalidate_result_cache_for_table(table);
7304    }
7305
7306    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
7307    /// `RuntimeQueryResult` so callers over the SQL interface see the
7308    /// plan tree in the same shape a SELECT produces.
7309    ///
7310    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
7311    /// Nodes are walked depth-first; `depth` counts from 0 at the
7312    /// root so a text renderer can indent without re-walking.
7313    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
7314        let explain = self.explain_query(inner_sql)?;
7315
7316        let columns = vec![
7317            "op".to_string(),
7318            "source".to_string(),
7319            "est_rows".to_string(),
7320            "est_cost".to_string(),
7321            "depth".to_string(),
7322        ];
7323
7324        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
7325
7326        // Prepend `CteScan` markers when the query carried a leading
7327        // WITH clause. The CTE bodies are already inlined into the
7328        // main plan tree, but operators reading EXPLAIN need to see
7329        // which named CTEs were resolved — without this row the plan
7330        // would look indistinguishable from a hand-inlined query.
7331        for name in &explain.cte_materializations {
7332            use std::sync::Arc;
7333            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
7334            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
7335            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
7336            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
7337            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
7338            rec.set_arc(Arc::from("depth"), Value::Integer(0));
7339            records.push(rec);
7340        }
7341
7342        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
7343
7344        let result = crate::storage::query::unified::UnifiedResult {
7345            columns,
7346            records,
7347            stats: Default::default(),
7348            pre_serialized_json: None,
7349        };
7350
7351        Ok(RuntimeQueryResult {
7352            query: raw_query.to_string(),
7353            mode: explain.mode,
7354            statement: "explain",
7355            engine: "runtime-explain",
7356            result,
7357            affected_rows: 0,
7358            statement_type: "select",
7359        })
7360    }
7361
7362    // -----------------------------------------------------------------
7363    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
7364    // -----------------------------------------------------------------
7365
7366    /// Project a `QueryExpr` to the (action, resource) pair the
7367    /// privilege engine cares about. Returns `Ok(())` for statements
7368    /// that don't touch user data (transaction control, SHOW, SET, etc.).
7369    pub(super) fn check_query_privilege(
7370        &self,
7371        expr: &crate::storage::query::ast::QueryExpr,
7372    ) -> Result<(), String> {
7373        use crate::auth::privileges::{Action, AuthzContext, Resource};
7374        use crate::auth::UserId;
7375        use crate::storage::query::ast::QueryExpr;
7376
7377        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
7378        // The bootstrap path itself goes through `execute_query` so this
7379        // is the only sensible default; once auth is wired, the gate
7380        // becomes active.
7381        let auth_store = match self.inner.auth_store.read().clone() {
7382            Some(s) => s,
7383            None => return Ok(()),
7384        };
7385
7386        // Resolve principal + role from the thread-local identity.
7387        // Anonymous (no identity) is allowed to read the bootstrap path
7388        // only when auth_store says so; we treat missing identity as
7389        // platform-admin-equivalent here so embedded test harnesses
7390        // continue to work without setting an identity.
7391        let (username, role) = match current_auth_identity() {
7392            Some(p) => p,
7393            None => return Ok(()),
7394        };
7395        let tenant = current_tenant();
7396
7397        let ctx = AuthzContext {
7398            principal: &username,
7399            effective_role: role,
7400            tenant: tenant.as_deref(),
7401        };
7402        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
7403
7404        // Map QueryExpr → (Action, Resource).
7405        let (action, resource) = match expr {
7406            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
7407            QueryExpr::QueueSelect(q) => (Action::Select, Resource::table_from_name(&q.queue)),
7408            QueryExpr::Graph(g) => {
7409                if auth_store.iam_authorization_enabled() {
7410                    self.check_graph_property_projection_privilege(
7411                        &auth_store,
7412                        &principal_id,
7413                        role,
7414                        tenant.as_deref(),
7415                        g,
7416                    )?;
7417                    return Ok(());
7418                }
7419                return Ok(());
7420            }
7421            QueryExpr::Vector(v) => {
7422                if auth_store.iam_authorization_enabled() {
7423                    self.check_table_like_column_projection_privilege(
7424                        &auth_store,
7425                        &principal_id,
7426                        role,
7427                        tenant.as_deref(),
7428                        &v.collection,
7429                        &["content".to_string()],
7430                    )?;
7431                    return Ok(());
7432                }
7433                return Ok(());
7434            }
7435            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
7436            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
7437            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
7438            // Joins inherit the read privilege from any constituent
7439            // table — for now we emit a single Select on the database
7440            // (admins bypass; non-admins need a Database/Schema grant).
7441            QueryExpr::Join(_) => (Action::Select, Resource::Database),
7442            // GRANT / REVOKE / ALTER USER are authority statements;
7443            // require Admin (the helper methods enforce).
7444            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
7445                return if role == crate::auth::Role::Admin {
7446                    Ok(())
7447                } else {
7448                    Err(format!(
7449                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
7450                        username, role
7451                    ))
7452                };
7453            }
7454            QueryExpr::CreateIamPolicy { id, .. } => {
7455                return self.check_policy_management_privilege(
7456                    &auth_store,
7457                    &principal_id,
7458                    role,
7459                    tenant.as_deref(),
7460                    "policy:put",
7461                    "policy",
7462                    id,
7463                );
7464            }
7465            QueryExpr::DropIamPolicy { id } => {
7466                return self.check_policy_management_privilege(
7467                    &auth_store,
7468                    &principal_id,
7469                    role,
7470                    tenant.as_deref(),
7471                    "policy:drop",
7472                    "policy",
7473                    id,
7474                );
7475            }
7476            QueryExpr::AttachPolicy { policy_id, .. } => {
7477                return self.check_policy_management_privilege(
7478                    &auth_store,
7479                    &principal_id,
7480                    role,
7481                    tenant.as_deref(),
7482                    "policy:attach",
7483                    "policy",
7484                    policy_id,
7485                );
7486            }
7487            QueryExpr::DetachPolicy { policy_id, .. } => {
7488                return self.check_policy_management_privilege(
7489                    &auth_store,
7490                    &principal_id,
7491                    role,
7492                    tenant.as_deref(),
7493                    "policy:detach",
7494                    "policy",
7495                    policy_id,
7496                );
7497            }
7498            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
7499                return Ok(());
7500            }
7501            QueryExpr::SimulatePolicy { .. } => {
7502                return self.check_policy_management_privilege(
7503                    &auth_store,
7504                    &principal_id,
7505                    role,
7506                    tenant.as_deref(),
7507                    "policy:simulate",
7508                    "policy",
7509                    "*",
7510                );
7511            }
7512            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
7513            // when IAM mode is active. Other DDL stays role-only for now.
7514            QueryExpr::DropTable(q) => {
7515                return self.check_ddl_collection_privilege(
7516                    &auth_store,
7517                    &principal_id,
7518                    role,
7519                    tenant.as_deref(),
7520                    &username,
7521                    "drop",
7522                    &q.name,
7523                );
7524            }
7525            QueryExpr::DropGraph(q) => {
7526                return self.check_ddl_collection_privilege(
7527                    &auth_store,
7528                    &principal_id,
7529                    role,
7530                    tenant.as_deref(),
7531                    &username,
7532                    "drop",
7533                    &q.name,
7534                );
7535            }
7536            QueryExpr::DropVector(q) => {
7537                return self.check_ddl_collection_privilege(
7538                    &auth_store,
7539                    &principal_id,
7540                    role,
7541                    tenant.as_deref(),
7542                    &username,
7543                    "drop",
7544                    &q.name,
7545                );
7546            }
7547            QueryExpr::DropDocument(q) => {
7548                return self.check_ddl_collection_privilege(
7549                    &auth_store,
7550                    &principal_id,
7551                    role,
7552                    tenant.as_deref(),
7553                    &username,
7554                    "drop",
7555                    &q.name,
7556                );
7557            }
7558            QueryExpr::DropKv(q) => {
7559                return self.check_ddl_collection_privilege(
7560                    &auth_store,
7561                    &principal_id,
7562                    role,
7563                    tenant.as_deref(),
7564                    &username,
7565                    "drop",
7566                    &q.name,
7567                );
7568            }
7569            QueryExpr::DropCollection(q) => {
7570                return self.check_ddl_collection_privilege(
7571                    &auth_store,
7572                    &principal_id,
7573                    role,
7574                    tenant.as_deref(),
7575                    &username,
7576                    "drop",
7577                    &q.name,
7578                );
7579            }
7580            QueryExpr::Truncate(q) => {
7581                return self.check_ddl_collection_privilege(
7582                    &auth_store,
7583                    &principal_id,
7584                    role,
7585                    tenant.as_deref(),
7586                    &username,
7587                    "truncate",
7588                    &q.name,
7589                );
7590            }
7591            // Remaining DDL — gate on Write role. Fine-grained grants TBD.
7592            QueryExpr::CreateTable(_)
7593            | QueryExpr::AlterTable(_)
7594            | QueryExpr::CreateIndex(_)
7595            | QueryExpr::DropIndex(_)
7596            | QueryExpr::CreateSchema(_)
7597            | QueryExpr::DropSchema(_)
7598            | QueryExpr::CreateSequence(_)
7599            | QueryExpr::DropSequence(_)
7600            | QueryExpr::CreateView(_)
7601            | QueryExpr::DropView(_)
7602            | QueryExpr::RefreshMaterializedView(_)
7603            | QueryExpr::CreatePolicy(_)
7604            | QueryExpr::DropPolicy(_)
7605            | QueryExpr::CreateServer(_)
7606            | QueryExpr::DropServer(_)
7607            | QueryExpr::CreateForeignTable(_)
7608            | QueryExpr::DropForeignTable(_)
7609            | QueryExpr::CreateTimeSeries(_)
7610            | QueryExpr::DropTimeSeries(_)
7611            | QueryExpr::CreateQueue(_)
7612            | QueryExpr::AlterQueue(_)
7613            | QueryExpr::DropQueue(_)
7614            | QueryExpr::CreateTree(_)
7615            | QueryExpr::DropTree(_) => {
7616                return if role >= crate::auth::Role::Write {
7617                    Ok(())
7618                } else {
7619                    Err(format!(
7620                        "principal=`{}` role=`{:?}` cannot issue DDL",
7621                        username, role
7622                    ))
7623                };
7624            }
7625            // Migration DDL — CREATE MIGRATION requires Write role (schema author).
7626            QueryExpr::CreateMigration(_) => {
7627                return if role >= crate::auth::Role::Write {
7628                    Ok(())
7629                } else {
7630                    Err(format!(
7631                        "principal=`{}` role=`{:?}` cannot issue CREATE MIGRATION",
7632                        username, role
7633                    ))
7634                };
7635            }
7636            // APPLY / ROLLBACK change data and schema — require Admin.
7637            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
7638                return if role == crate::auth::Role::Admin {
7639                    Ok(())
7640                } else {
7641                    Err(format!(
7642                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
7643                        username, role
7644                    ))
7645                };
7646            }
7647            // EXPLAIN MIGRATION is read-only — any authenticated principal.
7648            QueryExpr::ExplainMigration(_) => return Ok(()),
7649            // Everything else (SET, SHOW, transaction control, graph
7650            // commands, queue/tree commands, MaintenanceCommand …)
7651            // is allowed for any authenticated principal.
7652            _ => return Ok(()),
7653        };
7654
7655        if auth_store.iam_authorization_enabled() {
7656            let iam_action = legacy_action_to_iam(action);
7657            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
7658            let iam_ctx = runtime_iam_context(role, tenant.as_deref());
7659            if !auth_store.check_policy_authz(&principal_id, iam_action, &iam_resource, &iam_ctx) {
7660                return Err(format!(
7661                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
7662                    username, iam_action, iam_resource.kind, iam_resource.name
7663                ));
7664            }
7665
7666            if let QueryExpr::Table(table) = expr {
7667                self.check_table_column_projection_privilege(
7668                    &auth_store,
7669                    &principal_id,
7670                    &iam_ctx,
7671                    table,
7672                )?;
7673            }
7674
7675            if let QueryExpr::Update(update) = expr {
7676                let columns = update_set_target_columns(update);
7677                if !columns.is_empty() {
7678                    let request = column_access_request_for_table_update(&update.table, columns);
7679                    let outcome =
7680                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
7681                    if let Some(denied) = outcome.first_denied_column() {
7682                        return Err(format!(
7683                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
7684                            username, iam_action, denied.resource.kind, denied.resource.name
7685                        ));
7686                    }
7687                    if !outcome.allowed() {
7688                        return Err(format!(
7689                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
7690                            username,
7691                            iam_action,
7692                            outcome.table_resource.kind,
7693                            outcome.table_resource.name
7694                        ));
7695                    }
7696                }
7697            }
7698
7699            Ok(())
7700        } else {
7701            auth_store
7702                .check_grant(&ctx, action, &resource)
7703                .map_err(|e| e.to_string())
7704        }
7705    }
7706
7707    fn check_table_column_projection_privilege(
7708        &self,
7709        auth_store: &Arc<crate::auth::store::AuthStore>,
7710        principal: &crate::auth::UserId,
7711        ctx: &crate::auth::policies::EvalContext,
7712        table: &crate::storage::query::ast::TableQuery,
7713    ) -> Result<(), String> {
7714        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
7715
7716        let columns = requested_table_columns_for_policy(table);
7717        if columns.is_empty() {
7718            return Ok(());
7719        }
7720
7721        let request = ColumnAccessRequest::select(table.table.clone(), columns);
7722        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
7723        if outcome.allowed() {
7724            return Ok(());
7725        }
7726
7727        if !matches!(
7728            outcome.table_decision,
7729            crate::auth::policies::Decision::Allow { .. }
7730                | crate::auth::policies::Decision::AdminBypass
7731        ) {
7732            return Err(format!(
7733                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
7734                principal, outcome.table_resource.kind, outcome.table_resource.name
7735            ));
7736        }
7737
7738        let denied = outcome
7739            .first_denied_column()
7740            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
7741        match denied {
7742            Some(decision) => Err(format!(
7743                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
7744                principal, decision.resource.kind, decision.resource.name
7745            )),
7746            None => Ok(()),
7747        }
7748    }
7749
7750    fn check_graph_property_projection_privilege(
7751        &self,
7752        auth_store: &Arc<crate::auth::store::AuthStore>,
7753        principal: &crate::auth::UserId,
7754        role: crate::auth::Role,
7755        tenant: Option<&str>,
7756        query: &crate::storage::query::ast::GraphQuery,
7757    ) -> Result<(), String> {
7758        let columns = explicit_graph_projection_properties(query);
7759        if columns.is_empty() {
7760            return Ok(());
7761        }
7762        self.check_table_like_column_projection_privilege(
7763            auth_store, principal, role, tenant, "graph", &columns,
7764        )
7765    }
7766
7767    fn check_table_like_column_projection_privilege(
7768        &self,
7769        auth_store: &Arc<crate::auth::store::AuthStore>,
7770        principal: &crate::auth::UserId,
7771        role: crate::auth::Role,
7772        tenant: Option<&str>,
7773        table: &str,
7774        columns: &[String],
7775    ) -> Result<(), String> {
7776        let iam_ctx = runtime_iam_context(role, tenant);
7777        let request =
7778            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
7779        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
7780        if outcome.allowed() {
7781            return Ok(());
7782        }
7783        let denied = outcome
7784            .first_denied_column()
7785            .map(|d| d.resource.name.clone())
7786            .unwrap_or_else(|| format!("{table}.<unknown>"));
7787        Err(format!(
7788            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
7789            principal, denied
7790        ))
7791    }
7792
7793    fn check_policy_management_privilege(
7794        &self,
7795        auth_store: &Arc<crate::auth::store::AuthStore>,
7796        principal: &crate::auth::UserId,
7797        role: crate::auth::Role,
7798        tenant: Option<&str>,
7799        action: &str,
7800        resource_kind: &str,
7801        resource_name: &str,
7802    ) -> Result<(), String> {
7803        if !auth_store.iam_authorization_enabled() {
7804            return if role == crate::auth::Role::Admin {
7805                Ok(())
7806            } else {
7807                Err(format!(
7808                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
7809                    principal, role
7810                ))
7811            };
7812        }
7813
7814        let mut resource = crate::auth::policies::ResourceRef::new(
7815            resource_kind.to_string(),
7816            resource_name.to_string(),
7817        );
7818        if let Some(t) = tenant {
7819            resource = resource.with_tenant(t.to_string());
7820        }
7821        let ctx = runtime_iam_context(role, tenant);
7822        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
7823            Ok(())
7824        } else {
7825            Err(format!(
7826                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
7827                principal, action, resource.kind, resource.name
7828            ))
7829        }
7830    }
7831
7832    /// IAM privilege check for DROP / TRUNCATE on a named collection.
7833    ///
7834    /// In legacy mode (IAM not enabled): requires Write role.
7835    /// In IAM mode: requires an explicit `drop` / `truncate` policy on
7836    /// `collection:<name>` (Admin role auto-passes via AdminBypass).
7837    /// Records an audit log entry for both allow and deny outcomes.
7838    fn check_ddl_collection_privilege(
7839        &self,
7840        auth_store: &Arc<crate::auth::store::AuthStore>,
7841        principal: &crate::auth::UserId,
7842        role: crate::auth::Role,
7843        tenant: Option<&str>,
7844        username: &str,
7845        action: &str,
7846        collection: &str,
7847    ) -> Result<(), String> {
7848        if role < crate::auth::Role::Write {
7849            let msg = format!(
7850                "principal=`{}` role=`{:?}` cannot issue DDL",
7851                username, role
7852            );
7853            self.inner.audit_log.record(
7854                action,
7855                username,
7856                collection,
7857                "denied",
7858                crate::json::Value::Null,
7859            );
7860            return Err(msg);
7861        }
7862
7863        if !auth_store.iam_authorization_enabled() {
7864            self.inner.audit_log.record(
7865                action,
7866                username,
7867                collection,
7868                "ok",
7869                crate::json::Value::Null,
7870            );
7871            return Ok(());
7872        }
7873
7874        let resource_name = collection.to_string();
7875        let mut resource = crate::auth::policies::ResourceRef::new(
7876            "collection".to_string(),
7877            resource_name.clone(),
7878        );
7879        if let Some(t) = tenant {
7880            resource = resource.with_tenant(t.to_string());
7881        }
7882        let ctx = runtime_iam_context(role, tenant);
7883        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
7884            self.inner.audit_log.record(
7885                action,
7886                username,
7887                &resource_name,
7888                "ok",
7889                crate::json::Value::Null,
7890            );
7891            Ok(())
7892        } else {
7893            self.inner.audit_log.record(
7894                action,
7895                username,
7896                &resource_name,
7897                "denied",
7898                crate::json::Value::Null,
7899            );
7900            Err(format!(
7901                "principal=`{}` action=`{}` resource=`collection:{}` denied by IAM policy",
7902                username, action, resource_name
7903            ))
7904        }
7905    }
7906
7907    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
7908    fn execute_grant_statement(
7909        &self,
7910        query: &str,
7911        stmt: &crate::storage::query::ast::GrantStmt,
7912    ) -> RedDBResult<RuntimeQueryResult> {
7913        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
7914        use crate::auth::UserId;
7915        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
7916
7917        let auth_store = self
7918            .inner
7919            .auth_store
7920            .read()
7921            .clone()
7922            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
7923
7924        // Granter identity + role.
7925        let (gname, grole) = current_auth_identity().ok_or_else(|| {
7926            RedDBError::Query("GRANT requires an authenticated principal".to_string())
7927        })?;
7928        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
7929        let granter_role = grole;
7930
7931        // Build the action set.
7932        let mut actions: Vec<Action> = Vec::new();
7933        if stmt.all {
7934            actions.push(Action::All);
7935        } else {
7936            for kw in &stmt.actions {
7937                let a = Action::from_keyword(kw).ok_or_else(|| {
7938                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
7939                })?;
7940                actions.push(a);
7941            }
7942        }
7943
7944        // Audit emit (printed; structured emission is Agent #4's lane).
7945        let mut applied = 0usize;
7946        for obj in &stmt.objects {
7947            let resource = match stmt.object_kind {
7948                GrantObjectKind::Table => Resource::Table {
7949                    schema: obj.schema.clone(),
7950                    table: obj.name.clone(),
7951                },
7952                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
7953                GrantObjectKind::Database => Resource::Database,
7954                GrantObjectKind::Function => Resource::Function {
7955                    schema: obj.schema.clone(),
7956                    name: obj.name.clone(),
7957                },
7958            };
7959            for principal in &stmt.principals {
7960                let p = match principal {
7961                    GrantPrincipalRef::Public => GrantPrincipal::Public,
7962                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
7963                    GrantPrincipalRef::User { tenant, name } => {
7964                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
7965                    }
7966                };
7967                // Tenant of the grant follows the granter's tenant
7968                // (cross-tenant guard inside `AuthStore::grant`).
7969                let tenant = granter.tenant.clone();
7970                auth_store
7971                    .grant(
7972                        &granter,
7973                        granter_role,
7974                        p.clone(),
7975                        resource.clone(),
7976                        actions.clone(),
7977                        stmt.with_grant_option,
7978                        tenant.clone(),
7979                    )
7980                    .map_err(|e| RedDBError::Query(e.to_string()))?;
7981
7982                // IAM policy translation: every GRANT also lands as a
7983                // synthetic `_grant_<id>` policy attached to the
7984                // principal so the new evaluator sees it.
7985                if let Some(policy) =
7986                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
7987                {
7988                    let pid = policy.id.clone();
7989                    auth_store
7990                        .put_policy_internal(policy)
7991                        .map_err(|e| RedDBError::Query(e.to_string()))?;
7992                    let attachment = match &p {
7993                        GrantPrincipal::User(uid) => {
7994                            crate::auth::store::PrincipalRef::User(uid.clone())
7995                        }
7996                        GrantPrincipal::Group(group) => {
7997                            crate::auth::store::PrincipalRef::Group(group.clone())
7998                        }
7999                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
8000                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
8001                        ),
8002                    };
8003                    auth_store
8004                        .attach_policy(attachment, &pid)
8005                        .map_err(|e| RedDBError::Query(e.to_string()))?;
8006                }
8007                applied += 1;
8008                tracing::info!(
8009                    target: "audit",
8010                    principal = %granter,
8011                    action = "grant",
8012                    "GRANT applied"
8013                );
8014            }
8015        }
8016
8017        self.invalidate_result_cache();
8018        Ok(RuntimeQueryResult::ok_message(
8019            query.to_string(),
8020            &format!("GRANT applied to {} target(s)", applied),
8021            "grant",
8022        ))
8023    }
8024
8025    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
8026    fn execute_revoke_statement(
8027        &self,
8028        query: &str,
8029        stmt: &crate::storage::query::ast::RevokeStmt,
8030    ) -> RedDBResult<RuntimeQueryResult> {
8031        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
8032        use crate::auth::UserId;
8033        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
8034
8035        let auth_store = self
8036            .inner
8037            .auth_store
8038            .read()
8039            .clone()
8040            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8041
8042        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
8043            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
8044        })?;
8045        let granter_role = grole;
8046
8047        let actions: Vec<Action> = if stmt.all {
8048            vec![Action::All]
8049        } else {
8050            stmt.actions
8051                .iter()
8052                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
8053                .collect()
8054        };
8055
8056        let mut total_removed = 0usize;
8057        for obj in &stmt.objects {
8058            let resource = match stmt.object_kind {
8059                GrantObjectKind::Table => Resource::Table {
8060                    schema: obj.schema.clone(),
8061                    table: obj.name.clone(),
8062                },
8063                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
8064                GrantObjectKind::Database => Resource::Database,
8065                GrantObjectKind::Function => Resource::Function {
8066                    schema: obj.schema.clone(),
8067                    name: obj.name.clone(),
8068                },
8069            };
8070            for principal in &stmt.principals {
8071                let p = match principal {
8072                    GrantPrincipalRef::Public => GrantPrincipal::Public,
8073                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
8074                    GrantPrincipalRef::User { tenant, name } => {
8075                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
8076                    }
8077                };
8078                let removed = auth_store
8079                    .revoke(granter_role, &p, &resource, &actions)
8080                    .map_err(|e| RedDBError::Query(e.to_string()))?;
8081                let _removed_policies =
8082                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
8083                total_removed += removed;
8084            }
8085        }
8086
8087        self.invalidate_result_cache();
8088        Ok(RuntimeQueryResult::ok_message(
8089            query.to_string(),
8090            &format!("REVOKE removed {} grant(s)", total_removed),
8091            "revoke",
8092        ))
8093    }
8094
8095    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
8096    fn execute_alter_user_statement(
8097        &self,
8098        query: &str,
8099        stmt: &crate::storage::query::ast::AlterUserStmt,
8100    ) -> RedDBResult<RuntimeQueryResult> {
8101        use crate::auth::privileges::UserAttributes;
8102        use crate::auth::UserId;
8103        use crate::storage::query::ast::AlterUserAttribute;
8104
8105        let auth_store = self
8106            .inner
8107            .auth_store
8108            .read()
8109            .clone()
8110            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8111
8112        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
8113            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
8114        })?;
8115        if grole != crate::auth::Role::Admin {
8116            return Err(RedDBError::Query(
8117                "ALTER USER requires Admin role".to_string(),
8118            ));
8119        }
8120
8121        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
8122
8123        // Apply attributes incrementally — each one reads the current
8124        // record, mutates the relevant field, writes back.
8125        let mut attrs = auth_store.user_attributes(&target);
8126        let mut enable_change: Option<bool> = None;
8127
8128        for a in &stmt.attributes {
8129            match a {
8130                AlterUserAttribute::ValidUntil(ts) => {
8131                    // Parse ISO-ish timestamp → ms since epoch. Fall
8132                    // back to integer-ms parsing for callers that pass
8133                    // `'1234567890123'`.
8134                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
8135                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
8136                    })?;
8137                    attrs.valid_until = Some(ms);
8138                }
8139                AlterUserAttribute::ConnectionLimit(n) => {
8140                    if *n < 0 {
8141                        return Err(RedDBError::Query(
8142                            "CONNECTION LIMIT must be non-negative".to_string(),
8143                        ));
8144                    }
8145                    attrs.connection_limit = Some(*n as u32);
8146                }
8147                AlterUserAttribute::SetSearchPath(p) => {
8148                    attrs.search_path = Some(p.clone());
8149                }
8150                AlterUserAttribute::AddGroup(g) => {
8151                    if !attrs.groups.iter().any(|existing| existing == g) {
8152                        attrs.groups.push(g.clone());
8153                        attrs.groups.sort();
8154                    }
8155                }
8156                AlterUserAttribute::DropGroup(g) => {
8157                    attrs.groups.retain(|existing| existing != g);
8158                }
8159                AlterUserAttribute::Enable => enable_change = Some(true),
8160                AlterUserAttribute::Disable => enable_change = Some(false),
8161                AlterUserAttribute::Password(_) => {
8162                    // Out of scope — accept the AST but no-op so the
8163                    // parser stays compatible with future password
8164                    // rotation work.
8165                }
8166            }
8167        }
8168
8169        auth_store
8170            .set_user_attributes(&target, attrs)
8171            .map_err(|e| RedDBError::Query(e.to_string()))?;
8172        if let Some(en) = enable_change {
8173            auth_store
8174                .set_user_enabled(&target, en)
8175                .map_err(|e| RedDBError::Query(e.to_string()))?;
8176        }
8177        self.invalidate_result_cache();
8178        tracing::info!(
8179            target: "audit",
8180            principal = %target,
8181            action = "alter_user",
8182            "ALTER USER applied"
8183        );
8184
8185        Ok(RuntimeQueryResult::ok_message(
8186            query.to_string(),
8187            &format!("ALTER USER {} applied", target),
8188            "alter_user",
8189        ))
8190    }
8191
8192    // -----------------------------------------------------------------
8193    // IAM policy executors
8194    // -----------------------------------------------------------------
8195
8196    fn execute_create_iam_policy(
8197        &self,
8198        query: &str,
8199        id: &str,
8200        json: &str,
8201    ) -> RedDBResult<RuntimeQueryResult> {
8202        use crate::auth::policies::Policy;
8203
8204        let auth_store = self
8205            .inner
8206            .auth_store
8207            .read()
8208            .clone()
8209            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8210
8211        // Parse + validate. The kernel rejects oversize / bad shape /
8212        // bad action keywords. If the supplied id differs from the JSON
8213        // id, override it with the SQL-provided id (the JSON id is
8214        // optional context — the SQL DDL form is authoritative).
8215        let mut policy = Policy::from_json_str(json)
8216            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
8217        if policy.id != id {
8218            policy.id = id.to_string();
8219        }
8220        let pid = policy.id.clone();
8221        auth_store
8222            .put_policy(policy)
8223            .map_err(|e| RedDBError::Query(e.to_string()))?;
8224
8225        let principal = current_auth_identity()
8226            .map(|(u, _)| u)
8227            .unwrap_or_else(|| "anonymous".into());
8228        tracing::info!(
8229            target: "audit",
8230            principal = %principal,
8231            action = "iam:policy.put",
8232            matched_policy_id = %pid,
8233            "CREATE POLICY applied"
8234        );
8235        self.inner.audit_log.record(
8236            "iam/policy.put",
8237            &principal,
8238            &pid,
8239            "ok",
8240            crate::json::Value::Null,
8241        );
8242
8243        self.invalidate_result_cache();
8244        Ok(RuntimeQueryResult::ok_message(
8245            query.to_string(),
8246            &format!("policy `{pid}` stored"),
8247            "create_iam_policy",
8248        ))
8249    }
8250
8251    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
8252        let auth_store = self
8253            .inner
8254            .auth_store
8255            .read()
8256            .clone()
8257            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8258        auth_store
8259            .delete_policy(id)
8260            .map_err(|e| RedDBError::Query(e.to_string()))?;
8261
8262        let principal = current_auth_identity()
8263            .map(|(u, _)| u)
8264            .unwrap_or_else(|| "anonymous".into());
8265        tracing::info!(
8266            target: "audit",
8267            principal = %principal,
8268            action = "iam:policy.drop",
8269            matched_policy_id = %id,
8270            "DROP POLICY applied"
8271        );
8272        self.inner.audit_log.record(
8273            "iam/policy.drop",
8274            &principal,
8275            id,
8276            "ok",
8277            crate::json::Value::Null,
8278        );
8279
8280        self.invalidate_result_cache();
8281        Ok(RuntimeQueryResult::ok_message(
8282            query.to_string(),
8283            &format!("policy `{id}` dropped"),
8284            "drop_iam_policy",
8285        ))
8286    }
8287
8288    fn execute_attach_policy(
8289        &self,
8290        query: &str,
8291        policy_id: &str,
8292        principal: &crate::storage::query::ast::PolicyPrincipalRef,
8293    ) -> RedDBResult<RuntimeQueryResult> {
8294        use crate::auth::store::PrincipalRef;
8295        use crate::auth::UserId;
8296        use crate::storage::query::ast::PolicyPrincipalRef;
8297
8298        let auth_store = self
8299            .inner
8300            .auth_store
8301            .read()
8302            .clone()
8303            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8304        let p = match principal {
8305            PolicyPrincipalRef::User(u) => {
8306                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
8307            }
8308            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
8309        };
8310        let pretty_target = principal_label(principal);
8311        auth_store
8312            .attach_policy(p, policy_id)
8313            .map_err(|e| RedDBError::Query(e.to_string()))?;
8314
8315        let principal_str = current_auth_identity()
8316            .map(|(u, _)| u)
8317            .unwrap_or_else(|| "anonymous".into());
8318        tracing::info!(
8319            target: "audit",
8320            principal = %principal_str,
8321            action = "iam:policy.attach",
8322            matched_policy_id = %policy_id,
8323            target = %pretty_target,
8324            "ATTACH POLICY applied"
8325        );
8326        self.inner.audit_log.record(
8327            "iam/policy.attach",
8328            &principal_str,
8329            &pretty_target,
8330            "ok",
8331            crate::json::Value::Null,
8332        );
8333
8334        self.invalidate_result_cache();
8335        Ok(RuntimeQueryResult::ok_message(
8336            query.to_string(),
8337            &format!("policy `{policy_id}` attached to {pretty_target}"),
8338            "attach_policy",
8339        ))
8340    }
8341
8342    fn execute_detach_policy(
8343        &self,
8344        query: &str,
8345        policy_id: &str,
8346        principal: &crate::storage::query::ast::PolicyPrincipalRef,
8347    ) -> RedDBResult<RuntimeQueryResult> {
8348        use crate::auth::store::PrincipalRef;
8349        use crate::auth::UserId;
8350        use crate::storage::query::ast::PolicyPrincipalRef;
8351
8352        let auth_store = self
8353            .inner
8354            .auth_store
8355            .read()
8356            .clone()
8357            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8358        let p = match principal {
8359            PolicyPrincipalRef::User(u) => {
8360                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
8361            }
8362            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
8363        };
8364        let pretty_target = principal_label(principal);
8365        auth_store
8366            .detach_policy(p, policy_id)
8367            .map_err(|e| RedDBError::Query(e.to_string()))?;
8368
8369        let principal_str = current_auth_identity()
8370            .map(|(u, _)| u)
8371            .unwrap_or_else(|| "anonymous".into());
8372        tracing::info!(
8373            target: "audit",
8374            principal = %principal_str,
8375            action = "iam:policy.detach",
8376            matched_policy_id = %policy_id,
8377            target = %pretty_target,
8378            "DETACH POLICY applied"
8379        );
8380        self.inner.audit_log.record(
8381            "iam/policy.detach",
8382            &principal_str,
8383            &pretty_target,
8384            "ok",
8385            crate::json::Value::Null,
8386        );
8387
8388        self.invalidate_result_cache();
8389        Ok(RuntimeQueryResult::ok_message(
8390            query.to_string(),
8391            &format!("policy `{policy_id}` detached from {pretty_target}"),
8392            "detach_policy",
8393        ))
8394    }
8395
8396    fn execute_show_policies(
8397        &self,
8398        query: &str,
8399        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
8400    ) -> RedDBResult<RuntimeQueryResult> {
8401        use crate::auth::UserId;
8402        use crate::storage::query::ast::PolicyPrincipalRef;
8403        use crate::storage::query::unified::UnifiedRecord;
8404        use crate::storage::schema::Value as SchemaValue;
8405        use std::sync::Arc;
8406
8407        let auth_store = self
8408            .inner
8409            .auth_store
8410            .read()
8411            .clone()
8412            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
8413
8414        let pols = match filter {
8415            None => auth_store.list_policies(),
8416            Some(PolicyPrincipalRef::User(u)) => {
8417                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
8418                auth_store.effective_policies(&id)
8419            }
8420            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
8421        };
8422
8423        let mut records = Vec::with_capacity(pols.len());
8424        for p in pols.iter() {
8425            let mut rec = UnifiedRecord::default();
8426            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
8427            rec.set_arc(
8428                Arc::from("statements"),
8429                SchemaValue::Integer(p.statements.len() as i64),
8430            );
8431            rec.set_arc(
8432                Arc::from("tenant"),
8433                p.tenant
8434                    .as_deref()
8435                    .map(|t| SchemaValue::text(t.to_string()))
8436                    .unwrap_or(SchemaValue::Null),
8437            );
8438            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
8439            records.push(rec);
8440        }
8441        let mut result = crate::storage::query::unified::UnifiedResult::empty();
8442        result.records = records;
8443        Ok(RuntimeQueryResult {
8444            query: query.to_string(),
8445            mode: crate::storage::query::modes::QueryMode::Sql,
8446            statement: "show_policies",
8447            engine: "iam-policies",
8448            result,
8449            affected_rows: 0,
8450            statement_type: "select",
8451        })
8452    }
8453
    /// Handle `SHOW EFFECTIVE PERMISSIONS FOR <user> [ON <resource>]`:
    /// expand every policy effective for the user into one result row per
    /// (policy, statement) pair, with columns `policy_id`,
    /// `statement_index`, `sid`, `effect`, `actions`, `resources`.
    ///
    /// NOTE(review): the `resource` filter is currently a no-op — the
    /// `if let Some(_r)` body below is empty, so every statement row is
    /// included regardless of the filter (the comment documents this as
    /// the conservative default; the simulator does the fine-grained
    /// matching).
    fn execute_show_effective_permissions(
        &self,
        query: &str,
        user: &crate::storage::query::ast::PolicyUserRef,
        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
    ) -> RedDBResult<RuntimeQueryResult> {
        use crate::auth::UserId;
        use crate::storage::query::unified::UnifiedRecord;
        use crate::storage::schema::Value as SchemaValue;
        use std::sync::Arc;

        // Requires a configured auth store; otherwise fail as a query error.
        let auth_store = self
            .inner
            .auth_store
            .read()
            .clone()
            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
        let pols = auth_store.effective_policies(&id);

        // Show one row per (policy, statement) tuple, plus any
        // resource-level filter passed by the caller.
        let mut records = Vec::new();
        for p in pols.iter() {
            for (idx, st) in p.statements.iter().enumerate() {
                if let Some(_r) = resource {
                    // Naive filter: render statement targets to strings
                    // and skip if no match. Conservative default = include
                    // (the simulator handles fine-grained matching).
                }
                let mut rec = UnifiedRecord::default();
                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
                rec.set_arc(
                    Arc::from("statement_index"),
                    SchemaValue::Integer(idx as i64),
                );
                // `sid` is optional on a statement; render NULL when absent.
                rec.set_arc(
                    Arc::from("sid"),
                    st.sid
                        .as_deref()
                        .map(|s| SchemaValue::text(s.to_string()))
                        .unwrap_or(SchemaValue::Null),
                );
                rec.set_arc(
                    Arc::from("effect"),
                    SchemaValue::text(match st.effect {
                        crate::auth::policies::Effect::Allow => "allow",
                        crate::auth::policies::Effect::Deny => "deny",
                    }),
                );
                // Only the counts are surfaced here, not the patterns
                // themselves.
                rec.set_arc(
                    Arc::from("actions"),
                    SchemaValue::Integer(st.actions.len() as i64),
                );
                rec.set_arc(
                    Arc::from("resources"),
                    SchemaValue::Integer(st.resources.len() as i64),
                );
                records.push(rec);
            }
        }
        let mut result = crate::storage::query::unified::UnifiedResult::empty();
        result.records = records;
        Ok(RuntimeQueryResult {
            query: query.to_string(),
            mode: crate::storage::query::modes::QueryMode::Sql,
            statement: "show_effective_permissions",
            engine: "iam-policies",
            result,
            affected_rows: 0,
            statement_type: "select",
        })
    }
8527
    /// Handle `SIMULATE <action> BY <user> ON <resource>`: run the auth
    /// store's policy simulator for the given user/action/resource triple
    /// and return a single row describing the decision (`decision`,
    /// `matched_policy_id`, `matched_sid`, `reason`, `trail_len`). The
    /// simulation itself is also audited (tracing + audit log).
    fn execute_simulate_policy(
        &self,
        query: &str,
        user: &crate::storage::query::ast::PolicyUserRef,
        action: &str,
        resource: &crate::storage::query::ast::PolicyResourceRef,
    ) -> RedDBResult<RuntimeQueryResult> {
        use crate::auth::policies::ResourceRef;
        use crate::auth::store::SimCtx;
        use crate::auth::UserId;
        use crate::storage::query::unified::UnifiedRecord;
        use crate::storage::schema::Value as SchemaValue;
        use std::sync::Arc;

        // Requires a configured auth store; otherwise fail as a query error.
        let auth_store = self
            .inner
            .auth_store
            .read()
            .clone()
            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
        // A default SimCtx is used — the statement syntax carries no
        // simulation context (ip / mfa / time) today.
        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());

        // Audit who issued the simulation and what the simulator decided.
        let principal_str = current_auth_identity()
            .map(|(u, _)| u)
            .unwrap_or_else(|| "anonymous".into());
        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
        tracing::info!(
            target: "audit",
            principal = %principal_str,
            action = "iam:policy.simulate",
            decision = %decision_str,
            matched_policy_id = ?matched_pid,
            matched_sid = ?matched_sid,
            "SIMULATE issued"
        );
        self.inner.audit_log.record(
            "iam/policy.simulate",
            &principal_str,
            &id.to_string(),
            "ok",
            crate::json::Value::Null,
        );

        // Single-row result mirroring the simulator outcome; optional
        // match details render as NULL when the decision was a default.
        let mut rec = UnifiedRecord::default();
        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
        rec.set_arc(
            Arc::from("matched_policy_id"),
            matched_pid
                .map(SchemaValue::text)
                .unwrap_or(SchemaValue::Null),
        );
        rec.set_arc(
            Arc::from("matched_sid"),
            matched_sid
                .map(SchemaValue::text)
                .unwrap_or(SchemaValue::Null),
        );
        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
        rec.set_arc(
            Arc::from("trail_len"),
            SchemaValue::Integer(outcome.trail.len() as i64),
        );
        let mut result = crate::storage::query::unified::UnifiedResult::empty();
        result.records = vec![rec];
        Ok(RuntimeQueryResult {
            query: query.to_string(),
            mode: crate::storage::query::modes::QueryMode::Sql,
            statement: "simulate_policy",
            engine: "iam-policies",
            result,
            affected_rows: 0,
            statement_type: "select",
        })
    }
8604}
8605
8606/// Translate a parsed GRANT into a synthetic IAM policy whose id
8607/// starts with `_grant_<unique>`. PUBLIC is represented as an
8608/// implicit IAM group; legacy GROUP grants are still rejected by the
8609/// grant store and are not translated here.
8610fn grant_to_iam_policy(
8611    principal: &crate::auth::privileges::GrantPrincipal,
8612    resource: &crate::auth::privileges::Resource,
8613    actions: &[crate::auth::privileges::Action],
8614    tenant: Option<&str>,
8615) -> Option<crate::auth::policies::Policy> {
8616    use crate::auth::policies::{
8617        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
8618    };
8619    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
8620
8621    if matches!(principal, GrantPrincipal::Group(_)) {
8622        return None;
8623    }
8624
8625    let now = crate::auth::now_ms();
8626    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
8627
8628    let resource_str = match resource {
8629        Resource::Database => "table:*".to_string(),
8630        Resource::Schema(s) => format!("table:{s}.*"),
8631        Resource::Table { schema, table } => match schema {
8632            Some(s) => format!("table:{s}.{table}"),
8633            None => format!("table:{table}"),
8634        },
8635        Resource::Function { schema, name } => match schema {
8636            Some(s) => format!("function:{s}.{name}"),
8637            None => format!("function:{name}"),
8638        },
8639    };
8640
8641    // Compile actions — fall back to `*` only when the grant included
8642    // `Action::All`. Map every other action keyword to its lowercase
8643    // form so it lines up with the kernel's allowlist.
8644    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
8645        vec![ActionPattern::Wildcard]
8646    } else {
8647        actions
8648            .iter()
8649            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
8650            .collect()
8651    };
8652    if action_patterns.is_empty() {
8653        return None;
8654    }
8655
8656    // Inline resource compilation matching the kernel's `compile_resource`:
8657    //   * `*` → wildcard
8658    //   * contains `*` → glob
8659    //   * `kind:name` → exact
8660    let resource_patterns = if resource_str == "*" {
8661        vec![ResourcePattern::Wildcard]
8662    } else if resource_str.contains('*') {
8663        vec![ResourcePattern::Glob(resource_str.clone())]
8664    } else if let Some((kind, name)) = resource_str.split_once(':') {
8665        vec![ResourcePattern::Exact {
8666            kind: kind.to_string(),
8667            name: name.to_string(),
8668        }]
8669    } else {
8670        vec![ResourcePattern::Wildcard]
8671    };
8672
8673    let policy = Policy {
8674        id,
8675        version: 1,
8676        tenant: tenant.map(|t| t.to_string()),
8677        created_at: now,
8678        updated_at: now,
8679        statements: vec![Statement {
8680            sid: None,
8681            effect: Effect::Allow,
8682            actions: action_patterns,
8683            resources: resource_patterns,
8684            condition: None,
8685        }],
8686    };
8687    if policy.validate().is_err() {
8688        return None;
8689    }
8690    Some(policy)
8691}
8692
/// Map a legacy GRANT action keyword onto its IAM action string: the
/// lowercase action name, with `All` becoming the wildcard `*`.
fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
    use crate::auth::privileges::Action;
    match action {
        Action::Select => "select",
        Action::Insert => "insert",
        Action::Update => "update",
        Action::Delete => "delete",
        Action::Truncate => "truncate",
        Action::References => "references",
        Action::Execute => "execute",
        Action::Usage => "usage",
        Action::All => "*",
    }
}
8707
8708fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
8709    let mut columns = Vec::new();
8710    for (column, _) in &query.assignment_exprs {
8711        if !columns.iter().any(|seen| seen == column) {
8712            columns.push(column.clone());
8713        }
8714    }
8715    columns
8716}
8717
8718fn column_access_request_for_table_update(
8719    table_name: &str,
8720    columns: Vec<String>,
8721) -> crate::auth::ColumnAccessRequest {
8722    match table_name.split_once('.') {
8723        Some((schema, table)) => {
8724            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
8725                .with_schema(schema.to_string())
8726        }
8727        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
8728    }
8729}
8730
8731fn requested_table_columns_for_policy(
8732    table: &crate::storage::query::ast::TableQuery,
8733) -> Vec<String> {
8734    use crate::storage::query::sql_lowering::{
8735        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
8736        effective_table_projections,
8737    };
8738
8739    let table_name = table.table.as_str();
8740    let table_alias = table.alias.as_deref();
8741    let mut columns = std::collections::BTreeSet::new();
8742
8743    for projection in effective_table_projections(table) {
8744        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
8745    }
8746    if let Some(filter) = effective_table_filter(table) {
8747        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
8748    }
8749    for expr in effective_table_group_by_exprs(table) {
8750        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
8751    }
8752    if let Some(filter) = effective_table_having_filter(table) {
8753        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
8754    }
8755    for order in &table.order_by {
8756        if let Some(expr) = order.expr.as_ref() {
8757            collect_expr_columns(expr, table_name, table_alias, &mut columns);
8758        } else {
8759            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
8760        }
8761    }
8762
8763    columns.into_iter().collect()
8764}
8765
8766fn collect_projection_columns(
8767    projection: &crate::storage::query::ast::Projection,
8768    table_name: &str,
8769    table_alias: Option<&str>,
8770    columns: &mut std::collections::BTreeSet<String>,
8771) {
8772    use crate::storage::query::ast::Projection;
8773    match projection {
8774        Projection::All => {
8775            columns.insert("*".to_string());
8776        }
8777        Projection::Column(column) | Projection::Alias(column, _) => {
8778            if column != "*" {
8779                columns.insert(column.clone());
8780            }
8781        }
8782        Projection::Function(_, args) => {
8783            for arg in args {
8784                collect_projection_columns(arg, table_name, table_alias, columns);
8785            }
8786        }
8787        Projection::Expression(filter, _) => {
8788            collect_filter_columns(filter, table_name, table_alias, columns);
8789        }
8790        Projection::Field(field, _) => {
8791            collect_field_ref_column(field, table_name, table_alias, columns);
8792        }
8793    }
8794}
8795
/// Walk a filter (predicate) tree and record every column it references
/// into `columns`, delegating attribution of each field reference to
/// `collect_field_ref_column`.
fn collect_filter_columns(
    filter: &crate::storage::query::ast::Filter,
    table_name: &str,
    table_alias: Option<&str>,
    columns: &mut std::collections::BTreeSet<String>,
) {
    use crate::storage::query::ast::Filter;
    match filter {
        // Single-field predicates: one field reference each.
        Filter::Compare { field, .. }
        | Filter::IsNull(field)
        | Filter::IsNotNull(field)
        | Filter::In { field, .. }
        | Filter::Between { field, .. }
        | Filter::Like { field, .. }
        | Filter::StartsWith { field, .. }
        | Filter::EndsWith { field, .. }
        | Filter::Contains { field, .. } => {
            collect_field_ref_column(field, table_name, table_alias, columns);
        }
        // Field-to-field comparison references both sides.
        Filter::CompareFields { left, right, .. } => {
            collect_field_ref_column(left, table_name, table_alias, columns);
            collect_field_ref_column(right, table_name, table_alias, columns);
        }
        // Expression comparison: recurse into both expression trees.
        Filter::CompareExpr { lhs, rhs, .. } => {
            collect_expr_columns(lhs, table_name, table_alias, columns);
            collect_expr_columns(rhs, table_name, table_alias, columns);
        }
        // Boolean combinators: recurse into the subtrees.
        Filter::And(left, right) | Filter::Or(left, right) => {
            collect_filter_columns(left, table_name, table_alias, columns);
            collect_filter_columns(right, table_name, table_alias, columns);
        }
        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
    }
}
8830
/// Walk a scalar-expression tree and record every column it references
/// into `columns`. Literals and parameters contribute nothing; all
/// composite forms recurse into their operands.
fn collect_expr_columns(
    expr: &crate::storage::query::ast::Expr,
    table_name: &str,
    table_alias: Option<&str>,
    columns: &mut std::collections::BTreeSet<String>,
) {
    use crate::storage::query::ast::Expr;
    match expr {
        Expr::Column { field, .. } => {
            collect_field_ref_column(field, table_name, table_alias, columns);
        }
        // No column references in constants or bind parameters.
        Expr::Literal { .. } | Expr::Parameter { .. } => {}
        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
            collect_expr_columns(operand, table_name, table_alias, columns);
        }
        Expr::BinaryOp { lhs, rhs, .. } => {
            collect_expr_columns(lhs, table_name, table_alias, columns);
            collect_expr_columns(rhs, table_name, table_alias, columns);
        }
        Expr::FunctionCall { args, .. } => {
            for arg in args {
                collect_expr_columns(arg, table_name, table_alias, columns);
            }
        }
        // CASE: every WHEN condition, every THEN value, and the ELSE.
        Expr::Case {
            branches, else_, ..
        } => {
            for (condition, value) in branches {
                collect_expr_columns(condition, table_name, table_alias, columns);
                collect_expr_columns(value, table_name, table_alias, columns);
            }
            if let Some(value) = else_ {
                collect_expr_columns(value, table_name, table_alias, columns);
            }
        }
        Expr::IsNull { operand, .. } => {
            collect_expr_columns(operand, table_name, table_alias, columns);
        }
        Expr::InList { target, values, .. } => {
            collect_expr_columns(target, table_name, table_alias, columns);
            for value in values {
                collect_expr_columns(value, table_name, table_alias, columns);
            }
        }
        Expr::Between {
            target, low, high, ..
        } => {
            collect_expr_columns(target, table_name, table_alias, columns);
            collect_expr_columns(low, table_name, table_alias, columns);
            collect_expr_columns(high, table_name, table_alias, columns);
        }
    }
}
8884
8885fn collect_field_ref_column(
8886    field: &crate::storage::query::ast::FieldRef,
8887    table_name: &str,
8888    table_alias: Option<&str>,
8889    columns: &mut std::collections::BTreeSet<String>,
8890) {
8891    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
8892        if column != "*" {
8893            columns.insert(column);
8894        }
8895    }
8896}
8897
8898fn policy_column_name_from_field_ref(
8899    field: &crate::storage::query::ast::FieldRef,
8900    table_name: &str,
8901    table_alias: Option<&str>,
8902) -> Option<String> {
8903    match field {
8904        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
8905            if column == "*" {
8906                return Some("*".to_string());
8907            }
8908            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
8909                Some(column.clone())
8910            } else {
8911                Some(format!("{table}.{column}"))
8912            }
8913        }
8914        _ => None,
8915    }
8916}
8917
8918fn legacy_resource_to_iam(
8919    resource: &crate::auth::privileges::Resource,
8920    tenant: Option<&str>,
8921) -> crate::auth::policies::ResourceRef {
8922    use crate::auth::privileges::Resource;
8923
8924    let (kind, name) = match resource {
8925        Resource::Database => ("database".to_string(), "*".to_string()),
8926        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
8927        Resource::Table { schema, table } => (
8928            "table".to_string(),
8929            match schema {
8930                Some(s) => format!("{s}.{table}"),
8931                None => table.clone(),
8932            },
8933        ),
8934        Resource::Function { schema, name } => (
8935            "function".to_string(),
8936            match schema {
8937                Some(s) => format!("{s}.{name}"),
8938                None => name.clone(),
8939            },
8940        ),
8941    };
8942
8943    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
8944    if let Some(t) = tenant {
8945        out = out.with_tenant(t.to_string());
8946    }
8947    out
8948}
8949
/// One side of a join, captured for column-policy attribution.
#[derive(Debug)]
struct JoinTableSide {
    /// Underlying table name — policy columns are keyed by this.
    table: String,
    /// Effective alias; set to the table name when none was written.
    alias: String,
}
8955
8956fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
8957    match expr {
8958        QueryExpr::Table(table) => Some(JoinTableSide {
8959            table: table.table.clone(),
8960            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
8961        }),
8962        _ => None,
8963    }
8964}
8965
/// Record the columns a projection requests *from one specific table*
/// (matched by name or alias) into `out`. Qualified references naming
/// a different table are skipped; unqualified references are charged
/// to this table. Wildcards and expression projections are
/// intentionally not collected here — presumably handled by a broader
/// policy path elsewhere (TODO confirm).
fn collect_projection_columns_for_table(
    projection: &Projection,
    table: &str,
    alias: Option<&str>,
    out: &mut BTreeSet<String>,
) {
    match projection {
        // String-form columns may carry an embedded `qualifier.column`.
        Projection::Column(column) | Projection::Alias(column, _) => {
            match split_qualified_column(column) {
                Some((qualifier, column))
                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
                {
                    push_policy_column(column, out);
                }
                // Qualified but for a different table: not ours.
                Some(_) => {}
                // Unqualified: attribute to this table.
                None => push_policy_column(column, out),
            }
        }
        Projection::Field(
            FieldRef::TableColumn {
                table: qualifier,
                column,
            },
            _,
        ) => {
            // Empty qualifier means "unqualified" for structured refs.
            if qualifier.is_empty()
                || qualifier == table
                || alias.is_some_and(|alias| qualifier == alias)
            {
                push_policy_column(column, out);
            }
        }
        // Graph-style property refs are matched by their alias only —
        // an empty qualifier is NOT treated as local here.
        Projection::Field(
            FieldRef::NodeProperty {
                alias: qualifier,
                property,
            },
            _,
        )
        | Projection::Field(
            FieldRef::EdgeProperty {
                alias: qualifier,
                property,
            },
            _,
        ) => {
            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
                push_policy_column(property, out);
            }
        }
        Projection::Function(_, args) => {
            for arg in args {
                collect_projection_columns_for_table(arg, table, alias, out);
            }
        }
        // Wildcards, expressions and remaining field-ref shapes are
        // deliberately ignored by this collector.
        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
    }
}
9024
/// Join-aware variant of `collect_projection_columns_for_table`:
/// attribute each projected column to whichever join side (left/right)
/// matches its qualifier by table name or alias; unqualified columns
/// are conservatively charged to BOTH sides. The `Result` return is
/// currently never `Err` in this function itself.
fn collect_projection_columns_for_join_side(
    projection: &Projection,
    left: Option<&JoinTableSide>,
    right: Option<&JoinTableSide>,
    out: &mut HashMap<String, BTreeSet<String>>,
) -> RedDBResult<()> {
    match projection {
        // String-form columns may carry an embedded `qualifier.column`.
        Projection::Column(column) | Projection::Alias(column, _) => {
            if let Some((qualifier, column)) = split_qualified_column(column) {
                push_qualified_join_column(qualifier, column, left, right, out);
            } else {
                push_unqualified_join_column(column, left, right, out);
            }
        }
        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
            if table.is_empty() {
                // No qualifier: could belong to either side.
                push_unqualified_join_column(column, left, right, out);
            } else if let Some(side) = [left, right]
                .into_iter()
                .flatten()
                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
            {
                push_join_column(&side.table, column, out);
            }
        }
        // Graph-style property refs are matched via their alias.
        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
            push_qualified_join_column(alias, property, left, right, out);
        }
        Projection::Function(_, args) => {
            for arg in args {
                collect_projection_columns_for_join_side(arg, left, right, out)?;
            }
        }
        // Wildcards, expressions and remaining field-ref shapes are
        // deliberately ignored by this collector.
        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
    }
    Ok(())
}
9063
/// Split `qualifier.column` into its two parts. Returns `None` for
/// unqualified names, names with an empty part, or names containing
/// more than one dot.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    match column.split_once('.') {
        Some((qualifier, rest))
            if !qualifier.is_empty() && !rest.is_empty() && !rest.contains('.') =>
        {
            Some((qualifier, rest))
        }
        _ => None,
    }
}
9071
9072fn push_qualified_join_column(
9073    qualifier: &str,
9074    column: &str,
9075    left: Option<&JoinTableSide>,
9076    right: Option<&JoinTableSide>,
9077    out: &mut HashMap<String, BTreeSet<String>>,
9078) {
9079    if let Some(side) = [left, right]
9080        .into_iter()
9081        .flatten()
9082        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
9083    {
9084        push_join_column(&side.table, column, out);
9085    }
9086}
9087
9088fn push_unqualified_join_column(
9089    column: &str,
9090    left: Option<&JoinTableSide>,
9091    right: Option<&JoinTableSide>,
9092    out: &mut HashMap<String, BTreeSet<String>>,
9093) {
9094    for side in [left, right].into_iter().flatten() {
9095        push_join_column(&side.table, column, out);
9096    }
9097}
9098
9099fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
9100    if is_policy_column_name(column) {
9101        out.entry(table.to_string())
9102            .or_default()
9103            .insert(column.to_string());
9104    }
9105}
9106
9107fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
9108    if is_policy_column_name(column) {
9109        out.insert(column.to_string());
9110    }
9111}
9112
/// A name counts as a real, policy-relevant column when it is
/// non-empty, not the wildcard, and not one of the synthetic `LIT:` /
/// `TYPE:` markers the planner injects.
fn is_policy_column_name(column: &str) -> bool {
    if column.is_empty() || column == "*" {
        return false;
    }
    !(column.starts_with("LIT:") || column.starts_with("TYPE:"))
}
9119
9120fn runtime_iam_context(
9121    role: crate::auth::Role,
9122    tenant: Option<&str>,
9123) -> crate::auth::policies::EvalContext {
9124    crate::auth::policies::EvalContext {
9125        principal_tenant: tenant.map(|t| t.to_string()),
9126        current_tenant: tenant.map(|t| t.to_string()),
9127        peer_ip: None,
9128        mfa_present: false,
9129        now_ms: crate::auth::now_ms(),
9130        principal_is_admin_role: role == crate::auth::Role::Admin,
9131    }
9132}
9133
9134fn explicit_table_projection_columns(
9135    query: &crate::storage::query::ast::TableQuery,
9136) -> Vec<String> {
9137    use crate::storage::query::ast::{FieldRef, Projection};
9138
9139    let mut columns = Vec::new();
9140    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
9141        match projection {
9142            Projection::Column(column) | Projection::Alias(column, _) => {
9143                push_unique(&mut columns, column)
9144            }
9145            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
9146                push_unique(&mut columns, column)
9147            }
9148            // SELECT * and expression/function projections need the
9149            // executor-wide column-policy context mapped in
9150            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
9151            _ => {}
9152        }
9153    }
9154    columns
9155}
9156
9157fn explicit_graph_projection_properties(
9158    query: &crate::storage::query::ast::GraphQuery,
9159) -> Vec<String> {
9160    use crate::storage::query::ast::{FieldRef, Projection};
9161
9162    let mut columns = Vec::new();
9163    for projection in &query.return_ {
9164        match projection {
9165            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
9166            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
9167                push_unique(&mut columns, property.clone())
9168            }
9169            _ => {}
9170        }
9171    }
9172    columns
9173}
9174
/// Append `column` to `columns` unless an equal entry is already
/// present, preserving first-seen order.
fn push_unique(columns: &mut Vec<String>, column: String) {
    // `slice::contains` expresses the membership test directly instead
    // of a hand-rolled `iter().any(..)`; projection lists are short, so
    // the linear scan is fine.
    if !columns.contains(&column) {
        columns.push(column);
    }
}
9180
9181fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
9182    use crate::storage::query::ast::PolicyPrincipalRef;
9183    match p {
9184        PolicyPrincipalRef::User(u) => match &u.tenant {
9185            Some(t) => format!("user:{t}/{}", u.username),
9186            None => format!("user:{}", u.username),
9187        },
9188        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
9189    }
9190}
9191
9192/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
9193/// shape used by every audit emit + the simulator response.
9194pub(crate) fn decision_to_strings(
9195    d: &crate::auth::policies::Decision,
9196) -> (String, Option<String>, Option<String>) {
9197    use crate::auth::policies::Decision;
9198    match d {
9199        Decision::Allow {
9200            matched_policy_id,
9201            matched_sid,
9202        } => (
9203            "allow".into(),
9204            Some(matched_policy_id.clone()),
9205            matched_sid.clone(),
9206        ),
9207        Decision::Deny {
9208            matched_policy_id,
9209            matched_sid,
9210        } => (
9211            "deny".into(),
9212            Some(matched_policy_id.clone()),
9213            matched_sid.clone(),
9214        ),
9215        Decision::DefaultDeny => ("default_deny".into(), None, None),
9216        Decision::AdminBypass => ("admin_bypass".into(), None, None),
9217    }
9218}
9219
9220fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
9221    // Bare integer ms.
9222    if let Ok(n) = s.parse::<u128>() {
9223        return Some(n);
9224    }
9225    // Fallback: ISO-8601 like 2030-01-02 03:04:05 — accept the date
9226    // portion only (midnight UTC). Full RFC3339 parsing is a stretch
9227    // goal; the common case is `'2030-01-01'`.
9228    if let Some(date) = s.split_whitespace().next() {
9229        let parts: Vec<&str> = date.split('-').collect();
9230        if parts.len() == 3 {
9231            let (y, m, d) = (parts[0], parts[1], parts[2]);
9232            if let (Ok(y), Ok(m), Ok(d)) = (y.parse::<i64>(), m.parse::<u32>(), d.parse::<u32>()) {
9233                // Days since 1970-01-01 — simple Julian arithmetic
9234                // suitable for years 1970-2100. Good enough for test
9235                // fixtures; precise parsing lands when we wire chrono.
9236                let days_in = days_from_civil(y, m, d);
9237                return Some((days_in as u128) * 86_400_000u128);
9238            }
9239        }
9240    }
9241    None
9242}
9243
/// Days from the Unix epoch using H. Hinnant's days-from-civil
/// algorithm. Exact over the entire proleptic Gregorian range
/// (negative for pre-1970 dates); used by `parse_timestamp_to_ms`.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    // The algorithm counts years from March so leap days fall at the
    // end of the (shifted) year.
    let shifted_year = if m <= 2 { y - 1 } else { y };
    let era = if shifted_year >= 0 {
        shifted_year
    } else {
        shifted_year - 399
    } / 400;
    let year_of_era = (shifted_year - era * 400) as u64; // [0, 399]
    let shifted_month = u64::from(if m > 2 { m - 3 } else { m + 9 }); // [0, 11], Mar = 0
    let day_of_year = (153 * shifted_month + 2) / 5 + u64::from(d) - 1; // [0, 365]
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;
    // 146097 days per 400-year era; 719468 = days from 0000-03-01 to
    // 1970-01-01.
    era * 146097 + day_of_era as i64 - 719468
}
9254
9255fn walk_plan_node(
9256    node: &crate::storage::query::planner::CanonicalLogicalNode,
9257    depth: usize,
9258    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
9259) {
9260    use std::sync::Arc;
9261    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
9262    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
9263    rec.set_arc(
9264        Arc::from("source"),
9265        node.source.clone().map(Value::text).unwrap_or(Value::Null),
9266    );
9267    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
9268    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
9269    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
9270    out.push(rec);
9271    for child in &node.children {
9272        walk_plan_node(child, depth + 1, out);
9273    }
9274}