Skip to main content

reddb_server/runtime/
impl_core.rs

1use super::*;
2use crate::application::entity::metadata_to_json;
3use crate::auth::column_policy_gate::ColumnAccessRequest;
4use crate::auth::UserId;
5use crate::replication::cdc::ChangeRecord;
6use crate::replication::logical::{ApplyMode, LogicalChangeApplier};
7use crate::storage::query::ast::TableSource;
8
thread_local! {
    /// Current connection id for the executing statement. Set by the
    /// per-connection wrapper (stdio/gRPC handlers) before dispatching
    /// into `execute_query`; falls back to `0` for embedded callers.
    /// Written by `set_current_connection_id` / `clear_current_connection_id`
    /// and read by `current_connection_id`.
    static CURRENT_CONN_ID: std::cell::Cell<u64> = const { std::cell::Cell::new(0) };

    /// Authenticated user + role for the executing statement (Phase 2.5.2
    /// RLS enforcement). Set by the transport middleware after validating
    /// credentials (password / cert / oauth); unset means "anonymous" /
    /// "embedded" — RLS policies degrade to the role-agnostic subset.
    ///
    /// `None` skips RLS injection entirely; `Some((username, role))`
    /// passes `role` to `matching_rls_policies(table, Some(role), action)`.
    static CURRENT_AUTH_IDENTITY: std::cell::RefCell<Option<(String, crate::auth::Role)>> =
        const { std::cell::RefCell::new(None) };

    /// MVCC snapshot scoped to the currently-executing statement (Phase
    /// 2.3.2d PG parity). `execute_query` captures it on entry and drops
    /// it on exit; every scan consults it via
    /// `entity_visible_under_current_snapshot` to hide tuples whose xmin
    /// hasn't committed or whose xmax already has.
    ///
    /// `None` means no statement snapshot is installed. In that state
    /// `entity_visible_under_current_snapshot` performs no snapshot
    /// lookup and only hides tuples whose `xmax` is non-zero. All
    /// embedded callers that bypass `execute_query` see this default.
    static CURRENT_SNAPSHOT: std::cell::RefCell<Option<SnapshotContext>> =
        const { std::cell::RefCell::new(None) };

    /// Cheap presence flag for `CURRENT_SNAPSHOT`. Scan hot paths
    /// poll this instead of `borrow()`-ing the RefCell on every
    /// row — the common case (autocommit / no MVCC session) reads
    /// one thread-local `Cell<bool>` (a plain, non-atomic cell) and
    /// short-circuits, saving ~10ns × N rows on aggregate_group /
    /// select_range scans.
    ///
    /// Must mirror `CURRENT_SNAPSHOT.is_some()`: every writer of the
    /// snapshot slot in this file updates this flag right after it.
    static HAS_SNAPSHOT: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };

    /// Session-scoped tenant id for the current connection (Phase 2.5.3
    /// multi-tenancy). Populated by `SET TENANT 'id'` or by transport
    /// middleware after resolving tenant from auth claims. Read by the
    /// `CURRENT_TENANT()` scalar function — RLS policies typically
    /// combine it as `USING (tenant_id = CURRENT_TENANT())` to scope
    /// every query to one tenant.
    ///
    /// `None` means "no tenant bound" — `CURRENT_TENANT()` returns
    /// NULL, and RLS policies that gate on it hide every row.
    ///
    /// This is only the session layer: `current_tenant` consults the
    /// `WITHIN` and transaction-local overrides before falling back
    /// to this value.
    static CURRENT_TENANT_ID: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local config resolver. SQL expressions materialize the
    /// `red_config` snapshot lazily on the first `$config.*`/`CONFIG()`
    /// access, keeping ordinary statements on the zero-scan path.
    /// Installed and restored by `ConfigSnapshotGuard`.
    static CURRENT_CONFIG_RESOLVER: std::cell::RefCell<Option<ConfigResolver>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local secret resolver. SQL expressions materialize the
    /// vault KV snapshot lazily on first `$secret.*` access, then serve
    /// the rest of the statement from that cached map.
    /// Installed and restored by `SecretStoreGuard`.
    static CURRENT_SECRET_RESOLVER: std::cell::RefCell<Option<SecretResolver>> =
        const { std::cell::RefCell::new(None) };
}
68
69fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
70    match value {
71        Value::Text(s) => Ok(s.to_string()),
72        Value::Integer(n) => Ok(n.to_string()),
73        Value::UnsignedInteger(n) => Ok(n.to_string()),
74        Value::Float(n) => Ok(n.to_string()),
75        Value::Boolean(b) => Ok(b.to_string()),
76        Value::Null => Err(RedDBError::Query(
77            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
78                .to_string(),
79        )),
80        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
81            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
82                .to_string(),
83        )),
84        _ => Err(RedDBError::Query(format!(
85            "SET SECRET does not support value type {:?} yet",
86            value.data_type()
87        ))),
88    }
89}
90
91fn system_keyed_collection_contract(
92    name: &str,
93    model: crate::catalog::CollectionModel,
94) -> crate::physical::CollectionContract {
95    let now = crate::utils::now_unix_millis() as u128;
96    crate::physical::CollectionContract {
97        name: name.to_string(),
98        declared_model: model,
99        schema_mode: crate::catalog::SchemaMode::Dynamic,
100        origin: crate::physical::ContractOrigin::Implicit,
101        version: 1,
102        created_at_unix_ms: now,
103        updated_at_unix_ms: now,
104        default_ttl_ms: None,
105        vector_dimension: None,
106        vector_metric: None,
107        context_index_fields: Vec::new(),
108        declared_columns: Vec::new(),
109        table_def: None,
110        timestamps_enabled: false,
111        context_index_enabled: false,
112        metrics_raw_retention_ms: None,
113        metrics_rollup_policies: Vec::new(),
114        metrics_tenant_identity: None,
115        metrics_namespace: None,
116        append_only: false,
117        subscriptions: Vec::new(),
118        session_key: None,
119        session_gap_ms: None,
120        retention_duration_ms: None,
121    }
122}
123
/// Snapshot + manager pair used for read-path visibility checks.
///
/// The manager is needed in addition to the snapshot because `aborted`
/// state mutates after the snapshot is captured — a ROLLBACK by a
/// committed-at-capture-time writer must still hide its tuples. Keeping
/// the Arc around is O(pointer) and the RwLock reads on `is_aborted`
/// are cheap (HashSet lookup under a parking_lot read guard).
///
/// `own_xids` (Phase 2.3.2e) lists the xids belonging to the current
/// connection's transaction — the parent xid plus open and released
/// savepoint sub-xids. The visibility rule promotes rows stamped with
/// these xids to "always visible (unless aborted)" so the writer sees
/// its own nested-savepoint writes even though their xids exceed
/// `snapshot.xid`.
#[derive(Clone)]
pub struct SnapshotContext {
    /// Captured snapshot; `visibility_check` asks it `sees(xmin, xmax)`
    /// once the abort and own-xid rules have been applied.
    pub snapshot: crate::storage::transaction::snapshot::Snapshot,
    /// Shared manager queried through `is_aborted` for both the writer
    /// (`xmin`) and the deleter (`xmax`) of a tuple.
    pub manager: Arc<crate::storage::transaction::snapshot::SnapshotManager>,
    /// Xids owned by the current connection's transaction. Cloned
    /// element-wise whenever the context is cloned.
    pub own_xids: std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
    /// Surfaced by `current_snapshot_requires_index_fallback`: when
    /// `true`, secondary indexes may be missing historical versions this
    /// snapshot needs, so readers should fall back to a heap scan.
    pub requires_index_fallback: bool,
}
145
146/// Install a connection id on the current thread for the duration of a
147/// statement. Transaction state (`RuntimeInner::tx_contexts`) is keyed
148/// by this id so different connections can hold independent BEGINs.
149///
150/// Pub so transports (PG wire, gRPC, HTTP per-request spawners) and
151/// tests can emulate per-connection isolation. Call it once when
152/// binding the connection's worker thread; pair with
153/// `clear_current_connection_id` on teardown.
154pub fn set_current_connection_id(id: u64) {
155    CURRENT_CONN_ID.with(|c| c.set(id));
156}
157
158/// Reset the thread's connection id back to `0` (autocommit).
159pub fn clear_current_connection_id() {
160    CURRENT_CONN_ID.with(|c| c.set(0));
161}
162
163/// Read the connection id set by `set_current_connection_id`. Returns
164/// `0` when no wrapper installed one — auto-commit path.
165pub fn current_connection_id() -> u64 {
166    CURRENT_CONN_ID.with(|c| c.get())
167}
168
169/// Install the authenticated identity for the current thread (Phase 2.5.2
170/// RLS enforcement). Transport layers call this right after resolving
171/// auth so the query dispatch can fold RLS policies into the filter.
172pub fn set_current_auth_identity(username: String, role: crate::auth::Role) {
173    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = Some((username, role)));
174}
175
176/// Clear the thread-local auth identity. Transports call this after the
177/// statement completes so pooled threads don't leak identities across
178/// requests.
179pub fn clear_current_auth_identity() {
180    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = None);
181}
182
183/// Read the current-thread auth identity. `None` when no transport
184/// installed one (embedded mode / anonymous access).
185pub(crate) fn current_auth_identity() -> Option<(String, crate::auth::Role)> {
186    CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone())
187}
188
189/// Install the session tenant id for the current thread (Phase 2.5.3
190/// multi-tenancy). Called by `SET TENANT 'id'` dispatch and by
191/// transport middleware that resolves tenant from auth claims (e.g.
192/// JWT `tenant` claim, HTTP header, subdomain).
193pub fn set_current_tenant(tenant_id: String) {
194    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = Some(tenant_id));
195}
196
197/// Clear the current-thread tenant — `CURRENT_TENANT()` will then
198/// return NULL and any RLS policy gated on it will hide every row.
199pub fn clear_current_tenant() {
200    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = None);
201}
202
203/// Read the current-thread tenant id, applying overrides in priority order:
204///   1. `WITHIN TENANT '<id>' …` per-statement override (highest)
205///   2. `SET LOCAL TENANT '<id>'` transaction-local override (consulted
206///      only when the current connection has an open transaction)
207///   3. `SET TENANT '<id>'` session-level thread-local
208///   4. `None` (deny-default for RLS).
209///
210/// The transaction-local layer is read through the runtime; an embedded
211/// helper crate that has no `RedDBRuntime` access still gets correct
212/// behaviour for layers 1, 3, and 4.
213pub fn current_tenant() -> Option<String> {
214    let inherited = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
215    if let Some(over) = current_scope_override() {
216        if over.tenant.is_active() {
217            return over.tenant.resolve(inherited);
218        }
219    }
220    if let Some(tx_local) = current_tx_local_tenant() {
221        return tx_local;
222    }
223    inherited
224}
225
thread_local! {
    /// Snapshot of the active connection's `tx_local_tenants` entry for
    /// the current `execute_query` call. Outer `Some(_)` means "a
    /// transaction-local tenant override is active for this call";
    /// inner is the override's value (`Some(s)` overrides to `s`,
    /// `None` overrides to NULL/cleared). Refreshed at the top of every
    /// `execute_query` invocation and cleared by the RAII guard on
    /// return so pooled connections cannot leak the override past the
    /// statement that owns it.
    ///
    /// Read through `current_tx_local_tenant`, which `current_tenant`
    /// consults after an active `WITHIN` override and before the
    /// session-level tenant.
    static TX_LOCAL_TENANT: std::cell::RefCell<Option<Option<String>>> =
        const { std::cell::RefCell::new(None) };
}
238
239fn current_tx_local_tenant() -> Option<Option<String>> {
240    TX_LOCAL_TENANT.with(|cell| cell.borrow().clone())
241}
242
243/// Recognise `SET LOCAL TENANT '<id>'` / `SET LOCAL TENANT NULL` —
244/// returns `Ok(Some(Some(id)))` for an explicit value, `Ok(Some(None))`
245/// for an explicit NULL clear, `Ok(None)` when the input is not a
246/// `SET LOCAL TENANT` statement at all, and `Err` when the prefix
247/// matches but the value is malformed.
248fn parse_set_local_tenant(query: &str) -> RedDBResult<Option<Option<String>>> {
249    let mut tokens = query.split_ascii_whitespace();
250    let Some(w1) = tokens.next() else {
251        return Ok(None);
252    };
253    if !w1.eq_ignore_ascii_case("SET") {
254        return Ok(None);
255    }
256    let Some(w2) = tokens.next() else {
257        return Ok(None);
258    };
259    if !w2.eq_ignore_ascii_case("LOCAL") {
260        return Ok(None);
261    }
262    let Some(w3) = tokens.next() else {
263        return Ok(None);
264    };
265    if !w3.eq_ignore_ascii_case("TENANT") {
266        return Ok(None);
267    }
268    let rest: String = tokens.collect::<Vec<_>>().join(" ");
269    let rest = rest.trim().trim_end_matches(';').trim();
270    let value_str = rest.strip_prefix('=').map(|s| s.trim()).unwrap_or(rest);
271    if value_str.is_empty() {
272        return Err(RedDBError::Query(
273            "SET LOCAL TENANT expects a string literal or NULL".to_string(),
274        ));
275    }
276    if value_str.eq_ignore_ascii_case("NULL") {
277        return Ok(Some(None));
278    }
279    if value_str.starts_with('\'') && value_str.ends_with('\'') && value_str.len() >= 2 {
280        let inner = &value_str[1..value_str.len() - 1];
281        return Ok(Some(Some(inner.to_string())));
282    }
283    Err(RedDBError::Query(format!(
284        "SET LOCAL TENANT expects a string literal or NULL, got `{value_str}`"
285    )))
286}
287
288pub(crate) struct TxLocalTenantGuard;
289
290impl TxLocalTenantGuard {
291    pub fn install(value: Option<Option<String>>) -> Self {
292        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = value);
293        Self
294    }
295}
296
297impl Drop for TxLocalTenantGuard {
298    fn drop(&mut self) {
299        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = None);
300    }
301}
302
thread_local! {
    /// Stack of `WITHIN ... <stmt>` overrides active on the current
    /// thread. Every entry corresponds to one in-flight `execute_query`
    /// call that started with a `WITHIN` prefix; the entry is pushed
    /// before dispatch and popped before the call returns. The stack
    /// shape supports nested invocations (e.g. a view body that itself
    /// re-enters execute_query).
    ///
    /// Only the top entry is ever consulted (`current_scope_override`
    /// clones the last element). Push/pop go through
    /// `push_scope_override` / `pop_scope_override`, normally paired by
    /// `ScopeOverrideGuard`.
    static SCOPE_OVERRIDES: std::cell::RefCell<Vec<crate::runtime::within_clause::ScopeOverride>> =
        const { std::cell::RefCell::new(Vec::new()) };
}
313
314pub(crate) fn push_scope_override(over: crate::runtime::within_clause::ScopeOverride) {
315    SCOPE_OVERRIDES.with(|cell| cell.borrow_mut().push(over));
316}
317
318pub(crate) fn pop_scope_override() {
319    SCOPE_OVERRIDES.with(|cell| {
320        cell.borrow_mut().pop();
321    });
322}
323
324pub(crate) fn current_scope_override() -> Option<crate::runtime::within_clause::ScopeOverride> {
325    SCOPE_OVERRIDES.with(|cell| cell.borrow().last().cloned())
326}
327
328/// Cheap probe: is any `WITHIN …` scope override active on this
329/// thread? The fast-path needs to know without paying for the full
330/// `.last().cloned()` allocation — just peek at stack length.
331pub(crate) fn has_scope_override_active() -> bool {
332    SCOPE_OVERRIDES.with(|cell| !cell.borrow().is_empty())
333}
334
335/// RAII guard pairing `push_scope_override` with the matching pop, so
336/// the stack stays balanced even when the inner `execute_query` returns
337/// early via `?`.
338pub(crate) struct ScopeOverrideGuard;
339
340impl ScopeOverrideGuard {
341    pub fn install(over: crate::runtime::within_clause::ScopeOverride) -> Self {
342        push_scope_override(over);
343        Self
344    }
345}
346
347impl Drop for ScopeOverrideGuard {
348    fn drop(&mut self) {
349        pop_scope_override();
350    }
351}
352
353/// Read the current-thread auth identity, honouring per-statement
354/// `WITHIN ... USER '<u>' AS ROLE '<r>'` overrides. The override only
355/// supplies projected strings — it never grants additional privilege —
356/// so callers that need to make authorisation decisions must read from
357/// the underlying `current_auth_identity()` directly.
358pub(crate) fn current_user_projected() -> Option<String> {
359    let inherited = current_auth_identity().map(|(u, _)| u);
360    if let Some(over) = current_scope_override() {
361        if over.user.is_active() {
362            return over.user.resolve(inherited);
363        }
364    }
365    inherited
366}
367
368pub(crate) fn current_role_projected() -> Option<String> {
369    let inherited = current_auth_identity().map(|(_, r)| format!("{r:?}").to_lowercase());
370    if let Some(over) = current_scope_override() {
371        if over.role.is_active() {
372            return over.role.resolve(inherited);
373        }
374    }
375    inherited
376}
377
378pub(crate) fn current_secret_value(path: &str) -> Option<String> {
379    let key = path.to_ascii_lowercase();
380    CURRENT_SECRET_RESOLVER.with(|cell| {
381        let mut resolver = cell.borrow_mut();
382        let resolver = resolver.as_mut()?;
383        if resolver.values.is_none() {
384            resolver.values = resolver
385                .store
386                .as_ref()
387                .map(|store| store.vault_kv_snapshot());
388        }
389        let values = resolver.values.as_ref()?;
390        values.get(&key).cloned().or_else(|| {
391            key.strip_prefix("red.vault/").and_then(|rest| {
392                values
393                    .get(rest)
394                    .cloned()
395                    .or_else(|| values.get(&format!("red.secret.{rest}")).cloned())
396            })
397        })
398    })
399}
400
/// Per-statement state behind `CURRENT_SECRET_RESOLVER`.
struct SecretResolver {
    /// Auth store whose `vault_kv_snapshot()` supplies the secrets.
    /// With `None`, `values` is never populated and lookups miss.
    store: Option<Arc<crate::auth::store::AuthStore>>,
    /// Cached vault snapshot; `None` until the first lookup loads it.
    /// `update_current_secret_value` edits it in place once loaded.
    values: Option<HashMap<String, String>>,
}
405
406pub(super) struct SecretStoreGuard {
407    previous: Option<SecretResolver>,
408}
409
410impl SecretStoreGuard {
411    pub(super) fn install(store: Option<Arc<crate::auth::store::AuthStore>>) -> Self {
412        let previous = CURRENT_SECRET_RESOLVER.with(|cell| {
413            cell.replace(Some(SecretResolver {
414                store,
415                values: None,
416            }))
417        });
418        Self { previous }
419    }
420}
421
422impl Drop for SecretStoreGuard {
423    fn drop(&mut self) {
424        let previous = self.previous.take();
425        CURRENT_SECRET_RESOLVER.with(|cell| {
426            cell.replace(previous);
427        });
428    }
429}
430
431pub(crate) fn current_config_value(path: &str) -> Option<Value> {
432    let key = path.to_ascii_lowercase();
433    CURRENT_CONFIG_RESOLVER.with(|cell| {
434        let mut resolver = cell.borrow_mut();
435        let resolver = resolver.as_mut()?;
436        if resolver.values.is_none() {
437            resolver.values = Some(latest_config_snapshot(&resolver.db));
438        }
439        let values = resolver.values.as_ref()?;
440        values.get(&key).cloned().or_else(|| {
441            key.strip_prefix("red.config/")
442                .and_then(|rest| values.get(&format!("red.config.{rest}")).cloned())
443        })
444    })
445}
446
447fn update_current_config_value(path: &str, value: Value) {
448    let key = path.to_ascii_lowercase();
449    CURRENT_CONFIG_RESOLVER.with(|cell| {
450        if let Some(resolver) = cell.borrow_mut().as_mut() {
451            if let Some(values) = resolver.values.as_mut() {
452                values.insert(key, value);
453            }
454        }
455    });
456}
457
458fn update_current_secret_value(path: &str, value: Option<String>) {
459    let key = path.to_ascii_lowercase();
460    CURRENT_SECRET_RESOLVER.with(|cell| {
461        if let Some(resolver) = cell.borrow_mut().as_mut() {
462            let Some(values) = resolver.values.as_mut() else {
463                return;
464            };
465            match value {
466                Some(value) => {
467                    values.insert(key, value);
468                }
469                None => {
470                    values.remove(&key);
471                }
472            }
473        }
474    });
475}
476
477fn latest_config_snapshot(db: &RedDB) -> HashMap<String, Value> {
478    let mut latest: HashMap<String, (u64, Value)> = HashMap::new();
479
480    if let Some(manager) = db.store().get_collection("red_config") {
481        manager.for_each_entity(|entity| {
482            let Some(row) = entity.data.as_row() else {
483                return true;
484            };
485            let Some(Value::Text(key)) = row.get_field("key") else {
486                return true;
487            };
488            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
489            let id = entity.id.raw();
490            let key = key.to_ascii_lowercase();
491            insert_latest_config_value(&mut latest, key.clone(), id, value.clone());
492            if let Some(rest) = key.strip_prefix("red.config.") {
493                insert_latest_config_value(&mut latest, format!("red.config/{rest}"), id, value);
494            }
495            true
496        });
497    }
498
499    if let Some(manager) = db.store().get_collection("red.config") {
500        manager.for_each_entity(|entity| {
501            let Some(row) = entity.data.as_row() else {
502                return true;
503            };
504            if matches!(row.get_field("tombstone"), Some(Value::Boolean(true))) {
505                return true;
506            }
507            let Some(Value::Text(key)) = row.get_field("key") else {
508                return true;
509            };
510            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
511            insert_latest_config_value(
512                &mut latest,
513                format!("red.config/{}", key.to_ascii_lowercase()),
514                entity.id.raw(),
515                value,
516            );
517            true
518        });
519    }
520
521    latest
522        .into_iter()
523        .map(|(key, (_, value))| (key, value))
524        .collect()
525}
526
527fn insert_latest_config_value(
528    latest: &mut HashMap<String, (u64, Value)>,
529    key: String,
530    id: u64,
531    value: Value,
532) {
533    match latest.get(&key) {
534        Some((prev_id, _)) if *prev_id > id => {}
535        _ => {
536            latest.insert(key, (id, value));
537        }
538    }
539}
540
/// Per-statement state behind `CURRENT_CONFIG_RESOLVER`.
struct ConfigResolver {
    /// Database handle passed to `latest_config_snapshot` when the
    /// snapshot is first needed.
    db: Arc<RedDB>,
    /// Cached config snapshot; `None` until the first lookup builds it.
    /// `update_current_config_value` edits it in place once built.
    values: Option<HashMap<String, Value>>,
}
545
546pub(super) struct ConfigSnapshotGuard {
547    previous: Option<ConfigResolver>,
548}
549
550impl ConfigSnapshotGuard {
551    pub(super) fn install(db: Arc<RedDB>) -> Self {
552        let previous = CURRENT_CONFIG_RESOLVER
553            .with(|cell| cell.replace(Some(ConfigResolver { db, values: None })));
554        Self { previous }
555    }
556}
557
558impl Drop for ConfigSnapshotGuard {
559    fn drop(&mut self) {
560        let previous = self.previous.take();
561        CURRENT_CONFIG_RESOLVER.with(|cell| {
562            cell.replace(previous);
563        });
564    }
565}
566
567/// Install the MVCC snapshot used by the current thread for the duration
568/// of one statement. Paired with `clear_current_snapshot()` — callers
569/// should prefer the `CurrentSnapshotGuard` RAII wrapper so early returns
570/// still clean up.
571pub fn set_current_snapshot(ctx: SnapshotContext) {
572    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = Some(ctx));
573    HAS_SNAPSHOT.with(|c| c.set(true));
574}
575
576pub fn clear_current_snapshot() {
577    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = None);
578    HAS_SNAPSHOT.with(|c| c.set(false));
579}
580
581/// Drop-guard that restores the previous snapshot on scope exit. Safe to
582/// nest — each statement saves the caller's snapshot and puts it back
583/// instead of blindly clearing, so a top-level `execute_query` called
584/// from inside another statement dispatch (e.g. vector source subqueries)
585/// doesn't strip visibility from the outer scan.
586pub(crate) struct CurrentSnapshotGuard {
587    previous: Option<SnapshotContext>,
588}
589
590impl CurrentSnapshotGuard {
591    pub(crate) fn install(ctx: SnapshotContext) -> Self {
592        let previous = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
593        set_current_snapshot(ctx);
594        Self { previous }
595    }
596}
597
598impl Drop for CurrentSnapshotGuard {
599    fn drop(&mut self) {
600        let prev = self.previous.take();
601        let has = prev.is_some();
602        CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = prev);
603        HAS_SNAPSHOT.with(|c| c.set(has));
604    }
605}
606
607/// Is this entity visible under the current thread's MVCC snapshot?
608///
609/// Returns `true` (no filtering) when no snapshot is installed — that
610/// path is used by embedded callers and by operations that intentionally
611/// bypass MVCC (VACUUM, snapshot export, admin introspection).
612///
613/// When a snapshot is installed the result is
614///   `snapshot.sees(xmin, xmax) && !mgr.is_aborted(xmin) && !xmax_half_abort`
615/// where `xmax_half_abort` re-grants visibility for tuples whose
616/// deleting transaction rolled back.
617#[inline]
618pub fn entity_visible_under_current_snapshot(
619    entity: &crate::storage::unified::entity::UnifiedEntity,
620) -> bool {
621    // Fast path — one `Cell<bool>` read, no RefCell borrow. Autocommit
622    // reads (no active MVCC transaction) still hide superseded physical
623    // versions while avoiding a full snapshot-context lookup.
624    // This runs on every row of every scan; the slow path only fires
625    // inside an explicit transaction.
626    if !HAS_SNAPSHOT.with(|c| c.get()) {
627        return entity.xmax == 0;
628    }
629    CURRENT_SNAPSHOT.with(|cell| {
630        let guard = cell.borrow();
631        let Some(ctx) = guard.as_ref() else {
632            return true;
633        };
634        visibility_check(ctx, entity.xmin, entity.xmax)
635    })
636}
637
638/// Direct visibility check from raw `(xmin, xmax)` — bypasses the
639/// entity borrow for callers that already decomposed the tuple (e.g.
640/// pre-materialized scan caches). Same semantics as
641/// `entity_visible_under_current_snapshot`.
642#[inline]
643pub(crate) fn xids_visible_under_current_snapshot(xmin: u64, xmax: u64) -> bool {
644    if !HAS_SNAPSHOT.with(|c| c.get()) {
645        return true;
646    }
647    CURRENT_SNAPSHOT.with(|cell| {
648        let guard = cell.borrow();
649        let Some(ctx) = guard.as_ref() else {
650            return true;
651        };
652        visibility_check(ctx, xmin, xmax)
653    })
654}
655
656/// Clone the current thread's snapshot context. Parallel scan paths
657/// (`query_all_zoned` with `std::thread::scope`) call this on the main
658/// thread *before* spawning workers so the captured `SnapshotContext`
659/// can be moved into every worker closure. Worker threads do not
660/// inherit thread-locals, so calling `entity_visible_under_current_snapshot`
661/// from inside a spawned closure would silently skip the filter.
662pub fn capture_current_snapshot() -> Option<SnapshotContext> {
663    CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone())
664}
665
666/// Whether the active read snapshot may need historical tuple versions
667/// that the current secondary indexes cannot prove. Index paths can still
668/// recheck visible candidates, but only a heap scan can discover versions
669/// whose indexed value was changed or deleted after this snapshot.
670pub(crate) fn current_snapshot_requires_index_fallback() -> bool {
671    if !HAS_SNAPSHOT.with(|c| c.get()) {
672        return false;
673    }
674    CURRENT_SNAPSHOT.with(|cell| {
675        cell.borrow()
676            .as_ref()
677            .is_some_and(|ctx| ctx.requires_index_fallback)
678    })
679}
680
/// Frozen MVCC + identity context for callers that need to reinstall
/// the same view across thread-local boundaries — long-lived cursors,
/// background batchers, anything that detaches from the dispatch path
/// and re-enters later.
///
/// The bundle carries three thread-locals: `SnapshotContext` (MVCC
/// visibility), the auth identity (RLS policy gate), and the
/// session-level tenant id (RLS scope). A FETCH that reinstalls the
/// bundle is meant to see the same rows as the DECLARE would have,
/// regardless of writes that landed in between.
///
/// Cloning is a field-wise clone: the `SnapshotContext` (whose
/// `manager` is an `Arc`, while its `own_xids` set is copied
/// element-wise), the `(String, Role)` identity, and the tenant
/// `String`.
#[derive(Clone, Default)]
pub struct SnapshotBundle {
    /// Captured MVCC snapshot; `None` when none was installed.
    pub snapshot: Option<SnapshotContext>,
    /// Captured `(username, role)`; `None` for anonymous / embedded.
    pub auth: Option<(String, crate::auth::Role)>,
    /// Captured session tenant; `None` when no tenant was bound.
    pub tenant: Option<String>,
}
701
702/// Capture the three read-path thread-locals into a `SnapshotBundle`.
703/// Pairs with `with_snapshot_bundle` for re-entry.
704pub fn snapshot_bundle() -> SnapshotBundle {
705    SnapshotBundle {
706        snapshot: capture_current_snapshot(),
707        auth: current_auth_identity(),
708        tenant: CURRENT_TENANT_ID.with(|cell| cell.borrow().clone()),
709    }
710}
711
712/// Reinstall a captured `SnapshotBundle` for the duration of `f`.
713/// Restores the caller's previous thread-locals on exit (panic-safe via
714/// the explicit guard struct so a panic in `f` cannot leak the
715/// installed identity into the worker's next request).
716pub fn with_snapshot_bundle<R>(bundle: &SnapshotBundle, f: impl FnOnce() -> R) -> R {
717    struct Guard {
718        prev_snapshot: Option<SnapshotContext>,
719        prev_auth: Option<(String, crate::auth::Role)>,
720        prev_tenant: Option<String>,
721    }
722    impl Drop for Guard {
723        fn drop(&mut self) {
724            let snap = self.prev_snapshot.take();
725            let has = snap.is_some();
726            CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = snap);
727            HAS_SNAPSHOT.with(|c| c.set(has));
728            CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = self.prev_auth.take());
729            CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = self.prev_tenant.take());
730        }
731    }
732
733    let _guard = {
734        let prev_snapshot = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
735        let prev_auth = CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone());
736        let prev_tenant = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
737
738        match bundle.snapshot.clone() {
739            Some(ctx) => set_current_snapshot(ctx),
740            None => clear_current_snapshot(),
741        }
742        CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = bundle.auth.clone());
743        CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = bundle.tenant.clone());
744
745        Guard {
746            prev_snapshot,
747            prev_auth,
748            prev_tenant,
749        }
750    };
751    f()
752}
753
754/// Apply the same visibility rules used by the thread-local helpers
755/// against a caller-provided context. Intended for parallel workers
756/// that captured the snapshot with `capture_current_snapshot()`.
757#[inline]
758pub fn entity_visible_with_context(
759    ctx: Option<&SnapshotContext>,
760    entity: &crate::storage::unified::entity::UnifiedEntity,
761) -> bool {
762    match ctx {
763        Some(ctx) => visibility_check(ctx, entity.xmin, entity.xmax),
764        None => true,
765    }
766}
767
768fn table_row_index_fields(
769    entity: &crate::storage::unified::entity::UnifiedEntity,
770) -> Vec<(String, crate::storage::schema::Value)> {
771    let crate::storage::EntityData::Row(row) = &entity.data else {
772        return Vec::new();
773    };
774    if let Some(named) = &row.named {
775        return named
776            .iter()
777            .map(|(name, value)| (name.clone(), value.clone()))
778            .collect();
779    }
780    if let Some(schema) = &row.schema {
781        return schema
782            .iter()
783            .zip(row.columns.iter())
784            .map(|(name, value)| (name.clone(), value.clone()))
785            .collect();
786    }
787    Vec::new()
788}
789
790#[inline]
791fn visibility_check(ctx: &SnapshotContext, xmin: u64, xmax: u64) -> bool {
792    // Writer aborted → tuple never existed from any future reader's view.
793    // Checked *before* the own-xids fast path so an aborted own-sub-xid
794    // (rolled-back savepoint) stays hidden from the parent.
795    if xmin != 0 && ctx.manager.is_aborted(xmin) {
796        return false;
797    }
798    // Deleter aborted → treat xmax as unset; fall back to xmin-only check.
799    let effective_xmax = if xmax != 0 && ctx.manager.is_aborted(xmax) {
800        0
801    } else {
802        xmax
803    };
804    // Phase 2.3.2e: own-tx writes are always visible to the connection
805    // that stamped them, even when xmin/xmax exceed `snapshot.xid` (as
806    // happens for sub-xids allocated by SAVEPOINT after BEGIN).
807    let own_xmin = xmin != 0 && ctx.own_xids.contains(&xmin);
808    let own_xmax = effective_xmax != 0 && ctx.own_xids.contains(&effective_xmax);
809    if own_xmax {
810        // This connection deleted the row via this xid — hide it from self.
811        return false;
812    }
813    if own_xmin {
814        return true;
815    }
816    ctx.snapshot.sees(xmin, effective_xmax)
817}
818
819fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
820    runtime
821        .inner
822        .pool
823        .lock()
824        .unwrap_or_else(|poisoned| poisoned.into_inner())
825}
826
827fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
828    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
829        return;
830    }
831    scopes.insert(name.to_string());
832}
833
834fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
835    match query.source.as_ref() {
836        Some(crate::storage::query::ast::TableSource::Name(name)) => {
837            cache_scope_insert(scopes, name)
838        }
839        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
840            collect_query_expr_result_cache_scopes(scopes, subquery);
841        }
842        None => cache_scope_insert(scopes, &query.table),
843    }
844}
845
846fn collect_vector_source_scopes(
847    scopes: &mut HashSet<String>,
848    source: &crate::storage::query::ast::VectorSource,
849) {
850    match source {
851        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
852            cache_scope_insert(scopes, collection);
853        }
854        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
855            collect_query_expr_result_cache_scopes(scopes, subquery);
856        }
857        crate::storage::query::ast::VectorSource::Literal(_)
858        | crate::storage::query::ast::VectorSource::Text(_) => {}
859    }
860}
861
862fn collect_path_selector_scopes(
863    scopes: &mut HashSet<String>,
864    selector: &crate::storage::query::ast::NodeSelector,
865) {
866    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
867        cache_scope_insert(scopes, table);
868    }
869}
870
871fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
872    match expr {
873        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
874        QueryExpr::Join(query) => {
875            collect_query_expr_result_cache_scopes(scopes, &query.left);
876            collect_query_expr_result_cache_scopes(scopes, &query.right);
877        }
878        QueryExpr::Path(query) => {
879            collect_path_selector_scopes(scopes, &query.from);
880            collect_path_selector_scopes(scopes, &query.to);
881        }
882        QueryExpr::Vector(query) => {
883            cache_scope_insert(scopes, &query.collection);
884            collect_vector_source_scopes(scopes, &query.query_vector);
885        }
886        QueryExpr::Hybrid(query) => {
887            collect_query_expr_result_cache_scopes(scopes, &query.structured);
888            cache_scope_insert(scopes, &query.vector.collection);
889            collect_vector_source_scopes(scopes, &query.vector.query_vector);
890        }
891        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
892        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
893        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
894        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
895        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
896        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
897        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
898        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
899        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
900        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
901        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
902        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
903        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
904        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
905        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
906        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
907        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
908        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
909        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
910        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
911        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
912        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
913        QueryExpr::QueueCommand(query) => match query {
914            QueueCommand::Push { queue, .. }
915            | QueueCommand::Pop { queue, .. }
916            | QueueCommand::Peek { queue, .. }
917            | QueueCommand::Len { queue }
918            | QueueCommand::Purge { queue }
919            | QueueCommand::GroupCreate { queue, .. }
920            | QueueCommand::GroupRead { queue, .. }
921            | QueueCommand::Pending { queue, .. }
922            | QueueCommand::Claim { queue, .. }
923            | QueueCommand::Ack { queue, .. }
924            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
925            QueueCommand::Move {
926                source,
927                destination,
928                ..
929            } => {
930                cache_scope_insert(scopes, source);
931                cache_scope_insert(scopes, destination);
932            }
933        },
934        QueryExpr::EventsBackfill(query) => {
935            cache_scope_insert(scopes, &query.collection);
936            cache_scope_insert(scopes, &query.target_queue);
937        }
938        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
939        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
940        QueryExpr::TreeCommand(query) => match query {
941            TreeCommand::Insert { collection, .. }
942            | TreeCommand::Move { collection, .. }
943            | TreeCommand::Delete { collection, .. }
944            | TreeCommand::Validate { collection, .. }
945            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
946        },
947        QueryExpr::SearchCommand(query) => match query {
948            SearchCommand::Similar { collection, .. }
949            | SearchCommand::Hybrid { collection, .. }
950            | SearchCommand::SpatialRadius { collection, .. }
951            | SearchCommand::SpatialBbox { collection, .. }
952            | SearchCommand::SpatialNearest { collection, .. } => {
953                cache_scope_insert(scopes, collection);
954            }
955            SearchCommand::Text { collection, .. }
956            | SearchCommand::Multimodal { collection, .. }
957            | SearchCommand::Index { collection, .. }
958            | SearchCommand::Context { collection, .. } => {
959                if let Some(collection) = collection.as_deref() {
960                    cache_scope_insert(scopes, collection);
961                }
962            }
963        },
964        QueryExpr::Ask(query) => {
965            if let Some(collection) = query.collection.as_deref() {
966                cache_scope_insert(scopes, collection);
967            }
968        }
969        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
970        QueryExpr::MaintenanceCommand(cmd) => match cmd {
971            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
972            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
973                if let Some(t) = target {
974                    cache_scope_insert(scopes, t);
975                }
976            }
977        },
978        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
979        QueryExpr::CreateView(cmd) => {
980            cache_scope_insert(scopes, &cmd.name);
981            // Invalidating the view should also invalidate its dependencies.
982            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
983        }
984        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
985        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
986        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
987        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
988        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
989        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
990        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
991        QueryExpr::Graph(_)
992        | QueryExpr::GraphCommand(_)
993        | QueryExpr::ProbabilisticCommand(_)
994        | QueryExpr::SetConfig { .. }
995        | QueryExpr::ShowConfig { .. }
996        | QueryExpr::SetSecret { .. }
997        | QueryExpr::DeleteSecret { .. }
998        | QueryExpr::ShowSecrets { .. }
999        | QueryExpr::SetTenant(_)
1000        | QueryExpr::ShowTenant
1001        | QueryExpr::TransactionControl(_)
1002        | QueryExpr::CreateSchema(_)
1003        | QueryExpr::DropSchema(_)
1004        | QueryExpr::CreateSequence(_)
1005        | QueryExpr::DropSequence(_)
1006        | QueryExpr::Grant(_)
1007        | QueryExpr::Revoke(_)
1008        | QueryExpr::AlterUser(_)
1009        | QueryExpr::CreateIamPolicy { .. }
1010        | QueryExpr::DropIamPolicy { .. }
1011        | QueryExpr::AttachPolicy { .. }
1012        | QueryExpr::DetachPolicy { .. }
1013        | QueryExpr::ShowPolicies { .. }
1014        | QueryExpr::ShowEffectivePermissions { .. }
1015        | QueryExpr::SimulatePolicy { .. }
1016        | QueryExpr::CreateMigration(_)
1017        | QueryExpr::ApplyMigration(_)
1018        | QueryExpr::RollbackMigration(_)
1019        | QueryExpr::ExplainMigration(_)
1020        | QueryExpr::EventsBackfillStatus { .. } => {}
1021        QueryExpr::KvCommand(cmd) => {
1022            use crate::storage::query::ast::KvCommand;
1023            match cmd {
1024                KvCommand::Put { collection, .. }
1025                | KvCommand::InvalidateTags { collection, .. }
1026                | KvCommand::Get { collection, .. }
1027                | KvCommand::Unseal { collection, .. }
1028                | KvCommand::Rotate { collection, .. }
1029                | KvCommand::History { collection, .. }
1030                | KvCommand::List { collection, .. }
1031                | KvCommand::Purge { collection, .. }
1032                | KvCommand::Watch { collection, .. }
1033                | KvCommand::Delete { collection, .. }
1034                | KvCommand::Incr { collection, .. }
1035                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
1036            }
1037        }
1038        QueryExpr::ConfigCommand(cmd) => {
1039            use crate::storage::query::ast::ConfigCommand;
1040            match cmd {
1041                ConfigCommand::Put { collection, .. }
1042                | ConfigCommand::Get { collection, .. }
1043                | ConfigCommand::Resolve { collection, .. }
1044                | ConfigCommand::Rotate { collection, .. }
1045                | ConfigCommand::Delete { collection, .. }
1046                | ConfigCommand::History { collection, .. }
1047                | ConfigCommand::List { collection, .. }
1048                | ConfigCommand::Watch { collection, .. }
1049                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
1050                    cache_scope_insert(scopes, collection)
1051                }
1052            }
1053        }
1054    }
1055}
1056
1057/// Combine matching RLS policies for a table + action into a single
1058/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1059///
1060/// Returns `None` when RLS is disabled or no policy admits the caller's
1061/// role — callers use that to short-circuit the mutation (for DELETE /
1062/// UPDATE we simply skip the operation, which PG expresses as "no rows
1063/// match the policy + predicate combination").
1064pub(crate) fn rls_policy_filter(
1065    runtime: &RedDBRuntime,
1066    table: &str,
1067    action: crate::storage::query::ast::PolicyAction,
1068) -> Option<crate::storage::query::ast::Filter> {
1069    rls_policy_filter_for_kind(
1070        runtime,
1071        table,
1072        action,
1073        crate::storage::query::ast::PolicyTargetKind::Table,
1074    )
1075}
1076
1077/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1078/// Graph / vector / queue / timeseries scans pass the concrete kind;
1079/// policies targeting other kinds are ignored. Legacy Table-scoped
1080/// policies still apply cross-kind — callers register auto-tenancy
1081/// policies as Table today.
1082pub(crate) fn rls_policy_filter_for_kind(
1083    runtime: &RedDBRuntime,
1084    table: &str,
1085    action: crate::storage::query::ast::PolicyAction,
1086    kind: crate::storage::query::ast::PolicyTargetKind,
1087) -> Option<crate::storage::query::ast::Filter> {
1088    use crate::storage::query::ast::Filter;
1089
1090    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1091        return None;
1092    }
1093    let role = current_auth_identity().map(|(_, role)| role);
1094    let role_str = role.map(|r| r.as_str().to_string());
1095    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1096    if policies.is_empty() {
1097        return None;
1098    }
1099    policies
1100        .into_iter()
1101        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1102}
1103
1104/// Returns true when the table has RLS enforcement enabled. Convenience
1105/// shortcut so DML paths can gate the AND-combine work without reaching
1106/// into `runtime.inner.rls_enabled_tables` directly.
1107pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1108    runtime.inner.rls_enabled_tables.read().contains(table)
1109}
1110
1111/// Per-entity gate used by the graph materialiser for `GraphNode`
1112/// entities. RLS is checked against the source collection with
1113/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1114/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1115/// (for back-compat with auto-tenancy declarations). Cached per
1116/// collection so big graphs only resolve the policy chain once.
1117fn node_passes_rls(
1118    runtime: &RedDBRuntime,
1119    collection: &str,
1120    role: Option<&str>,
1121    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1122    entity: &crate::storage::unified::entity::UnifiedEntity,
1123) -> bool {
1124    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1125
1126    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1127        return true;
1128    }
1129    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1130        let policies = runtime.matching_rls_policies_for_kind(
1131            collection,
1132            role,
1133            PolicyAction::Select,
1134            PolicyTargetKind::Nodes,
1135        );
1136        if policies.is_empty() {
1137            None
1138        } else {
1139            policies
1140                .into_iter()
1141                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1142        }
1143    });
1144    let Some(filter) = filter else {
1145        return false;
1146    };
1147    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1148        Some(&runtime.inner.db),
1149        entity,
1150        filter,
1151        collection,
1152        collection,
1153    )
1154}
1155
1156/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1157/// `kind = Edges`.
1158fn edge_passes_rls(
1159    runtime: &RedDBRuntime,
1160    collection: &str,
1161    role: Option<&str>,
1162    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1163    entity: &crate::storage::unified::entity::UnifiedEntity,
1164) -> bool {
1165    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1166
1167    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1168        return true;
1169    }
1170    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1171        let policies = runtime.matching_rls_policies_for_kind(
1172            collection,
1173            role,
1174            PolicyAction::Select,
1175            PolicyTargetKind::Edges,
1176        );
1177        if policies.is_empty() {
1178            None
1179        } else {
1180            policies
1181                .into_iter()
1182                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1183        }
1184    });
1185    let Some(filter) = filter else {
1186        return false;
1187    };
1188    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1189        Some(&runtime.inner.db),
1190        entity,
1191        filter,
1192        collection,
1193        collection,
1194    )
1195}
1196
1197/// RLS policy injection (Phase 2.5.2 PG parity).
1198///
1199/// Fetch every matching policy for the current thread-local role and
1200/// fold them into the query's filter. Semantics mirror PostgreSQL:
1201///
1202/// * Multiple policies on the same table combine with **OR** — a row is
1203///   visible if *any* policy admits it.
1204/// * The combined policy predicate is **AND**-ed into the caller's
1205///   existing `WHERE` clause so explicit predicates continue to trim
1206///   the policy-allowed set.
1207/// * No matching policies + RLS enabled = zero rows (PG's
1208///   restrictive-default). Callers get `None` and return an empty
1209///   `UnifiedResult` without ever dispatching the scan.
1210///
1211/// This runs only when `RuntimeInner::rls_enabled_tables` already
1212/// contains the table name — callers gate the hot path upfront to
1213/// avoid the lock acquisition on tables without RLS.
1214///
1215/// Returns `None` when no policy admits the current role; returns
1216/// `Some(mutated_table)` with policy filters folded in otherwise.
1217fn inject_rls_filters(
1218    runtime: &RedDBRuntime,
1219    frame: &dyn super::statement_frame::ReadFrame,
1220    mut table: crate::storage::query::ast::TableQuery,
1221) -> Option<crate::storage::query::ast::TableQuery> {
1222    use crate::storage::query::ast::{Filter, PolicyAction};
1223
1224    // `None` role falls through to policies with no `TO role` clause.
1225    let role = frame.identity().map(|(_, role)| role);
1226    let role_str = role.map(|r| r.as_str().to_string());
1227    let policies =
1228        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1229
1230    if policies.is_empty() {
1231        // RLS enabled + no policy match = deny everything. Signal the
1232        // caller to short-circuit with an empty result set.
1233        return None;
1234    }
1235
1236    // Combine policy predicates with OR (PG's permissive default).
1237    let combined = policies
1238        .into_iter()
1239        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1240        .expect("policies non-empty");
1241
1242    // AND into the caller's existing filter.
1243    table.filter = Some(match table.filter.take() {
1244        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1245        None => combined,
1246    });
1247    Some(table)
1248}
1249
1250/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1251/// predicate into the join's outer filter. Walking the merged record
1252/// at the join layer (rather than mutating the per-side scan filter)
1253/// keeps the planner's strategy choice and per-side index selection
1254/// undisturbed — the policy predicate uses the qualified `t.col` form
1255/// that resolves cleanly against the merged record's keys.
1256///
1257/// Returns `None` when any leaf has RLS enabled and no policy admits
1258/// the caller — the join short-circuits to an empty result.
1259fn inject_rls_into_join(
1260    runtime: &RedDBRuntime,
1261    frame: &dyn super::statement_frame::ReadFrame,
1262    mut join: crate::storage::query::ast::JoinQuery,
1263) -> Option<crate::storage::query::ast::JoinQuery> {
1264    use crate::storage::query::ast::Filter;
1265
1266    let mut policy_filters: Vec<Filter> = Vec::new();
1267    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1268        return None;
1269    }
1270    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1271        return None;
1272    }
1273
1274    if policy_filters.is_empty() {
1275        return Some(join);
1276    }
1277
1278    let combined = policy_filters
1279        .into_iter()
1280        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1281        .expect("policy_filters non-empty");
1282
1283    join.filter = Some(match join.filter.take() {
1284        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1285        None => combined,
1286    });
1287
1288    Some(join)
1289}
1290
1291/// For each `Table` leaf reachable through nested joins, append the
1292/// RLS-policy filter (combined with OR across that side's matching
1293/// policies) into `out`. Returns `false` when a side has RLS enabled
1294/// but no policy admits the caller — the join must short-circuit.
1295fn collect_join_side_policy(
1296    runtime: &RedDBRuntime,
1297    frame: &dyn super::statement_frame::ReadFrame,
1298    expr: &crate::storage::query::ast::QueryExpr,
1299    out: &mut Vec<crate::storage::query::ast::Filter>,
1300) -> bool {
1301    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1302    match expr {
1303        QueryExpr::Table(t) => {
1304            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1305                return true;
1306            }
1307            let role = frame.identity().map(|(_, role)| role);
1308            let role_str = role.map(|r| r.as_str().to_string());
1309            let policies =
1310                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1311            if policies.is_empty() {
1312                return false;
1313            }
1314            let combined = policies
1315                .into_iter()
1316                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1317                .expect("policies non-empty");
1318            out.push(combined);
1319            true
1320        }
1321        QueryExpr::Join(inner) => {
1322            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1323                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1324        }
1325        _ => true,
1326    }
1327}
1328
1329/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
1330///
1331/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
1332/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
1333/// materialises all rows. Projections are best-effort — when the query
1334/// lists explicit columns we keep only those; a `SELECT *` keeps every
1335/// wrapper-emitted field verbatim.
1336///
1337/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
1338/// the runtime will pass the compiled filter down instead of post-filtering.
1339fn apply_foreign_table_filters(
1340    records: Vec<crate::storage::query::unified::UnifiedRecord>,
1341    query: &crate::storage::query::ast::TableQuery,
1342) -> crate::storage::query::unified::UnifiedResult {
1343    use crate::storage::query::sql_lowering::{
1344        effective_table_filter, effective_table_projections,
1345    };
1346    use crate::storage::query::unified::UnifiedResult;
1347
1348    let filter = effective_table_filter(query);
1349    let projections = effective_table_projections(query);
1350
1351    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
1352    // match native-collection queries (same operators, same NULL handling).
1353    let mut filtered: Vec<_> = records
1354        .into_iter()
1355        .filter(|record| match &filter {
1356            Some(f) => {
1357                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
1358            }
1359            None => true,
1360        })
1361        .collect();
1362
1363    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
1364    if let Some(offset) = query.offset {
1365        let offset = offset as usize;
1366        if offset >= filtered.len() {
1367            filtered.clear();
1368        } else {
1369            filtered.drain(0..offset);
1370        }
1371    }
1372    if let Some(limit) = query.limit {
1373        filtered.truncate(limit as usize);
1374    }
1375
1376    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
1377    // the wrapper's column set; an explicit list trims to those names.
1378    let columns: Vec<String> = if projections.is_empty() {
1379        filtered
1380            .first()
1381            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
1382            .unwrap_or_default()
1383    } else {
1384        projections
1385            .iter()
1386            .map(super::join_filter::projection_name)
1387            .collect()
1388    };
1389
1390    let mut result = UnifiedResult::empty();
1391    result.columns = columns;
1392    result.records = filtered;
1393    result
1394}
1395
1396/// Collect every concrete table reference inside a `QueryExpr`.
1397///
1398/// Used by view bookkeeping (dependency tracking for materialised
1399/// invalidation) and any other rewriter that needs to know the base
1400/// tables a query pulls from. Does not descend into projections/filters;
1401/// only the `FROM` side.
1402pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1403    let mut scopes: HashSet<String> = HashSet::new();
1404    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1405    scopes.into_iter().collect()
1406}
1407
1408fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1409    let mut scopes = HashSet::new();
1410    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1411    scopes
1412}
1413
// Result-cache tunables.
//
// NOTE(review): presumably the config key under which the result-cache
// backend selection is stored — confirm at the lookup site.
const RESULT_CACHE_BACKEND_KEY: &str = "runtime.result_cache.backend";
// NOTE(review): presumably the backend name assumed when the key above
// is unset — confirm.
const RESULT_CACHE_DEFAULT_BACKEND: &str = "legacy";
// NOTE(review): presumably the blob-cache namespace used for result
// cache entries — confirm.
const RESULT_CACHE_BLOB_NAMESPACE: &str = "runtime.result_cache";
// NOTE(review): presumably the entry time-to-live in seconds — confirm
// where expiry is enforced.
const RESULT_CACHE_TTL_SECS: u64 = 30;
// Upper bound on the in-memory map size enforced by `trim_result_cache`.
const RESULT_CACHE_MAX_ENTRIES: usize = 1000;
// Eight-byte magic tag. NOTE(review): presumably written at the start
// of an encoded cache payload — confirm in the encode/decode pair.
const RESULT_CACHE_PAYLOAD_MAGIC: &[u8; 8] = b"RDRC0001";
1420
/// Which storage backs the runtime result cache.
///
/// NOTE(review): variant semantics are not visible in this part of the
/// file. Presumably `Legacy` is the in-process map, `BlobCache` the
/// blob-cache store, and `Shadow` a mode that exercises both — confirm
/// against the code that dispatches on this enum.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RuntimeResultCacheBackend {
    Legacy,
    BlobCache,
    Shadow,
}
1427
1428fn trim_result_cache(
1429    map: &mut HashMap<String, RuntimeResultCacheEntry>,
1430    order: &mut std::collections::VecDeque<String>,
1431) {
1432    while map.len() > RESULT_CACHE_MAX_ENTRIES {
1433        if let Some(oldest) = order.pop_front() {
1434            map.remove(&oldest);
1435        } else {
1436            break;
1437        }
1438    }
1439}
1440
1441fn result_cache_fingerprint(result: &RuntimeQueryResult) -> String {
1442    format!(
1443        "{:?}|{}|{}|{}|{}|{:?}",
1444        result.result,
1445        result.query,
1446        result.statement,
1447        result.engine,
1448        result.affected_rows,
1449        result.statement_type
1450    )
1451}
1452
1453fn mode_to_byte(mode: crate::storage::query::modes::QueryMode) -> u8 {
1454    match mode {
1455        crate::storage::query::modes::QueryMode::Sql => 0,
1456        crate::storage::query::modes::QueryMode::Gremlin => 1,
1457        crate::storage::query::modes::QueryMode::Cypher => 2,
1458        crate::storage::query::modes::QueryMode::Sparql => 3,
1459        crate::storage::query::modes::QueryMode::Path => 4,
1460        crate::storage::query::modes::QueryMode::Natural => 5,
1461        crate::storage::query::modes::QueryMode::Unknown => 255,
1462    }
1463}
1464
1465fn mode_from_byte(byte: u8) -> Option<crate::storage::query::modes::QueryMode> {
1466    match byte {
1467        0 => Some(crate::storage::query::modes::QueryMode::Sql),
1468        1 => Some(crate::storage::query::modes::QueryMode::Gremlin),
1469        2 => Some(crate::storage::query::modes::QueryMode::Cypher),
1470        3 => Some(crate::storage::query::modes::QueryMode::Sparql),
1471        4 => Some(crate::storage::query::modes::QueryMode::Path),
1472        5 => Some(crate::storage::query::modes::QueryMode::Natural),
1473        255 => Some(crate::storage::query::modes::QueryMode::Unknown),
1474        _ => None,
1475    }
1476}
1477
/// Map a runtime string onto the matching `'static` literal from the
/// fixed set of labels this function recognises.
///
/// Returns `None` for anything outside that set; matching is exact and
/// case-sensitive.
fn result_cache_static_str(value: &str) -> Option<&'static str> {
    const KNOWN_LABELS: &[&str] = &[
        "select",
        "materialized-graph",
        "runtime-red-schema",
        "runtime-fdw",
        "runtime-table-rls",
        "runtime-table",
        "runtime-join-rls",
        "runtime-join",
        "runtime-vector",
        "runtime-hybrid",
        "runtime-secret",
        "runtime-config",
        "runtime-tenant",
        "runtime-explain",
        "runtime-tree",
        "runtime-kv",
        "runtime-queue",
    ];

    KNOWN_LABELS
        .iter()
        .copied()
        .find(|label| *label == value)
}
1500
/// Append `value` to `out` as a little-endian `u32`.
///
/// Returns `None`, leaving `out` untouched, when `value` does not fit
/// in 32 bits.
fn write_u32(out: &mut Vec<u8>, value: usize) -> Option<()> {
    let encoded = u32::try_from(value).ok()?.to_le_bytes();
    out.extend_from_slice(&encoded);
    Some(())
}
1506
1507fn write_string(out: &mut Vec<u8>, value: &str) -> Option<()> {
1508    write_u32(out, value.len())?;
1509    out.extend_from_slice(value.as_bytes());
1510    Some(())
1511}
1512
1513fn write_bytes(out: &mut Vec<u8>, value: &[u8]) -> Option<()> {
1514    write_u32(out, value.len())?;
1515    out.extend_from_slice(value);
1516    Some(())
1517}
1518
/// Pops one byte off the front of the `input` cursor.
///
/// Returns `None`, leaving the cursor unchanged, when `input` is empty.
fn read_u8(input: &mut &[u8]) -> Option<u8> {
    let first = *input.first()?;
    *input = &input[1..];
    Some(first)
}
1524
/// Reads a 4-byte little-endian `u32` from the front of `input`,
/// advances the cursor past it, and returns the value widened to
/// `usize`.
///
/// Returns `None`, leaving the cursor unchanged, when fewer than four
/// bytes remain.
fn read_u32(input: &mut &[u8]) -> Option<usize> {
    let head: [u8; 4] = input.get(..4)?.try_into().ok()?;
    *input = &input[4..];
    Some(u32::from_le_bytes(head) as usize)
}
1533
/// Reads an 8-byte little-endian `u64` from the front of `input` and
/// advances the cursor past it.
///
/// Returns `None`, leaving the cursor unchanged, when fewer than eight
/// bytes remain.
fn read_u64(input: &mut &[u8]) -> Option<u64> {
    if input.len() < 8 {
        return None;
    }
    let (head, tail) = input.split_at(8);
    let raw: [u8; 8] = head.try_into().ok()?;
    *input = tail;
    Some(u64::from_le_bytes(raw))
}
1542
1543fn read_string(input: &mut &[u8]) -> Option<String> {
1544    let len = read_u32(input)?;
1545    if input.len() < len {
1546        return None;
1547    }
1548    let value = String::from_utf8(input[..len].to_vec()).ok()?;
1549    *input = &input[len..];
1550    Some(value)
1551}
1552
1553fn read_bytes<'a>(input: &mut &'a [u8]) -> Option<&'a [u8]> {
1554    let len = read_u32(input)?;
1555    if input.len() < len {
1556        return None;
1557    }
1558    let value = &input[..len];
1559    *input = &input[len..];
1560    Some(value)
1561}
1562
1563fn encode_result_cache_payload(entry: &RuntimeResultCacheEntry) -> Option<Vec<u8>> {
1564    let result = &entry.result;
1565    if result.result.pre_serialized_json.is_some()
1566        || result_cache_static_str(result.statement).is_none()
1567        || result_cache_static_str(result.engine).is_none()
1568        || result_cache_static_str(result.statement_type).is_none()
1569        || result.result.records.iter().any(|record| {
1570            !record.nodes.is_empty()
1571                || !record.edges.is_empty()
1572                || !record.paths.is_empty()
1573                || !record.vector_results.is_empty()
1574        })
1575    {
1576        return None;
1577    }
1578
1579    let mut out = Vec::new();
1580    out.extend_from_slice(RESULT_CACHE_PAYLOAD_MAGIC);
1581    write_string(&mut out, &result.query)?;
1582    out.push(mode_to_byte(result.mode));
1583    write_string(&mut out, result.statement)?;
1584    write_string(&mut out, result.engine)?;
1585    out.extend_from_slice(&result.affected_rows.to_le_bytes());
1586    write_string(&mut out, result.statement_type)?;
1587
1588    write_u32(&mut out, result.result.columns.len())?;
1589    for column in &result.result.columns {
1590        write_string(&mut out, column)?;
1591    }
1592    out.extend_from_slice(&result.result.stats.nodes_scanned.to_le_bytes());
1593    out.extend_from_slice(&result.result.stats.edges_scanned.to_le_bytes());
1594    out.extend_from_slice(&result.result.stats.rows_scanned.to_le_bytes());
1595    out.extend_from_slice(&result.result.stats.exec_time_us.to_le_bytes());
1596
1597    write_u32(&mut out, result.result.records.len())?;
1598    for record in &result.result.records {
1599        let fields = record.iter_fields().collect::<Vec<_>>();
1600        write_u32(&mut out, fields.len())?;
1601        for (name, value) in fields {
1602            write_string(&mut out, name)?;
1603            let mut encoded = Vec::new();
1604            crate::storage::schema::value_codec::encode(value, &mut encoded);
1605            write_bytes(&mut out, &encoded)?;
1606        }
1607    }
1608
1609    write_u32(&mut out, entry.scopes.len())?;
1610    for scope in &entry.scopes {
1611        write_string(&mut out, scope)?;
1612    }
1613    Some(out)
1614}
1615
1616fn decode_result_cache_payload(mut input: &[u8]) -> Option<(RuntimeQueryResult, HashSet<String>)> {
1617    if input.len() < RESULT_CACHE_PAYLOAD_MAGIC.len()
1618        || &input[..RESULT_CACHE_PAYLOAD_MAGIC.len()] != RESULT_CACHE_PAYLOAD_MAGIC
1619    {
1620        return None;
1621    }
1622    input = &input[RESULT_CACHE_PAYLOAD_MAGIC.len()..];
1623
1624    let query = read_string(&mut input)?;
1625    let mode = mode_from_byte(read_u8(&mut input)?)?;
1626    let statement = result_cache_static_str(&read_string(&mut input)?)?;
1627    let engine = result_cache_static_str(&read_string(&mut input)?)?;
1628    let affected_rows = read_u64(&mut input)?;
1629    let statement_type = result_cache_static_str(&read_string(&mut input)?)?;
1630
1631    let mut columns = Vec::new();
1632    for _ in 0..read_u32(&mut input)? {
1633        columns.push(read_string(&mut input)?);
1634    }
1635    let stats = crate::storage::query::unified::QueryStats {
1636        nodes_scanned: read_u64(&mut input)?,
1637        edges_scanned: read_u64(&mut input)?,
1638        rows_scanned: read_u64(&mut input)?,
1639        exec_time_us: read_u64(&mut input)?,
1640    };
1641
1642    let mut records = Vec::new();
1643    for _ in 0..read_u32(&mut input)? {
1644        let mut record = crate::storage::query::unified::UnifiedRecord::new();
1645        for _ in 0..read_u32(&mut input)? {
1646            let name = read_string(&mut input)?;
1647            let bytes = read_bytes(&mut input)?;
1648            let (value, used) = crate::storage::schema::value_codec::decode(bytes).ok()?;
1649            if used != bytes.len() {
1650                return None;
1651            }
1652            record.set_owned(name, value);
1653        }
1654        records.push(record);
1655    }
1656
1657    let mut scopes = HashSet::new();
1658    for _ in 0..read_u32(&mut input)? {
1659        scopes.insert(read_string(&mut input)?);
1660    }
1661    if !input.is_empty() {
1662        return None;
1663    }
1664
1665    Some((
1666        RuntimeQueryResult {
1667            query,
1668            mode,
1669            statement,
1670            engine,
1671            result: crate::storage::query::unified::UnifiedResult {
1672                columns,
1673                records,
1674                stats,
1675                pre_serialized_json: None,
1676            },
1677            affected_rows,
1678            statement_type,
1679        },
1680        scopes,
1681    ))
1682}
1683
/// If `sql` starts with the keyword `EXPLAIN` (ASCII case-insensitive,
/// leading whitespace ignored) followed by a statement, return that
/// inner statement with its leading whitespace removed; otherwise
/// return `None`.
///
/// Two follow-up keywords are deliberately left alone (`None`):
///
/// - `EXPLAIN ALTER FOR CREATE TABLE ...` is a separate schema-diff
///   command handled inside the normal SQL parser.
/// - `EXPLAIN ASK` is an executable read path: it runs retrieval and
///   provider selection, then short-circuits before the LLM call.
///
/// A bare `EXPLAIN` with nothing after it also yields `None`.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    let statement = sql.trim_start();
    let keyword_end = statement
        .find(char::is_whitespace)
        .unwrap_or(statement.len());
    let (keyword, tail) = statement.split_at(keyword_end);
    if !keyword.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    let inner = tail.trim_start();
    // `None` here means nothing follows the EXPLAIN keyword.
    let next_token = inner.split_whitespace().next()?;

    // ALTER and ASK are deferred to the normal parser.
    let deferred = ["ALTER", "ASK"]
        .iter()
        .any(|reserved| next_token.eq_ignore_ascii_case(reserved));
    if deferred {
        None
    } else {
        Some(inner)
    }
}
1722
1723/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1724/// CTE-aware parse in `execute_query` without paying for a full
1725/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1726/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1727pub(super) fn has_with_prefix(sql: &str) -> bool {
1728    let trimmed = sql.trim_start();
1729    let head_end = trimmed
1730        .find(|c: char| c.is_whitespace() || c == '(')
1731        .unwrap_or(trimmed.len());
1732    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1733}
1734
1735/// If the query is a plain SELECT whose top-level `TableQuery`
1736/// carries an `AS OF` clause, return a typed spec that the runtime
1737/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1738/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1739/// back to the connection's regular MVCC snapshot. A cheap textual
1740/// prefilter skips the parse entirely when the source doesn't
1741/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
1742fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1743    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1744}
1745
1746/// Same as `peek_top_level_as_of` but also returns the table name
1747/// targeted by the AS OF clause (when the FROM clause names a
1748/// concrete table). `None` for the table slot means scalar SELECT
1749/// or a subquery source — callers treat those as "no enforcement".
1750pub(super) fn peek_top_level_as_of_with_table(
1751    sql: &str,
1752) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
1753    if !sql
1754        .as_bytes()
1755        .windows(5)
1756        .any(|w| w.eq_ignore_ascii_case(b"as of"))
1757    {
1758        return None;
1759    }
1760    let parsed = crate::storage::query::parser::parse(sql).ok()?;
1761    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
1762        return None;
1763    };
1764    let clause = table.as_of?;
1765    let table_name = if table.table.is_empty() || table.table == "any" {
1766        None
1767    } else {
1768        Some(table.table.clone())
1769    };
1770    let spec = match clause {
1771        crate::storage::query::ast::AsOfClause::Commit(h) => {
1772            crate::application::vcs::AsOfSpec::Commit(h)
1773        }
1774        crate::storage::query::ast::AsOfClause::Branch(b) => {
1775            crate::application::vcs::AsOfSpec::Branch(b)
1776        }
1777        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
1778        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
1779            crate::application::vcs::AsOfSpec::TimestampMs(ts)
1780        }
1781        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
1782            crate::application::vcs::AsOfSpec::Snapshot(x)
1783        }
1784    };
1785    Some((spec, table_name))
1786}
1787
1788pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
1789    // Lowercase the bytes up to the first null/newline into a small
1790    // stack buffer for cheap contains() checks. Most SQL fits in the
1791    // buffer; longer queries fall back to owned lowercase.
1792    const VOLATILE_TOKENS: &[&str] = &[
1793        "pg_advisory_lock",
1794        "pg_try_advisory_lock",
1795        "pg_advisory_unlock",
1796        "random()",
1797        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
1798        // omitted for now — they ARE volatile but today's tests rely
1799        // on caching them. Revisit once a tighter volatility story
1800        // lands.
1801    ];
1802    let lowered = sql.to_ascii_lowercase();
1803    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
1804}
1805
1806pub(super) fn query_is_ask_statement(sql: &str) -> bool {
1807    let trimmed = sql.trim_start();
1808    let head_end = trimmed
1809        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
1810        .unwrap_or(trimmed.len());
1811    trimmed[..head_end].eq_ignore_ascii_case("ASK")
1812}
1813
1814/// Pick the `(global_mode, collection_mode)` pair for an expression,
1815/// or `None` for variants that opt out of intent-locking entirely
1816/// (admin statements like `SHOW CONFIG`, transaction control, tenant
1817/// toggles).
1818///
1819/// Phase-1 contract:
1820/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
1821/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
1822/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
1823pub(super) fn intent_lock_modes_for(
1824    expr: &QueryExpr,
1825) -> Option<(
1826    crate::storage::transaction::lock::LockMode,
1827    crate::storage::transaction::lock::LockMode,
1828)> {
1829    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
1830
1831    match expr {
1832        // Reads — IS / IS.
1833        QueryExpr::Table(_)
1834        | QueryExpr::Join(_)
1835        | QueryExpr::Vector(_)
1836        | QueryExpr::Hybrid(_)
1837        | QueryExpr::Graph(_)
1838        | QueryExpr::Path(_)
1839        | QueryExpr::Ask(_)
1840        | QueryExpr::SearchCommand(_)
1841        | QueryExpr::GraphCommand(_)
1842        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
1843
1844        // Writes — IX / IX. Non-tabular mutations (vector insert,
1845        // graph node insert, queue push, timeseries point insert)
1846        // don't carry their own dispatch arm here; they ride through
1847        // the Insert variant or a command variant covered by the
1848        // read-side arm above. P1.T4 expands only the TableQuery-ish
1849        // writes; non-tabular kinds inherit when their DML variants
1850        // land in later phases.
1851        QueryExpr::Insert(_)
1852        | QueryExpr::Update(_)
1853        | QueryExpr::Delete(_)
1854        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
1855            Some((IntentExclusive, IntentExclusive))
1856        }
1857        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
1858
1859        // DDL — IX / X. A DDL against collection `c` blocks all
1860        // other writers + readers on `c` but leaves other collections
1861        // running (because Global stays IX, not X).
1862        QueryExpr::CreateTable(_)
1863        | QueryExpr::CreateCollection(_)
1864        | QueryExpr::CreateVector(_)
1865        | QueryExpr::DropTable(_)
1866        | QueryExpr::DropGraph(_)
1867        | QueryExpr::DropVector(_)
1868        | QueryExpr::DropDocument(_)
1869        | QueryExpr::DropKv(_)
1870        | QueryExpr::DropCollection(_)
1871        | QueryExpr::Truncate(_)
1872        | QueryExpr::AlterTable(_)
1873        | QueryExpr::CreateIndex(_)
1874        | QueryExpr::DropIndex(_)
1875        | QueryExpr::CreateTimeSeries(_)
1876        | QueryExpr::DropTimeSeries(_)
1877        | QueryExpr::CreateQueue(_)
1878        | QueryExpr::AlterQueue(_)
1879        | QueryExpr::DropQueue(_)
1880        | QueryExpr::CreateTree(_)
1881        | QueryExpr::DropTree(_)
1882        | QueryExpr::CreatePolicy(_)
1883        | QueryExpr::DropPolicy(_)
1884        | QueryExpr::CreateView(_)
1885        | QueryExpr::DropView(_)
1886        | QueryExpr::RefreshMaterializedView(_)
1887        | QueryExpr::CreateSchema(_)
1888        | QueryExpr::DropSchema(_)
1889        | QueryExpr::CreateSequence(_)
1890        | QueryExpr::DropSequence(_)
1891        | QueryExpr::CreateServer(_)
1892        | QueryExpr::DropServer(_)
1893        | QueryExpr::CreateForeignTable(_)
1894        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
1895
1896        // Admin / control — skip intent locks. `SET TENANT`,
1897        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
1898        // `VACUUM`, etc. don't touch collection data the same way
1899        // and the existing transaction layer already serialises the
1900        // pieces that matter.
1901        _ => None,
1902    }
1903}
1904
1905/// Best-effort collection inventory for an expression. Used to pick
1906/// `Collection(...)` resources for the intent-lock guard. Overshoots
1907/// are fine (take an extra IS, benign); undershoots leak writes past
1908/// DDL X locks, so err on the side of listing more names.
1909pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
1910    let mut out = Vec::new();
1911    walk_collections(expr, &mut out);
1912    out.sort();
1913    out.dedup();
1914    out
1915}
1916
1917fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
1918    match expr {
1919        QueryExpr::Table(t) => out.push(t.table.clone()),
1920        QueryExpr::Join(j) => {
1921            walk_collections(&j.left, out);
1922            walk_collections(&j.right, out);
1923        }
1924        QueryExpr::Insert(i) => out.push(i.table.clone()),
1925        QueryExpr::Update(u) => out.push(u.table.clone()),
1926        QueryExpr::Delete(d) => out.push(d.table.clone()),
1927        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
1928
1929        // DDL — include the target collection so DDL takes
1930        // `(Collection, X)` and blocks concurrent readers / writers
1931        // on the same collection. Other collections stay live
1932        // because Global is still IX.
1933        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
1934        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
1935        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
1936        QueryExpr::DropTable(q) => out.push(q.name.clone()),
1937        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
1938        QueryExpr::DropVector(q) => out.push(q.name.clone()),
1939        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
1940        QueryExpr::DropKv(q) => out.push(q.name.clone()),
1941        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
1942        QueryExpr::Truncate(q) => out.push(q.name.clone()),
1943        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
1944        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
1945        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
1946        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
1947        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
1948        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
1949        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
1950        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
1951        QueryExpr::QueueCommand(QueueCommand::Move {
1952            source,
1953            destination,
1954            ..
1955        }) => {
1956            out.push(source.clone());
1957            out.push(destination.clone());
1958        }
1959        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
1960        QueryExpr::CreateView(q) => out.push(q.name.clone()),
1961        QueryExpr::DropView(q) => out.push(q.name.clone()),
1962        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
1963
1964        // Vector / Hybrid / Graph / Path / commands reference
1965        // collections through fields whose shape varies; without a
1966        // uniform accessor we fall back to the global lock only —
1967        // benign because every runtime path still holds the global
1968        // mode.
1969        _ => {}
1970    }
1971}
1972
1973impl RedDBRuntime {
1974    pub fn in_memory() -> RedDBResult<Self> {
1975        Self::with_options(RedDBOptions::in_memory())
1976    }
1977
1978    /// Handle to the intent-lock manager for tests + introspection.
1979    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
1980    /// rather than touching the manager directly.
1981    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
1982        self.inner.lock_manager.clone()
1983    }
1984
1985    #[inline(never)]
1986    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
1987        Self::with_pool(options, ConnectionPoolConfig::default())
1988    }
1989
1990    pub fn with_pool(
1991        options: RedDBOptions,
1992        pool_config: ConnectionPoolConfig,
1993    ) -> RedDBResult<Self> {
1994        // PLAN.md Phase 9.1 — capture wall-clock before storage
1995        // open so the cold-start phase markers can be backfilled
1996        // once Lifecycle is constructed below. Storage open
1997        // encapsulates auto-restore + WAL replay; we treat the
1998        // whole window as one combined "restore" + "wal_replay"
1999        // phase split at the same boundary because the storage
2000        // layer doesn't yet emit a finer signal.
2001        let boot_open_start_ms = std::time::SystemTime::now()
2002            .duration_since(std::time::UNIX_EPOCH)
2003            .map(|d| d.as_millis() as u64)
2004            .unwrap_or(0);
2005        let db = Arc::new(
2006            RedDB::open_with_options(&options)
2007                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2008        );
2009        let result_blob_cache = crate::storage::cache::BlobCache::open_with_l2(
2010            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2011                options
2012                    .resolved_path("data.rdb")
2013                    .with_extension("result-cache.l2"),
2014            ),
2015        )
2016        .map_err(|err| {
2017            RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}"))
2018        })?;
2019        let storage_ready_ms = std::time::SystemTime::now()
2020            .duration_since(std::time::UNIX_EPOCH)
2021            .map(|d| d.as_millis() as u64)
2022            .unwrap_or(0);
2023
2024        let runtime = Self {
2025            inner: Arc::new(RuntimeInner {
2026                db,
2027                layout: PhysicalLayout::from_options(&options),
2028                indices: IndexCatalog::register_default_vector_graph(
2029                    options.has_capability(crate::api::Capability::Table),
2030                    options.has_capability(crate::api::Capability::Graph),
2031                ),
2032                pool_config,
2033                pool: Mutex::new(PoolState::default()),
2034                started_at_unix_ms: SystemTime::now()
2035                    .duration_since(UNIX_EPOCH)
2036                    .unwrap_or_default()
2037                    .as_millis(),
2038                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2039                index_store: super::index_store::IndexStore::new(),
2040                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2041                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2042                query_cache: parking_lot::RwLock::new(
2043                    crate::storage::query::planner::cache::PlanCache::new(1000),
2044                ),
2045                result_cache: parking_lot::RwLock::new((
2046                    HashMap::new(),
2047                    std::collections::VecDeque::new(),
2048                )),
2049                result_blob_cache,
2050                result_blob_entries: parking_lot::RwLock::new((
2051                    HashMap::new(),
2052                    std::collections::VecDeque::new(),
2053                )),
2054                ask_answer_cache_entries: parking_lot::RwLock::new((
2055                    HashSet::new(),
2056                    std::collections::VecDeque::new(),
2057                )),
2058                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2059                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2060                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2061                rmw_locks: RmwLockTable::new(),
2062                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2063                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2064                ec_worker: crate::ec::worker::EcWorker::new(),
2065                auth_store: parking_lot::RwLock::new(None),
2066                oauth_validator: parking_lot::RwLock::new(None),
2067                views: parking_lot::RwLock::new(HashMap::new()),
2068                materialized_views: parking_lot::RwLock::new(
2069                    crate::storage::cache::result::MaterializedViewCache::new(),
2070                ),
2071                retention_sweeper: parking_lot::RwLock::new(
2072                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
2073                ),
2074                snapshot_manager: Arc::new(
2075                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2076                ),
2077                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2078                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2079                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2080                lock_manager: Arc::new({
2081                    // Sourced from the matrix: Tier B key
2082                    // `concurrency.locking.deadlock_timeout_ms`
2083                    // (default 5000). Env var wins at boot so
2084                    // operators can tune without touching red_config.
2085                    let env = crate::runtime::config_overlay::collect_env_overrides();
2086                    let timeout_ms = env
2087                        .get("concurrency.locking.deadlock_timeout_ms")
2088                        .and_then(|raw| raw.parse::<u64>().ok())
2089                        .unwrap_or_else(|| {
2090                            match crate::runtime::config_matrix::default_for(
2091                                "concurrency.locking.deadlock_timeout_ms",
2092                            ) {
2093                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2094                                _ => 5000,
2095                            }
2096                        });
2097                    let cfg = crate::storage::transaction::lock::LockConfig {
2098                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2099                        ..Default::default()
2100                    };
2101                    crate::storage::transaction::lock::LockManager::new(cfg)
2102                }),
2103                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2104                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2105                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2106                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2107                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2108                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2109                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2110                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2111                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2112                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2113                    &options,
2114                )),
2115                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2116                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2117                audit_log: {
2118                    // Default audit-log path for the in-memory case
2119                    // sits in the system temp dir; persistent runs
2120                    // place it next to data.rdb.
2121                    //
2122                    // gh-471 iter 2: route through the resolved
2123                    // `LogDestination`. Performance/Max tiers emit a
2124                    // `File(...)` under `<dbname>.rdb.red/logs/`;
2125                    // lower tiers / ephemeral runs report `Stderr`
2126                    // and we keep the legacy file-next-to-data sink.
2127                    let data_path = options
2128                        .data_path
2129                        .clone()
2130                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2131                    let (audit_dest, _) =
2132                        crate::api::tier_wiring::current_log_destinations();
2133                    Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
2134                        &audit_dest,
2135                        &data_path,
2136                    ))
2137                },
2138                lease_lifecycle: std::sync::OnceLock::new(),
2139                replica_apply_metrics: crate::replication::logical::ReplicaApplyMetrics::default(),
2140                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2141                schema_vocabulary: parking_lot::RwLock::new(
2142                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2143                ),
2144                slow_query_logger: {
2145                    // Issue #205 — slow-query sink lives in the same
2146                    // directory the audit log uses, so backup/restore
2147                    // ships them together. Threshold + sample-pct
2148                    // default conservatively (1 s, 100% sampling) so
2149                    // emitted lines are rare and complete. Operators
2150                    // tune via env / config matrix in a follow-up.
2151                    //
2152                    // gh-471 iter 2: same routing as the audit log —
2153                    // `LogDestination::File(...)` for Performance/Max
2154                    // lands under `<dbname>.rdb.red/logs/slow.log`;
2155                    // lower tiers fall back to `red-slow.log` in the
2156                    // data directory.
2157                    let fallback_dir = options
2158                        .data_path
2159                        .as_ref()
2160                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2161                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2162                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2163                        .ok()
2164                        .and_then(|s| s.parse::<u64>().ok())
2165                        .unwrap_or(1000);
2166                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2167                        .ok()
2168                        .and_then(|s| s.parse::<u8>().ok())
2169                        .unwrap_or(100);
2170                    let (_, slow_dest) =
2171                        crate::api::tier_wiring::current_log_destinations();
2172                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
2173                        &slow_dest,
2174                        &fallback_dir,
2175                        threshold_ms,
2176                        sample_pct,
2177                    )
2178                },
2179                kv_stats: crate::runtime::KvStatsCounters::default(),
2180                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
2181                metrics_tenant_activity_stats:
2182                    crate::runtime::MetricsTenantActivityCounters::default(),
2183                queue_telemetry: Arc::new(
2184                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
2185                ),
2186                kv_tag_index: crate::runtime::KvTagIndex::default(),
2187                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
2188                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
2189            }),
2190        };
2191
2192        // Issue #205 — install the process-wide OperatorEvent sink so
2193        // emit sites buried in storage / replication / signal handlers
2194        // can record without threading an `&AuditLogger` through every
2195        // call stack. First registration wins; subsequent in-memory
2196        // runtimes (test harnesses) fall through to tracing+eprintln.
2197        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2198            &runtime.inner.audit_log,
2199        ));
2200
2201        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2202        // from the wall-clock captured before storage open. The
2203        // entire `RedDB::open_with_options` call covers both
2204        // auto-restore (when configured) and WAL replay. We
2205        // record both phases against the same boundary today;
2206        // a follow-up will split them once the storage layer
2207        // surfaces a finer-grained event.
2208        runtime
2209            .inner
2210            .lifecycle
2211            .set_restore_started_at_ms(boot_open_start_ms);
2212        runtime
2213            .inner
2214            .lifecycle
2215            .set_restore_ready_at_ms(storage_ready_ms);
2216        runtime
2217            .inner
2218            .lifecycle
2219            .set_wal_replay_started_at_ms(boot_open_start_ms);
2220        runtime
2221            .inner
2222            .lifecycle
2223            .set_wal_replay_ready_at_ms(storage_ready_ms);
2224
2225        let restored_cdc_lsn = runtime
2226            .inner
2227            .db
2228            .replication
2229            .as_ref()
2230            .map(|repl| {
2231                repl.logical_wal_spool
2232                    .as_ref()
2233                    .map(|spool| spool.current_lsn())
2234                    .unwrap_or(0)
2235            })
2236            .unwrap_or(0)
2237            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2238        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2239        runtime.rehydrate_snapshot_xid_floor();
2240        runtime.bootstrap_system_keyed_collections()?;
2241        runtime.rehydrate_declared_column_schemas();
2242        runtime.load_probabilistic_state()?;
2243
2244        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2245        // tables declared via `TENANT BY (col)` survive restart. Each
2246        // entry re-registers the auto-policy and flips RLS on again.
2247        runtime.rehydrate_tenant_tables();
2248        if let Some(repl) = &runtime.inner.db.replication {
2249            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2250        }
2251
2252        // Save system info to red_config on boot
2253        {
2254            let sys = SystemInfo::collect();
2255            runtime.inner.db.store().set_config_tree(
2256                "red.system",
2257                &crate::serde_json::json!({
2258                    "pid": sys.pid,
2259                    "cpu_cores": sys.cpu_cores,
2260                    "total_memory_bytes": sys.total_memory_bytes,
2261                    "available_memory_bytes": sys.available_memory_bytes,
2262                    "os": sys.os,
2263                    "arch": sys.arch,
2264                    "hostname": sys.hostname,
2265                    "started_at": SystemTime::now()
2266                        .duration_since(UNIX_EPOCH)
2267                        .unwrap_or_default()
2268                        .as_millis() as u64
2269                }),
2270            );
2271
2272            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2273            let store = runtime.inner.db.store();
2274            if store
2275                .get_collection("red_config")
2276                .map(|m| m.query_all(|_| true).len())
2277                .unwrap_or(0)
2278                <= 10
2279            {
2280                store.set_config_tree("red.ai", &crate::json!({
2281                    "default": crate::json!({
2282                        "provider": "openai",
2283                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2284                    }),
2285                    "max_embedding_inputs": 256,
2286                    "max_prompt_batch": 256,
2287                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2288                }));
2289                store.set_config_tree(
2290                    "red.server",
2291                    &crate::json!({
2292                        "max_scan_limit": 1000,
2293                        "max_body_size": 1048576,
2294                        "read_timeout_ms": 5000,
2295                        "write_timeout_ms": 5000
2296                    }),
2297                );
2298                store.set_config_tree(
2299                    "red.storage",
2300                    &crate::json!({
2301                        "page_size": 4096,
2302                        "page_cache_capacity": 100000,
2303                        "auto_checkpoint_pages": 1000,
2304                        "snapshot_retention": 16,
2305                        "verify_checksums": true,
2306                        "segment": crate::json!({
2307                            "max_entities": 100000,
2308                            "max_bytes": 268435456_u64,
2309                            "compression_level": 6
2310                        }),
2311                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2312                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2313                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2314                    }),
2315                );
2316                store.set_config_tree(
2317                    "red.search",
2318                    &crate::json!({
2319                        "rag": crate::json!({
2320                            "max_chunks_per_source": 10,
2321                            "max_total_chunks": 25,
2322                            "similarity_threshold": 0.8,
2323                            "graph_depth": 2,
2324                            "min_relevance": 0.3
2325                        }),
2326                        "fusion": crate::json!({
2327                            "vector_weight": 0.5,
2328                            "graph_weight": 0.3,
2329                            "table_weight": 0.2,
2330                            "dedup_threshold": 0.85
2331                        })
2332                    }),
2333                );
2334                store.set_config_tree(
2335                    "red.auth",
2336                    &crate::json!({
2337                        "enabled": false,
2338                        "session_ttl_secs": 3600,
2339                        "require_auth": false
2340                    }),
2341                );
2342                store.set_config_tree(
2343                    "red.query",
2344                    &crate::json!({
2345                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2346                        "max_recursion_depth": 1000
2347                    }),
2348                );
2349                store.set_config_tree(
2350                    "red.indexes",
2351                    &crate::json!({
2352                        "auto_select": true,
2353                        "bloom_filter": crate::json!({
2354                            "enabled": true,
2355                            "false_positive_rate": 0.01,
2356                            "prune_on_scan": true
2357                        }),
2358                        "hash": crate::json!({ "enabled": true }),
2359                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2360                        "spatial": crate::json!({ "enabled": true })
2361                    }),
2362                );
2363                store.set_config_tree(
2364                    "red.memtable",
2365                    &crate::json!({
2366                        "enabled": true,
2367                        "max_bytes": 67108864_u64,
2368                        "flush_threshold": 0.75
2369                    }),
2370                );
2371                store.set_config_tree(
2372                    "red.probabilistic",
2373                    &crate::json!({
2374                        "hll_registers": 16384,
2375                        "sketch_default_width": 1000,
2376                        "sketch_default_depth": 5,
2377                        "filter_default_capacity": 100000
2378                    }),
2379                );
2380                store.set_config_tree(
2381                    "red.timeseries",
2382                    &crate::json!({
2383                        "default_chunk_size": 1024,
2384                        "compression": crate::json!({
2385                            "timestamps": "delta_of_delta",
2386                            "values": "gorilla_xor"
2387                        }),
2388                        "default_retention_days": 0
2389                    }),
2390                );
2391                store.set_config_tree(
2392                    "red.queue",
2393                    &crate::json!({
2394                        "default_max_size": 0,
2395                        "default_max_attempts": 3,
2396                        "visibility_timeout_ms": 30000,
2397                        "consumer_idle_timeout_ms": 60000
2398                    }),
2399                );
2400                store.set_config_tree(
2401                    "red.backup",
2402                    &crate::json!({
2403                        "enabled": false,
2404                        "interval_secs": 3600,
2405                        "retention_count": 24,
2406                        "upload": false,
2407                        "backend": "local"
2408                    }),
2409                );
2410                store.set_config_tree(
2411                    "red.wal",
2412                    &crate::json!({
2413                        "archive": crate::json!({
2414                            "enabled": false,
2415                            "retention_hours": 168,
2416                            "prefix": "wal/"
2417                        })
2418                    }),
2419                );
2420                store.set_config_tree(
2421                    "red.cdc",
2422                    &crate::json!({
2423                        "enabled": true,
2424                        "buffer_size": 100000
2425                    }),
2426                );
2427                store.set_config_tree(
2428                    "red.config.secret",
2429                    &crate::json!({
2430                        "auto_encrypt": true,
2431                        "auto_decrypt": true
2432                    }),
2433                );
2434            }
2435
2436            // Perf-parity config matrix: heal the Tier A (critical)
2437            // keys unconditionally on every boot. Idempotent — only
2438            // writes the default when the key is missing. Keeps
2439            // `SHOW CONFIG` showing every guarantee the operator has
2440            // (durability.mode, concurrency.locking.enabled, …) even
2441            // on long-running datadirs that predate the matrix.
2442            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2443
2444            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2445            // `storage.btree.lehman_yao` value from the matrix (env
2446            // > file > red_config > default) and publish it to the
2447            // storage layer's atomic so the B-tree read / split
2448            // paths can branch without re-reading the config on
2449            // every hot-path call.
2450            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2451            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2452            if lehman_yao {
2453                tracing::info!(
2454                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2455                );
2456            }
2457
2458            // Config file overlay — mounted `/etc/reddb/config.json`
2459            // (override path via REDDB_CONFIG_FILE). Writes keys with
2460            // write-if-absent semantics so a later user `SET CONFIG`
2461            // always wins. Missing file = silent no-op.
2462            let overlay_path = crate::runtime::config_overlay::config_file_path();
2463            let _ =
2464                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2465        }
2466
2467        // VCS ("Git for Data") — create the `red_*` metadata
2468        // collections on first boot. Idempotent: `get_or_create_collection`
2469        // is a no-op if the collection already exists.
2470        {
2471            let store = runtime.inner.db.store();
2472            for name in crate::application::vcs_collections::ALL {
2473                let _ = store.get_or_create_collection(*name);
2474            }
2475            // Seed VCS config namespace with sensible defaults on first
2476            // boot, matching the pattern used by red.ai / red.storage.
2477            store.set_config_tree(
2478                crate::application::vcs_collections::CONFIG_NAMESPACE,
2479                &crate::json!({
2480                    "default_branch": "main",
2481                    "author": crate::json!({
2482                        "name": "reddb",
2483                        "email": "reddb@localhost"
2484                    }),
2485                    "protected_branches": crate::json!(["main"]),
2486                    "closure": crate::json!({
2487                        "enabled": true,
2488                        "lazy": true
2489                    }),
2490                    "merge": crate::json!({
2491                        "default_strategy": "auto",
2492                        "fast_forward": true
2493                    })
2494                }),
2495            );
2496        }
2497
2498        // Migrations — create the `red_migrations` / `red_migration_deps`
2499        // system collections on first boot. Idempotent.
2500        {
2501            let store = runtime.inner.db.store();
2502            for name in crate::application::migration_collections::ALL {
2503                let _ = store.get_or_create_collection(*name);
2504            }
2505        }
2506
2507        // Start background maintenance thread (context index refresh +
2508        // session purge). Held by a WEAK reference to `RuntimeInner`
2509        // so dropping the last `RedDBRuntime` handle actually releases
2510        // the underlying Arc<Pager> (and its file lock). Polling at
2511        // 200ms means shutdown latency is bounded; the real 60-second
2512        // work cadence is tracked independently via a `last_work`
2513        // timestamp.
2514        //
2515        // The previous version captured `rt = runtime.clone()` by
2516        // strong reference and ran an unterminated `loop`, which held
2517        // Arc<RuntimeInner> forever — reopening a persistent database
2518        // in the same process failed with "Database is locked" because
2519        // the pager could never drop. See the regression test
2520        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2521        {
2522            let weak = Arc::downgrade(&runtime.inner);
2523            std::thread::Builder::new()
2524                .name("reddb-maintenance".into())
2525                .spawn(move || {
2526                    let tick = std::time::Duration::from_millis(200);
2527                    let work_interval = std::time::Duration::from_secs(60);
2528                    let mut last_work = std::time::Instant::now();
2529                    loop {
2530                        std::thread::sleep(tick);
2531                        let Some(inner) = weak.upgrade() else {
2532                            // All strong references dropped — the
2533                            // runtime is gone, exit cleanly.
2534                            break;
2535                        };
2536                        if last_work.elapsed() >= work_interval {
2537                            let _stats = inner.db.store().context_index().stats();
2538                            last_work = std::time::Instant::now();
2539                        }
2540                    }
2541                })
2542                .ok();
2543        }
2544
2545        // Start backup scheduler if enabled via red_config
2546        {
2547            let store = runtime.inner.db.store();
2548            let mut backup_enabled = false;
2549            let mut backup_interval = 3600u64;
2550
2551            if let Some(manager) = store.get_collection("red_config") {
2552                manager.for_each_entity(|entity| {
2553                    if let Some(row) = entity.data.as_row() {
2554                        let key = row.get_field("key").and_then(|v| match v {
2555                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2556                            _ => None,
2557                        });
2558                        let val = row.get_field("value");
2559                        if key == Some("red.config.backup.enabled") {
2560                            backup_enabled = match val {
2561                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2562                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2563                                _ => false,
2564                            };
2565                        } else if key == Some("red.config.backup.interval_secs") {
2566                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2567                                backup_interval = *n as u64;
2568                            }
2569                        }
2570                    }
2571                    true
2572                });
2573            }
2574
2575            if backup_enabled {
2576                runtime.inner.backup_scheduler.set_interval(backup_interval);
2577                let rt = runtime.clone();
2578                runtime
2579                    .inner
2580                    .backup_scheduler
2581                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
2582            }
2583        }
2584
2585        // Load EC registry from red_config and start worker
2586        {
2587            runtime
2588                .inner
2589                .ec_registry
2590                .load_from_config_store(runtime.inner.db.store().as_ref());
2591            if !runtime.inner.ec_registry.async_configs().is_empty() {
2592                runtime.inner.ec_worker.start(
2593                    Arc::clone(&runtime.inner.ec_registry),
2594                    Arc::clone(&runtime.inner.db.store()),
2595                );
2596            }
2597        }
2598
2599        if let crate::replication::ReplicationRole::Replica { primary_addr } =
2600            runtime.inner.db.options().replication.role.clone()
2601        {
2602            let rt = runtime.clone();
2603            std::thread::Builder::new()
2604                .name("reddb-replica".into())
2605                .spawn(move || rt.run_replica_loop(primary_addr))
2606                .ok();
2607        }
2608
2609        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
2610        // boot stage above has completed (WAL replay, restore-from-
2611        // remote, replica-loop spawn). Health probes flip from 503 to
2612        // 200 here; shutdown begins from this state.
2613        runtime.inner.lifecycle.mark_ready();
2614
2615        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
2616        // Low-priority background ticker that drains the cache's
2617        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
2618        // so the thread exits cleanly when the runtime drops (≤50ms
2619        // latency between drop and exit). Materialized views without
2620        // a `REFRESH EVERY` clause stay on the manual-refresh path
2621        // and are skipped by `claim_due_at`, so the loop is a no-op
2622        // when no scheduled views exist.
2623        {
2624            let weak_inner = Arc::downgrade(&runtime.inner);
2625            std::thread::Builder::new()
2626                .name("reddb-mv-scheduler".into())
2627                .spawn(move || loop {
2628                    std::thread::sleep(std::time::Duration::from_millis(50));
2629                    let Some(inner) = weak_inner.upgrade() else {
2630                        break;
2631                    };
2632                    let rt = RedDBRuntime { inner };
2633                    rt.refresh_due_materialized_views();
2634                })
2635                .ok();
2636        }
2637
2638        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
2639        // Low-priority ticker that physically reclaims rows whose
2640        // timestamp has fallen beyond the retention window. Holds a
2641        // `Weak<RuntimeInner>` so the thread exits within one tick of
2642        // the runtime drop (graceful shutdown leaves storage consistent
2643        // because each tick goes through the standard DELETE path —
2644        // there is no half-finished mutation state to clean up). The
2645        // tick interval is intentionally longer than the MV scheduler
2646        // (500ms) because retention is order-of-seconds at minimum.
2647        {
2648            let weak_inner = Arc::downgrade(&runtime.inner);
2649            std::thread::Builder::new()
2650                .name("reddb-retention-sweeper".into())
2651                .spawn(move || loop {
2652                    std::thread::sleep(std::time::Duration::from_millis(500));
2653                    let Some(inner) = weak_inner.upgrade() else {
2654                        break;
2655                    };
2656                    let rt = RedDBRuntime { inner };
2657                    rt.sweep_retention_tick(
2658                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
2659                    );
2660                })
2661                .ok();
2662        }
2663
2664        Ok(runtime)
2665    }
2666
2667    fn rehydrate_snapshot_xid_floor(&self) {
2668        let store = self.inner.db.store();
2669        for collection in store.list_collections() {
2670            let Some(manager) = store.get_collection(&collection) else {
2671                continue;
2672            };
2673            for entity in manager.query_all(|_| true) {
2674                self.inner
2675                    .snapshot_manager
2676                    .observe_committed_xid(entity.xmin);
2677                self.inner
2678                    .snapshot_manager
2679                    .observe_committed_xid(entity.xmax);
2680            }
2681        }
2682    }
2683
2684    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
2685        let mut changed = false;
2686        for (name, model) in [
2687            ("red.config", crate::catalog::CollectionModel::Config),
2688            ("red.vault", crate::catalog::CollectionModel::Vault),
2689        ] {
2690            if self.inner.db.store().get_collection(name).is_none() {
2691                self.inner.db.store().get_or_create_collection(name);
2692                changed = true;
2693            }
2694            if self.inner.db.collection_contract(name).is_none() {
2695                self.inner
2696                    .db
2697                    .save_collection_contract(system_keyed_collection_contract(name, model))
2698                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
2699                changed = true;
2700            }
2701        }
2702        if changed {
2703            self.inner
2704                .db
2705                .persist_metadata()
2706                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2707        }
2708        Ok(())
2709    }
2710
2711    pub fn db(&self) -> Arc<RedDB> {
2712        Arc::clone(&self.inner.db)
2713    }
2714
2715    /// Direct access to the runtime's secondary-index store.
2716    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
2717    /// wire bulk) that need to push new rows through the per-index
2718    /// maintenance hook after `store.bulk_insert` returns.
2719    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
2720        &self.inner.index_store
2721    }
2722
2723    /// Apply a DDL event to the schema-vocabulary reverse index
2724    /// (issue #120). Called by DDL execution paths after the catalog
2725    /// mutation has succeeded so the index never holds entries for
2726    /// half-applied DDL.
2727    pub(crate) fn schema_vocabulary_apply(
2728        &self,
2729        event: crate::runtime::schema_vocabulary::DdlEvent,
2730    ) {
2731        self.inner.schema_vocabulary.write().on_ddl(event);
2732    }
2733
2734    /// Lookup `token` in the schema-vocabulary reverse index. Returns
2735    /// an owned `Vec<VocabHit>` because the underlying read lock
2736    /// cannot be borrowed across the call boundary; the slice from
2737    /// `SchemaVocabulary::lookup` is cloned per hit.
2738    pub fn schema_vocabulary_lookup(
2739        &self,
2740        token: &str,
2741    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
2742        self.inner.schema_vocabulary.read().lookup(token).to_vec()
2743    }
2744
2745    /// Inject an AuthStore into the runtime. Called by server boot
2746    /// after the vault has been bootstrapped, so that `Value::Secret`
2747    /// auto-encrypt/decrypt can reach the vault AES key.
2748    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
2749        *self.inner.auth_store.write() = Some(store);
2750    }
2751
2752    /// Snapshot the current AuthStore (if any). Used by the wire listener
2753    /// to validate bearer tokens issued via HTTP `/auth/login`.
2754    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
2755        self.inner.auth_store.read().clone()
2756    }
2757
2758    /// Read a vault KV secret from the configured AuthStore, if present.
2759    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
2760        self.inner
2761            .auth_store
2762            .read()
2763            .as_ref()
2764            .and_then(|store| store.vault_kv_get(key))
2765    }
2766
2767    /// Write a vault KV secret and fail if the encrypted vault write is
2768    /// unavailable or cannot be made durable.
2769    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
2770        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
2771            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
2772        })?;
2773        store
2774            .vault_kv_try_set(key, value)
2775            .map_err(|err| RedDBError::Query(err.to_string()))
2776    }
2777
2778    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
2779    /// wire transports try OAuth JWT validation before falling back to
2780    /// the local AuthStore lookup. Pass `None` to disable.
2781    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
2782        *self.inner.oauth_validator.write() = validator;
2783    }
2784
2785    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
2786    /// Hot path: called per HTTP request when an Authorization header
2787    /// is present, so we hand back a cheap Arc clone.
2788    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
2789        self.inner.oauth_validator.read().clone()
2790    }
2791
2792    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
2793    /// store is wired and a key has been generated. Used by the
2794    /// `Value::Secret` encrypt/decrypt pipeline.
2795    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
2796        let guard = self.inner.auth_store.read();
2797        guard.as_ref().and_then(|s| s.vault_secret_key())
2798    }
2799
2800    /// Resolve a boolean flag from `red_config`. Defaults to `default`
2801    /// when the key is missing or not coercible. If the same key has
2802    /// been written multiple times (SET CONFIG appends new rows), the
2803    /// most recent entity wins. Env-var overrides
2804    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
2805    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
2806        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2807            if let Some(crate::storage::schema::Value::Boolean(b)) =
2808                crate::runtime::config_overlay::coerce_env_value(key, raw)
2809            {
2810                return b;
2811            }
2812        }
2813        let store = self.inner.db.store();
2814        let Some(manager) = store.get_collection("red_config") else {
2815            return default;
2816        };
2817        let mut result = default;
2818        let mut latest_id: u64 = 0;
2819        manager.for_each_entity(|entity| {
2820            if let Some(row) = entity.data.as_row() {
2821                let entry_key = row.get_field("key").and_then(|v| match v {
2822                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2823                    _ => None,
2824                });
2825                if entry_key == Some(key) {
2826                    let id = entity.id.raw();
2827                    if id >= latest_id {
2828                        latest_id = id;
2829                        result = match row.get_field("value") {
2830                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
2831                            Some(crate::storage::schema::Value::Text(s)) => {
2832                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
2833                            }
2834                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
2835                            _ => default,
2836                        };
2837                    }
2838                }
2839            }
2840            true
2841        });
2842        result
2843    }
2844
2845    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
2846        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2847            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
2848                crate::runtime::config_overlay::coerce_env_value(key, raw)
2849            {
2850                return n;
2851            }
2852        }
2853        let store = self.inner.db.store();
2854        let Some(manager) = store.get_collection("red_config") else {
2855            return default;
2856        };
2857        let mut result = default;
2858        let mut latest_id: u64 = 0;
2859        manager.for_each_entity(|entity| {
2860            if let Some(row) = entity.data.as_row() {
2861                let entry_key = row.get_field("key").and_then(|v| match v {
2862                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2863                    _ => None,
2864                });
2865                if entry_key == Some(key) {
2866                    let id = entity.id.raw();
2867                    if id >= latest_id {
2868                        latest_id = id;
2869                        result = match row.get_field("value") {
2870                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
2871                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
2872                            Some(crate::storage::schema::Value::Text(s)) => {
2873                                s.parse::<u64>().unwrap_or(default)
2874                            }
2875                            _ => default,
2876                        };
2877                    }
2878                }
2879            }
2880            true
2881        });
2882        result
2883    }
2884
2885    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
2886        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2887            if let Ok(n) = raw.parse::<f64>() {
2888                return n;
2889            }
2890        }
2891        let store = self.inner.db.store();
2892        let Some(manager) = store.get_collection("red_config") else {
2893            return default;
2894        };
2895        let mut result = default;
2896        let mut latest_id: u64 = 0;
2897        manager.for_each_entity(|entity| {
2898            if let Some(row) = entity.data.as_row() {
2899                let entry_key = row.get_field("key").and_then(|v| match v {
2900                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2901                    _ => None,
2902                });
2903                if entry_key == Some(key) {
2904                    let id = entity.id.raw();
2905                    if id >= latest_id {
2906                        latest_id = id;
2907                        result = match row.get_field("value") {
2908                            Some(crate::storage::schema::Value::Float(n)) => *n,
2909                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
2910                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
2911                            Some(crate::storage::schema::Value::Text(s)) => {
2912                                s.parse::<f64>().unwrap_or(default)
2913                            }
2914                            _ => default,
2915                        };
2916                    }
2917                }
2918            }
2919            true
2920        });
2921        result
2922    }
2923
2924    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
2925        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2926            return raw.clone();
2927        }
2928        let store = self.inner.db.store();
2929        let Some(manager) = store.get_collection("red_config") else {
2930            return default.to_string();
2931        };
2932        let mut result = default.to_string();
2933        let mut latest_id: u64 = 0;
2934        manager.for_each_entity(|entity| {
2935            if let Some(row) = entity.data.as_row() {
2936                let entry_key = row.get_field("key").and_then(|v| match v {
2937                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2938                    _ => None,
2939                });
2940                if entry_key == Some(key) {
2941                    let id = entity.id.raw();
2942                    if id >= latest_id {
2943                        latest_id = id;
2944                        if let Some(crate::storage::schema::Value::Text(value)) =
2945                            row.get_field("value")
2946                        {
2947                            result = value.to_string();
2948                        }
2949                    }
2950                }
2951            }
2952            true
2953        });
2954        result
2955    }
2956
2957    fn latest_metadata_for(
2958        &self,
2959        collection: &str,
2960        entity_id: u64,
2961    ) -> Option<crate::serde_json::Value> {
2962        self.inner
2963            .db
2964            .store()
2965            .get_metadata(collection, EntityId::new(entity_id))
2966            .map(|metadata| metadata_to_json(&metadata))
2967    }
2968
2969    fn persist_replica_lsn(&self, lsn: u64) {
2970        self.inner.db.store().set_config_tree(
2971            "red.replication",
2972            &crate::json!({
2973                "last_applied_lsn": lsn
2974            }),
2975        );
2976    }
2977
2978    fn persist_replication_health(
2979        &self,
2980        state: &str,
2981        last_error: &str,
2982        primary_lsn: Option<u64>,
2983        oldest_available_lsn: Option<u64>,
2984    ) {
2985        self.inner.db.store().set_config_tree(
2986            "red.replication",
2987            &crate::json!({
2988                "state": state,
2989                "last_error": last_error,
2990                "last_seen_primary_lsn": primary_lsn.unwrap_or(0),
2991                "last_seen_oldest_lsn": oldest_available_lsn.unwrap_or(0),
2992                "updated_at_unix_ms": SystemTime::now()
2993                    .duration_since(UNIX_EPOCH)
2994                    .unwrap_or_default()
2995                    .as_millis() as u64
2996            }),
2997        );
2998    }
2999
3000    /// Whether `SECRET('...')` literals should be encrypted with the
3001    /// vault AES key on INSERT. Default `true`.
3002    pub(crate) fn secret_auto_encrypt(&self) -> bool {
3003        self.config_bool("red.config.secret.auto_encrypt", true)
3004    }
3005
3006    /// Whether `Value::Secret` columns should be decrypted back to
3007    /// plaintext on SELECT when the vault is unsealed. Default `true`.
3008    /// Turning this off keeps secrets masked as `***` even while the
3009    /// vault is open — useful for audit trails or read-only exports.
3010    pub(crate) fn secret_auto_decrypt(&self) -> bool {
3011        self.config_bool("red.config.secret.auto_decrypt", true)
3012    }
3013
3014    /// Walk every record in `result` and swap `Value::Secret(bytes)`
3015    /// for the decrypted plaintext when the runtime has the vault
3016    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
3017    /// key is missing, the vault is sealed, or auto_decrypt is off,
3018    /// secrets are left as `Value::Secret` which every formatter
3019    /// (Display, JSON) already masks as `***`.
3020    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
3021        if !self.secret_auto_decrypt() {
3022            return;
3023        }
3024        let Some(key) = self.secret_aes_key() else {
3025            return;
3026        };
3027        for record in result.result.records.iter_mut() {
3028            for value in record.values_mut() {
3029                if let Value::Secret(ref bytes) = value {
3030                    if let Some(plain) =
3031                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
3032                    {
3033                        if let Ok(text) = String::from_utf8(plain) {
3034                            *value = Value::text(text);
3035                        }
3036                    }
3037                }
3038            }
3039        }
3040    }
3041
3042    /// Emit a CDC change event and replicate to WAL buffer.
3043    /// Create a `MutationEngine` bound to this runtime.
3044    ///
3045    /// The engine is cheap to construct (no allocation) and should be
3046    /// dropped after `apply` returns. Use this from application-layer
3047    /// `create_row` / `create_rows_batch` instead of calling
3048    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
3049    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
3050        crate::runtime::mutation::MutationEngine::new(self)
3051    }
3052
3053    /// Public-mutation gate snapshot (PLAN.md W1).
3054    ///
3055    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
3056    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
3057    /// maintenance, serverless lifecycle) call `check_write` before
3058    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
3059    /// instance running as a replica or with `options.read_only =
3060    /// true`. The replica internal logical-WAL apply path reaches into
3061    /// the store directly and never calls this method, so legitimate
3062    /// replica catch-up still works.
3063    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
3064        self.inner.write_gate.check(kind)
3065    }
3066
3067    /// Read-only handle to the gate, useful for transports that want
3068    /// to surface the policy in health/status output without taking on
3069    /// a dependency on the concrete enum.
3070    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
3071        &self.inner.write_gate
3072    }
3073
3074    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
3075    /// admin/shutdown, and signal handlers consult this single
3076    /// state machine.
3077    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
3078        &self.inner.lifecycle
3079    }
3080
3081    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
3082    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
3083        &self.inner.resource_limits
3084    }
3085
3086    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
3087    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
3088        &self.inner.audit_log
3089    }
3090
3091    /// Shared `Arc` to the audit logger — used by collaborators (the
3092    /// lease lifecycle, future request-context plumbing) that need to
3093    /// keep the logger alive past the runtime's stack frame.
3094    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
3095        Arc::clone(&self.inner.audit_log)
3096    }
3097
3098    /// Slice 10 of issue #527 — shared queue telemetry counters
3099    /// (delivered/acked/nacked). Cloned by `queue_delivery.rs` on
3100    /// each transition.
3101    pub(crate) fn queue_telemetry(
3102        &self,
3103    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
3104        &self.inner.queue_telemetry
3105    }
3106
3107    /// Snapshots of the queue telemetry counters in label-deterministic
3108    /// order for `/metrics` rendering and the integration test.
3109    pub fn queue_telemetry_snapshot(
3110        &self,
3111    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3112        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3113            delivered: self.inner.queue_telemetry.delivered_snapshot(),
3114            acked: self.inner.queue_telemetry.acked_snapshot(),
3115            nacked: self.inner.queue_telemetry.nacked_snapshot(),
3116        }
3117    }
3118
3119    /// Slice 10 of issue #527 — render-time scan of pending entries
3120    /// per (queue, group) for the `queue_pending_gauge` exposition.
3121    /// Walks `red_queue_meta` live so the gauge cannot drift from
3122    /// the source of truth.
3123    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
3124        let store = self.inner.db.store();
3125        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
3126            .into_iter()
3127            .collect()
3128    }
3129
3130    /// Shared `Arc` to the write gate. Same rationale as
3131    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
3132    /// thread) need a clone-cheap handle they can move into a
3133    /// background thread.
3134    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
3135        Arc::clone(&self.inner.write_gate)
3136    }
3137
3138    /// Serverless writer-lease state machine. `None` when the operator
3139    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
3140    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3141        self.inner.lease_lifecycle.get()
3142    }
3143
3144    /// Install the lease lifecycle. Idempotent; subsequent calls
3145    /// return the previously stored value untouched.
3146    pub fn set_lease_lifecycle(
3147        &self,
3148        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
3149    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3150        self.inner.lease_lifecycle.set(lifecycle)
3151    }
3152
3153    /// Reject the call when the requested batch size exceeds
3154    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
3155    /// shaped so the HTTP layer can map it to 413 Payload Too
3156    /// Large (PLAN.md Phase 4.1).
3157    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
3158        if self.inner.resource_limits.batch_size_exceeded(requested) {
3159            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
3160            return Err(RedDBError::QuotaExceeded(format!(
3161                "max_batch_size:{requested}:{max}"
3162            )));
3163        }
3164        Ok(())
3165    }
3166
3167    /// Reject the call when the local DB file exceeds
3168    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
3169    /// the cost is a single `stat()` syscall, negligible against the
3170    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
3171    /// for HTTP 507 Insufficient Storage.
3172    pub fn check_db_size(&self) -> RedDBResult<()> {
3173        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
3174            return Ok(());
3175        };
3176        if limit == 0 {
3177            return Ok(());
3178        }
3179        let Some(path) = self.inner.db.path() else {
3180            return Ok(());
3181        };
3182        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
3183        if current > limit {
3184            return Err(RedDBError::QuotaExceeded(format!(
3185                "max_db_size_bytes:{current}:{limit}"
3186            )));
3187        }
3188        Ok(())
3189    }
3190
3191    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
3192    ///
3193    /// Steps, in order, all idempotent across re-entrant calls:
3194    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
3195    ///      observe `Stopped` after first finishes).
3196    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
3197    ///      every acked write is durable on disk.
3198    ///   3. If `backup_on_shutdown == true` and a remote backend is
3199    ///      configured, run a synchronous `trigger_backup()` so the
3200    ///      remote head reflects the final state.
3201    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
3202    ///      return the cached report without re-running anything.
3203    ///
3204    /// On any error, the runtime is still marked `Stopped` so the
3205    /// process can exit; the caller logs the error context but does
3206    /// not retry the same shutdown — the operator can inspect the
3207    /// report fields to see which step failed.
3208    pub fn graceful_shutdown(
3209        &self,
3210        backup_on_shutdown: bool,
3211    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
3212        if !self.inner.lifecycle.begin_shutdown() {
3213            // Someone else already shut down (or is in flight). Return
3214            // the cached report so the HTTP caller and SIGTERM handler
3215            // get the same idempotent answer.
3216            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
3217        }
3218
3219        let started_ms = std::time::SystemTime::now()
3220            .duration_since(std::time::UNIX_EPOCH)
3221            .map(|d| d.as_millis() as u64)
3222            .unwrap_or(0);
3223        let mut report = crate::runtime::lifecycle::ShutdownReport {
3224            started_at_ms: started_ms,
3225            ..Default::default()
3226        };
3227
3228        // Flush WAL + run any pending checkpoint. Local fsync is
3229        // unconditional — even a lease-lost replica needs its WAL on
3230        // disk before exit so a future restore has the latest tail.
3231        // The remote upload is gated separately so a lost-lease writer
3232        // doesn't clobber the new holder's state on its way out.
3233        let flush_res = self.inner.db.flush_local_only();
3234        report.flushed_wal = flush_res.is_ok();
3235        report.final_checkpoint = flush_res.is_ok();
3236        if let Err(err) = &flush_res {
3237            tracing::error!(
3238                target: "reddb::lifecycle",
3239                error = %err,
3240                "graceful_shutdown: local flush failed"
3241            );
3242        } else if let Err(lease_err) =
3243            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
3244        {
3245            tracing::warn!(
3246                target: "reddb::serverless::lease",
3247                error = %lease_err,
3248                "graceful_shutdown: remote upload skipped — lease not held"
3249            );
3250        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
3251            tracing::error!(
3252                target: "reddb::lifecycle",
3253                error = %err,
3254                "graceful_shutdown: remote upload failed"
3255            );
3256        }
3257
3258        // Optional final backup. Skipped silently when no remote
3259        // backend is configured — `trigger_backup()` returns Err
3260        // anyway in that case, but logging it as a shutdown failure
3261        // would be misleading on a standalone (no-backend) runtime.
3262        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
3263            // The trigger_backup gate now reads `WriteKind::Backup`,
3264            // which a replica/read_only instance refuses. That's
3265            // intentional — replicas don't drive backups; only the
3266            // primary does. We still want shutdown to flush its WAL
3267            // even if the backup branch is gated off.
3268            match self.trigger_backup() {
3269                Ok(result) => {
3270                    report.backup_uploaded = result.uploaded;
3271                }
3272                Err(err) => {
3273                    tracing::warn!(
3274                        target: "reddb::lifecycle",
3275                        error = %err,
3276                        "graceful_shutdown: final backup skipped"
3277                    );
3278                }
3279            }
3280        }
3281
3282        let completed_ms = std::time::SystemTime::now()
3283            .duration_since(std::time::UNIX_EPOCH)
3284            .map(|d| d.as_millis() as u64)
3285            .unwrap_or(started_ms);
3286        report.completed_at_ms = completed_ms;
3287        report.duration_ms = completed_ms.saturating_sub(started_ms);
3288
3289        self.inner.lifecycle.finish_shutdown(report.clone());
3290        Ok(report)
3291    }
3292
3293    /// Emit a CDC record without invalidating the result cache.
3294    ///
3295    /// Used by `MutationEngine::append_batch` which calls
3296    /// `invalidate_result_cache` once for the whole batch before this
3297    /// loop, avoiding N write-lock acquisitions.
3298    pub(crate) fn cdc_emit_no_cache_invalidate(
3299        &self,
3300        operation: crate::replication::cdc::ChangeOperation,
3301        collection: &str,
3302        entity_id: u64,
3303        entity_kind: &str,
3304    ) -> u64 {
3305        let lsn = self
3306            .inner
3307            .cdc
3308            .emit(operation, collection, entity_id, entity_kind);
3309
3310        // Append to logical WAL replication buffer (if primary mode)
3311        if let Some(ref primary) = self.inner.db.replication {
3312            let store = self.inner.db.store();
3313            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3314                None
3315            } else {
3316                store.get(collection, EntityId::new(entity_id))
3317            };
3318            let record = ChangeRecord {
3319                lsn,
3320                timestamp: SystemTime::now()
3321                    .duration_since(UNIX_EPOCH)
3322                    .unwrap_or_default()
3323                    .as_millis() as u64,
3324                operation,
3325                collection: collection.to_string(),
3326                entity_id,
3327                entity_kind: entity_kind.to_string(),
3328                entity_bytes: entity
3329                    .as_ref()
3330                    .map(|e| UnifiedStore::serialize_entity(e, store.format_version())),
3331                metadata: self.latest_metadata_for(collection, entity_id),
3332            };
3333            let encoded = record.encode();
3334            primary.wal_buffer.append(record.lsn, encoded.clone());
3335            if let Some(spool) = &primary.logical_wal_spool {
3336                let _ = spool.append(record.lsn, &encoded);
3337            }
3338        }
3339        lsn
3340    }
3341
3342    pub(crate) fn cdc_emit_insert_batch_no_cache_invalidate(
3343        &self,
3344        collection: &str,
3345        ids: &[EntityId],
3346        entity_kind: &str,
3347    ) -> Vec<u64> {
3348        if ids.is_empty() {
3349            return Vec::new();
3350        }
3351
3352        // Without logical replication, CDC only needs the in-memory event
3353        // ring. Reserve all LSNs and push the batch under one mutex instead
3354        // of taking the ring lock once per inserted row.
3355        if self.inner.db.replication.is_none() {
3356            return self.inner.cdc.emit_batch_same_collection(
3357                crate::replication::cdc::ChangeOperation::Insert,
3358                collection,
3359                entity_kind,
3360                ids.iter().map(|id| id.raw()),
3361            );
3362        }
3363
3364        // Replication needs one logical-WAL record per entity with the
3365        // serialized entity bytes, so keep the existing per-row path.
3366        ids.iter()
3367            .map(|id| {
3368                self.cdc_emit_no_cache_invalidate(
3369                    crate::replication::cdc::ChangeOperation::Insert,
3370                    collection,
3371                    id.raw(),
3372                    entity_kind,
3373                )
3374            })
3375            .collect()
3376    }
3377
3378    pub fn cdc_emit(
3379        &self,
3380        operation: crate::replication::cdc::ChangeOperation,
3381        collection: &str,
3382        entity_id: u64,
3383        entity_kind: &str,
3384    ) -> u64 {
3385        let lsn = self
3386            .inner
3387            .cdc
3388            .emit(operation, collection, entity_id, entity_kind);
3389        // Perf: prior to this we called `invalidate_result_cache()`
3390        // which wipes EVERY cached query, across every table, under
3391        // a write lock — turning each INSERT into a serialisation
3392        // point for all readers. Swap to the per-table variant so
3393        // unrelated query caches survive.
3394        self.invalidate_result_cache_for_table(collection);
3395
3396        // Append to logical WAL replication buffer (if primary mode)
3397        if let Some(ref primary) = self.inner.db.replication {
3398            let store = self.inner.db.store();
3399            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3400                None
3401            } else {
3402                store.get(collection, EntityId::new(entity_id))
3403            };
3404            let record = ChangeRecord {
3405                lsn,
3406                timestamp: SystemTime::now()
3407                    .duration_since(UNIX_EPOCH)
3408                    .unwrap_or_default()
3409                    .as_millis() as u64,
3410                operation,
3411                collection: collection.to_string(),
3412                entity_id,
3413                entity_kind: entity_kind.to_string(),
3414                entity_bytes: entity
3415                    .as_ref()
3416                    .map(|entity| UnifiedStore::serialize_entity(entity, store.format_version())),
3417                metadata: self.latest_metadata_for(collection, entity_id),
3418            };
3419            let encoded = record.encode();
3420            primary.wal_buffer.append(record.lsn, encoded.clone());
3421            if let Some(spool) = &primary.logical_wal_spool {
3422                let _ = spool.append(record.lsn, &encoded);
3423            }
3424        }
3425        lsn
3426    }
3427
3428    pub(crate) fn cdc_emit_kv(
3429        &self,
3430        operation: crate::replication::cdc::ChangeOperation,
3431        collection: &str,
3432        key: &str,
3433        entity_id: u64,
3434        before: Option<crate::json::Value>,
3435        after: Option<crate::json::Value>,
3436    ) -> u64 {
3437        let lsn = self
3438            .inner
3439            .cdc
3440            .emit_kv(operation, collection, key, entity_id, before, after);
3441        self.inner.kv_stats.incr_watch_events_emitted();
3442        self.invalidate_result_cache_for_table(collection);
3443        lsn
3444    }
3445
3446    pub(crate) fn record_kv_watch_event(
3447        &self,
3448        operation: crate::replication::cdc::ChangeOperation,
3449        collection: &str,
3450        key: &str,
3451        entity_id: u64,
3452        before: Option<crate::json::Value>,
3453        after: Option<crate::json::Value>,
3454    ) {
3455        if self.current_xid().is_some() {
3456            let conn_id = current_connection_id();
3457            let event = crate::replication::cdc::KvWatchEvent {
3458                collection: collection.to_string(),
3459                key: key.to_string(),
3460                op: operation,
3461                before,
3462                after,
3463                lsn: 0,
3464                committed_at: 0,
3465                dropped_event_count: 0,
3466            };
3467            self.inner
3468                .pending_kv_watch_events
3469                .write()
3470                .entry(conn_id)
3471                .or_default()
3472                .push(event);
3473            return;
3474        }
3475
3476        self.cdc_emit_kv(operation, collection, key, entity_id, before, after);
3477    }
3478
3479    pub(crate) fn cdc_emit_prebuilt(
3480        &self,
3481        operation: crate::replication::cdc::ChangeOperation,
3482        collection: &str,
3483        entity: &UnifiedEntity,
3484        entity_kind: &str,
3485        metadata: Option<&crate::storage::Metadata>,
3486        invalidate_cache: bool,
3487    ) -> u64 {
3488        self.cdc_emit_prebuilt_with_columns(
3489            operation,
3490            collection,
3491            entity,
3492            entity_kind,
3493            metadata,
3494            invalidate_cache,
3495            None,
3496        )
3497    }
3498
3499    /// `cdc_emit_prebuilt` plus the list of column names whose values
3500    /// changed on this update. Callers that have already computed a
3501    /// `RowDamageVector` pass it here so downstream CDC consumers can
3502    /// filter events by touched column without re-diffing.
3503    /// `changed_columns` is only meaningful for `Update` operations —
3504    /// insert and delete events ignore it.
3505    pub(crate) fn cdc_emit_prebuilt_with_columns(
3506        &self,
3507        operation: crate::replication::cdc::ChangeOperation,
3508        collection: &str,
3509        entity: &UnifiedEntity,
3510        entity_kind: &str,
3511        metadata: Option<&crate::storage::Metadata>,
3512        invalidate_cache: bool,
3513        changed_columns: Option<Vec<String>>,
3514    ) -> u64 {
3515        if invalidate_cache {
3516            self.invalidate_result_cache();
3517        }
3518
3519        let public_id = entity.logical_id().raw();
3520        let lsn = self.inner.cdc.emit_with_columns(
3521            operation,
3522            collection,
3523            public_id,
3524            entity_kind,
3525            changed_columns,
3526        );
3527
3528        if let Some(ref primary) = self.inner.db.replication {
3529            let store = self.inner.db.store();
3530            let record = ChangeRecord {
3531                lsn,
3532                timestamp: SystemTime::now()
3533                    .duration_since(UNIX_EPOCH)
3534                    .unwrap_or_default()
3535                    .as_millis() as u64,
3536                operation,
3537                collection: collection.to_string(),
3538                entity_id: entity.id.raw(),
3539                entity_kind: entity_kind.to_string(),
3540                entity_bytes: Some(UnifiedStore::serialize_entity(
3541                    entity,
3542                    store.format_version(),
3543                )),
3544                metadata: metadata
3545                    .map(metadata_to_json)
3546                    .or_else(|| self.latest_metadata_for(collection, entity.id.raw())),
3547            };
3548            let encoded = record.encode();
3549            primary.wal_buffer.append(record.lsn, encoded.clone());
3550            if let Some(spool) = &primary.logical_wal_spool {
3551                let _ = spool.append(record.lsn, &encoded);
3552            }
3553        }
3554
3555        lsn
3556    }
3557
3558    pub(crate) fn cdc_emit_prebuilt_batch<'a, I>(
3559        &self,
3560        operation: crate::replication::cdc::ChangeOperation,
3561        entity_kind: &str,
3562        items: I,
3563        invalidate_cache: bool,
3564    ) where
3565        I: IntoIterator<
3566            Item = (
3567                &'a str,
3568                &'a UnifiedEntity,
3569                Option<&'a crate::storage::Metadata>,
3570            ),
3571        >,
3572    {
3573        let items: Vec<(&str, &UnifiedEntity, Option<&crate::storage::Metadata>)> =
3574            items.into_iter().collect();
3575        if items.is_empty() {
3576            return;
3577        }
3578
3579        if invalidate_cache {
3580            self.invalidate_result_cache();
3581        }
3582
3583        for (collection, entity, metadata) in items {
3584            self.cdc_emit_prebuilt(operation, collection, entity, entity_kind, metadata, false);
3585        }
3586    }
3587
3588    fn run_replica_loop(&self, primary_addr: String) {
3589        let endpoint = if primary_addr.starts_with("http") {
3590            primary_addr
3591        } else {
3592            format!("http://{primary_addr}")
3593        };
3594        let poll_ms = self.inner.db.options().replication.poll_interval_ms;
3595        let max_count = self.inner.db.options().replication.max_batch_size;
3596        let mut since_lsn = self.config_u64("red.replication.last_applied_lsn", 0);
3597
3598        let runtime = match tokio::runtime::Builder::new_current_thread()
3599            .enable_all()
3600            .build()
3601        {
3602            Ok(runtime) => runtime,
3603            Err(_) => return,
3604        };
3605
3606        runtime.block_on(async move {
3607            use crate::grpc::proto::red_db_client::RedDbClient;
3608            use crate::grpc::proto::JsonPayloadRequest;
3609
3610            let mut client = loop {
3611                match RedDbClient::connect(endpoint.clone()).await {
3612                    Ok(client) => {
3613                        self.persist_replication_health("connecting", "", None, None);
3614                        break client;
3615                    }
3616                    Err(_) => {
3617                        self.persist_replication_health(
3618                            "connecting",
3619                            "waiting for primary connection",
3620                            None,
3621                            None,
3622                        );
3623                        std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)))
3624                    }
3625                }
3626            };
3627
3628            // PLAN.md Phase 11.5 — stateful applier guards LSN
3629            // monotonicity across pulls. Seed with the persisted
3630            // `last_applied_lsn` so reboots don't lose the chain
3631            // pointer.
3632            let applier = crate::replication::logical::LogicalChangeApplier::new(since_lsn);
3633
3634            loop {
3635                let payload = crate::json!({
3636                    "since_lsn": since_lsn,
3637                    "max_count": max_count
3638                });
3639                let request = tonic::Request::new(JsonPayloadRequest {
3640                    payload_json: crate::json::to_string(&payload)
3641                        .unwrap_or_else(|_| "{}".to_string()),
3642                });
3643
3644                if let Ok(response) = client.pull_wal_records(request).await {
3645                    if let Ok(value) =
3646                        crate::json::from_str::<crate::json::Value>(&response.into_inner().payload)
3647                    {
3648                        let current_lsn =
3649                            value.get("current_lsn").and_then(crate::json::Value::as_u64);
3650                        let oldest_available_lsn = value
3651                            .get("oldest_available_lsn")
3652                            .and_then(crate::json::Value::as_u64);
3653                        if since_lsn > 0
3654                            && oldest_available_lsn
3655                                .map(|oldest| oldest > since_lsn.saturating_add(1))
3656                                .unwrap_or(false)
3657                        {
3658                            self.persist_replication_health(
3659                                "stalled_gap",
3660                                "replica is behind the oldest logical WAL available on primary; re-bootstrap required",
3661                                current_lsn,
3662                                oldest_available_lsn,
3663                            );
3664                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
3665                            continue;
3666                        }
3667                        if let Some(records) =
3668                            value.get("records").and_then(crate::json::Value::as_array)
3669                        {
3670                            for record in records {
3671                                let Some(data_hex) =
3672                                    record.get("data").and_then(crate::json::Value::as_str)
3673                                else {
3674                                    continue;
3675                                };
3676                                let Ok(data) = hex::decode(data_hex) else {
3677                                    self.inner.replica_apply_metrics.record(
3678                                        crate::replication::logical::ApplyErrorKind::Decode,
3679                                    );
3680                                    self.persist_replication_health(
3681                                        "apply_error",
3682                                        "failed to decode WAL record hex payload",
3683                                        current_lsn,
3684                                        oldest_available_lsn,
3685                                    );
3686                                    continue;
3687                                };
3688                                let Ok(change) = ChangeRecord::decode(&data) else {
3689                                    self.inner.replica_apply_metrics.record(
3690                                        crate::replication::logical::ApplyErrorKind::Decode,
3691                                    );
3692                                    self.persist_replication_health(
3693                                        "apply_error",
3694                                        "failed to decode logical WAL record",
3695                                        current_lsn,
3696                                        oldest_available_lsn,
3697                                    );
3698                                    continue;
3699                                };
3700                                match applier.apply(
3701                                    self.inner.db.as_ref(),
3702                                    &change,
3703                                    ApplyMode::Replica,
3704                                ) {
3705                                    Ok(crate::replication::logical::ApplyOutcome::Applied) => {
3706                                        self.invalidate_result_cache_for_table(&change.collection);
3707                                        since_lsn = since_lsn.max(change.lsn);
3708                                        self.persist_replica_lsn(since_lsn);
3709                                    }
3710                                    Ok(_) => {
3711                                        // Idempotent / Skipped: no advance, no error.
3712                                    }
3713                                    Err(err) => {
3714                                        self.inner.replica_apply_metrics.record(err.kind());
3715                                        // Issue #205 — emit operator-grade event
3716                                        // for the two replication-fatal kinds. `Gap`
3717                                        // / `Apply` / `Decode` already persist via
3718                                        // `persist_replication_health`; the
3719                                        // OperatorEvent variants only cover the
3720                                        // two "stream is broken" / "follower
3721                                        // diverged" conditions an operator must act
3722                                        // on out-of-band.
3723                                        match &err {
3724                                            crate::replication::logical::LogicalApplyError::Divergence { lsn, expected: _, got: _ } => {
3725                                                crate::telemetry::operator_event::OperatorEvent::Divergence {
3726                                                    peer: "primary".to_string(),
3727                                                    leader_lsn: *lsn,
3728                                                    follower_lsn: since_lsn,
3729                                                }
3730                                                .emit_global();
3731                                            }
3732                                            crate::replication::logical::LogicalApplyError::Gap { last, next } => {
3733                                                crate::telemetry::operator_event::OperatorEvent::ReplicationBroken {
3734                                                    peer: "primary".to_string(),
3735                                                    reason: format!("stalled gap last={last} next={next}"),
3736                                                }
3737                                                .emit_global();
3738                                            }
3739                                            _ => {}
3740                                        }
3741                                        let kind = match &err {
3742                                            crate::replication::logical::LogicalApplyError::Gap { .. } => "stalled_gap",
3743                                            crate::replication::logical::LogicalApplyError::Divergence { .. } => "divergence",
3744                                            _ => "apply_error",
3745                                        };
3746                                        self.persist_replication_health(
3747                                            kind,
3748                                            &format!("replica apply rejected: {err}"),
3749                                            current_lsn,
3750                                            oldest_available_lsn,
3751                                        );
3752                                        // Stop applying this batch. The
3753                                        // outer loop will retry on next
3754                                        // pull, which on a real Gap will
3755                                        // not magically heal — operator
3756                                        // must rebootstrap. For
3757                                        // Divergence, we explicitly do
3758                                        // not advance; this keeps the
3759                                        // replica visibly unhealthy
3760                                        // instead of silently swallowing
3761                                        // corruption.
3762                                        break;
3763                                    }
3764                                }
3765                            }
3766                        }
3767                        self.persist_replication_health(
3768                            "healthy",
3769                            "",
3770                            current_lsn,
3771                            oldest_available_lsn,
3772                        );
3773                    } else {
3774                        self.persist_replication_health(
3775                            "apply_error",
3776                            "failed to parse pull_wal_records response",
3777                            None,
3778                            None,
3779                        );
3780                    }
3781                } else {
3782                    self.persist_replication_health(
3783                        "connecting",
3784                        "primary pull_wal_records request failed",
3785                        None,
3786                        None,
3787                    );
3788                }
3789
3790                std::thread::sleep(std::time::Duration::from_millis(poll_ms));
3791            }
3792        });
3793    }
3794
3795    /// Poll CDC events since a given LSN.
3796    pub fn cdc_poll(
3797        &self,
3798        since_lsn: u64,
3799        max_count: usize,
3800    ) -> Vec<crate::replication::cdc::ChangeEvent> {
3801        self.inner.cdc.poll(since_lsn, max_count)
3802    }
3803
3804    /// PLAN.md Phase 11.4 — current CDC LSN. Public mutation
3805    /// surfaces (HTTP query, gRPC entity ops) call this immediately
3806    /// after a successful write to feed `enforce_commit_policy`.
3807    pub fn cdc_current_lsn(&self) -> u64 {
3808        self.inner.cdc.current_lsn()
3809    }
3810
3811    pub fn kv_watch_events_since(
3812        &self,
3813        collection: &str,
3814        key: &str,
3815        since_lsn: u64,
3816        max_count: usize,
3817    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3818        self.inner
3819            .cdc
3820            .poll(since_lsn, max_count)
3821            .into_iter()
3822            .filter_map(|event| event.kv)
3823            .filter(|event| event.collection == collection && event.key == key)
3824            .collect()
3825    }
3826
3827    pub fn kv_watch_events_since_prefix(
3828        &self,
3829        collection: &str,
3830        prefix: &str,
3831        since_lsn: u64,
3832        max_count: usize,
3833    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3834        self.inner
3835            .cdc
3836            .poll(since_lsn, max_count)
3837            .into_iter()
3838            .filter_map(|event| event.kv)
3839            .filter(|event| event.collection == collection && event.key.starts_with(prefix))
3840            .collect()
3841    }
3842
3843    pub(crate) fn kv_watch_subscribe<'a>(
3844        &'a self,
3845        collection: impl Into<String>,
3846        key: impl Into<String>,
3847        from_lsn: Option<u64>,
3848    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
3849        crate::runtime::kv_watch::KvWatchStream::subscribe(
3850            &self.inner.cdc,
3851            &self.inner.kv_stats,
3852            collection,
3853            key,
3854            from_lsn,
3855            self.kv_watch_idle_timeout_ms(),
3856        )
3857    }
3858
3859    pub(crate) fn kv_watch_subscribe_prefix<'a>(
3860        &'a self,
3861        collection: impl Into<String>,
3862        prefix: impl Into<String>,
3863        from_lsn: Option<u64>,
3864    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
3865        crate::runtime::kv_watch::KvWatchStream::subscribe_prefix(
3866            &self.inner.cdc,
3867            &self.inner.kv_stats,
3868            collection,
3869            prefix,
3870            from_lsn,
3871            self.kv_watch_idle_timeout_ms(),
3872        )
3873    }
3874
3875    pub(crate) fn kv_watch_idle_timeout_ms(&self) -> u64 {
3876        self.config_u64("red.config.kv.watch.idle_timeout_ms", 60_000)
3877    }
3878
3879    /// Get backup scheduler status.
3880    pub fn backup_status(&self) -> crate::replication::scheduler::BackupStatus {
3881        self.inner.backup_scheduler.status()
3882    }
3883
3884    /// Borrow the runtime's result Blob Cache.
3885    ///
3886    /// Wired for the `/admin/blob_cache/sweep` and
3887    /// `/admin/blob_cache/flush_namespace` HTTP handlers (issue #148
3888    /// follow-up): both delegate to
3889    /// `crate::storage::cache::sweeper::BlobCacheSweeper`, which takes a
3890    /// `&BlobCache`. Also used by `trigger_backup` when
3891    /// `red.config.backup.include_blob_cache=true` to locate the L2
3892    /// directory for archival.
3893    pub fn result_blob_cache(&self) -> &crate::storage::cache::BlobCache {
3894        &self.inner.result_blob_cache
3895    }
3896
3897    /// PLAN.md Phase 11.4 — owned snapshot of every registered
3898    /// replica's state on this primary. Returns empty vec on
3899    /// non-primary instances or when no replicas are registered yet.
3900    pub fn primary_replica_snapshots(&self) -> Vec<crate::replication::primary::ReplicaState> {
3901        self.inner
3902            .db
3903            .replication
3904            .as_ref()
3905            .map(|repl| repl.replica_snapshots())
3906            .unwrap_or_default()
3907    }
3908
3909    /// PLAN.md Phase 11.4 — active commit policy. Reads
3910    /// `RED_PRIMARY_COMMIT_POLICY` once at runtime construction;
3911    /// future env reloads will need a reload endpoint. Default is
3912    /// `Local` — current behavior, no replica blocking.
3913    pub fn commit_policy(&self) -> crate::replication::CommitPolicy {
3914        crate::replication::CommitPolicy::from_env()
3915    }
3916
3917    /// PLAN.md Phase 11.5 — accessor for replica-side apply error
3918    /// counters (gap / divergence / apply / decode). Returned
3919    /// snapshot is consistent across the four counters; the labels
3920    /// match `reddb_replica_apply_errors_total{kind}`.
3921    pub fn replica_apply_error_counts(
3922        &self,
3923    ) -> [(crate::replication::logical::ApplyErrorKind, u64); 4] {
3924        self.inner.replica_apply_metrics.snapshot()
3925    }
3926
3927    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
3928    /// returned; `is_configured()` lets callers short-circuit.
3929    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
3930        &self.inner.quota_bucket
3931    }
3932
3933    /// PLAN.md Phase 11.4 — observability snapshot of every
3934    /// replica's durable LSN as known to the commit waiter. Empty
3935    /// vec on non-primary instances or when no replica has acked.
3936    pub fn commit_waiter_snapshot(&self) -> Vec<(String, u64)> {
3937        self.inner
3938            .db
3939            .replication
3940            .as_ref()
3941            .map(|repl| repl.commit_waiter.snapshot())
3942            .unwrap_or_default()
3943    }
3944
3945    /// PLAN.md Phase 11.4 — `(reached, timed_out, not_required, last_micros)`
3946    /// counters for /metrics. Always-zero on non-primary instances.
3947    pub fn commit_waiter_metrics_snapshot(&self) -> (u64, u64, u64, u64) {
3948        self.inner
3949            .db
3950            .replication
3951            .as_ref()
3952            .map(|repl| repl.commit_waiter.metrics_snapshot())
3953            .unwrap_or((0, 0, 0, 0))
3954    }
3955
3956    /// PLAN.md Phase 11.4 — block until at least `count` replicas
3957    /// have durably applied through `target_lsn`, or `timeout`
3958    /// elapses. Returns the `AwaitOutcome` so the caller can decide
3959    /// whether to surface a timeout error to the client or continue
3960    /// (the policy mapping lives in the commit dispatcher).
3961    ///
3962    /// Foundation only — the write commit path doesn't yet call
3963    /// this. Wiring it is a per-surface task gated on the operator
3964    /// flipping `RED_PRIMARY_COMMIT_POLICY` away from `local`.
3965    pub fn await_replica_acks(
3966        &self,
3967        target_lsn: u64,
3968        count: u32,
3969        timeout: std::time::Duration,
3970    ) -> crate::replication::AwaitOutcome {
3971        match &self.inner.db.replication {
3972            Some(repl) => repl.commit_waiter.await_acks(target_lsn, count, timeout),
3973            None => {
3974                // No replication configured: policy must be `Local`.
3975                // Treat as immediate `NotRequired` so callers don't
3976                // block on a degenerate setup.
3977                crate::replication::AwaitOutcome::NotRequired
3978            }
3979        }
3980    }
3981
3982    /// PLAN.md Phase 11.4 — enforce the configured commit policy
3983    /// against `post_lsn` (the LSN of the just-completed write).
3984    /// Returns `Ok(AwaitOutcome)` on every successful enforcement
3985    /// (including `Reached` and `TimedOut` when fail-on-timeout is
3986    /// off). Returns `Err(ReadOnly)` only when:
3987    ///   * policy is `AckN(n)` with `n > 0`
3988    ///   * the wait timed out
3989    ///   * `RED_COMMIT_FAIL_ON_TIMEOUT=true` is set
3990    ///
3991    /// The HTTP / gRPC / wire surfaces map the error to 504 / wire
3992    /// backoff. Default behaviour (env unset) logs warn and returns
3993    /// success — matches PLAN.md "default v1 stays local" semantics
3994    /// while still letting the operator opt into hard-blocking.
3995    pub fn enforce_commit_policy(
3996        &self,
3997        post_lsn: u64,
3998    ) -> RedDBResult<crate::replication::AwaitOutcome> {
3999        let n = match self.commit_policy() {
4000            crate::replication::CommitPolicy::AckN(n) if n > 0 => n,
4001            _ => return Ok(crate::replication::AwaitOutcome::NotRequired),
4002        };
4003        let timeout_ms = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
4004            .ok()
4005            .and_then(|v| v.parse::<u64>().ok())
4006            .unwrap_or(5_000);
4007        let outcome =
4008            self.await_replica_acks(post_lsn, n, std::time::Duration::from_millis(timeout_ms));
4009        if let crate::replication::AwaitOutcome::TimedOut { observed, required } = &outcome {
4010            tracing::warn!(
4011                target: "reddb::commit",
4012                post_lsn,
4013                observed = *observed,
4014                required = *required,
4015                timeout_ms,
4016                "ack_n: timed out waiting for replicas"
4017            );
4018            let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
4019                .ok()
4020                .map(|v| {
4021                    let t = v.trim();
4022                    t.eq_ignore_ascii_case("true") || t == "1" || t.eq_ignore_ascii_case("yes")
4023                })
4024                .unwrap_or(false);
4025            if fail {
4026                return Err(RedDBError::ReadOnly(format!(
4027                    "commit policy timed out at lsn {post_lsn}: observed={observed} required={required} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
4028                )));
4029            }
4030        }
4031        Ok(outcome)
4032    }
4033
4034    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
4035    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
4036    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
4037    /// when the operator set the env but it doesn't parse, and
4038    /// `("disabled", None)` when no key is configured. The pager
4039    /// hookup is deferred — this accessor surfaces the operator's
4040    /// intent for /admin/status without yet using the key in writes.
4041    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
4042        match crate::crypto::page_encryption::key_from_env() {
4043            Ok(Some(_)) => ("enabled", None),
4044            Ok(None) => ("disabled", None),
4045            Err(err) => ("error", Some(err)),
4046        }
4047    }
4048
4049    /// PLAN.md Phase 11.5 — current replica apply health label
4050    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
4051    /// `stalled_gap`). Read from the persisted `red.replication.state`
4052    /// config key updated by the replica loop. Returns `None` on
4053    /// non-replica instances or when no apply has run yet.
4054    pub fn replica_apply_health(&self) -> Option<String> {
4055        let state = self.config_string("red.replication.state", "");
4056        if state.is_empty() {
4057            None
4058        } else {
4059            Some(state)
4060        }
4061    }
4062
4063    /// Current local LSN paired with the LSN of the most recently
4064    /// archived WAL segment. The difference is the replication /
4065    /// archive lag operators alert on (PLAN.md Phase 5.1). Returns
4066    /// `(0, 0)` when neither replication nor archiving is configured.
4067    pub fn wal_archive_progress(&self) -> (u64, u64) {
4068        let current_lsn = self
4069            .inner
4070            .db
4071            .replication
4072            .as_ref()
4073            .map(|repl| {
4074                repl.logical_wal_spool
4075                    .as_ref()
4076                    .map(|spool| spool.current_lsn())
4077                    .unwrap_or_else(|| repl.wal_buffer.current_lsn())
4078            })
4079            .unwrap_or_else(|| self.inner.cdc.current_lsn());
4080        let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
4081        (current_lsn, last_archived_lsn)
4082    }
4083
4084    /// Trigger an immediate backup.
4085    pub fn trigger_backup(&self) -> RedDBResult<crate::replication::scheduler::BackupResult> {
4086        self.check_write(crate::runtime::write_gate::WriteKind::Backup)?;
4087        // Defense in depth — check_write above already rejects when
4088        // the lease is NotHeld, but log + audit the lease angle here
4089        // explicitly so dashboards distinguish "lease lost" from a
4090        // generic read-only refusal.
4091        self.assert_remote_write_allowed("admin/backup")?;
4092        let started = std::time::Instant::now();
4093        let snapshot = self.create_snapshot()?;
4094        let mut uploaded = false;
4095
4096        if let (Some(backend), Some(path)) = (&self.inner.db.remote_backend, self.inner.db.path()) {
4097            let default_snapshot_prefix = self.inner.db.options().default_snapshot_prefix();
4098            let default_wal_prefix = self.inner.db.options().default_wal_archive_prefix();
4099            let default_head_key = self.inner.db.options().default_backup_head_key();
4100            let snapshot_prefix = self.config_string(
4101                "red.config.backup.snapshot_prefix",
4102                &default_snapshot_prefix,
4103            );
4104            let wal_prefix =
4105                self.config_string("red.config.wal.archive.prefix", &default_wal_prefix);
4106            let head_key = self.config_string("red.config.backup.head_key", &default_head_key);
4107            let timeline_id = self.config_string("red.config.timeline.id", "main");
4108            let snapshot_key = crate::storage::wal::archive_snapshot(
4109                backend.as_ref(),
4110                path,
4111                snapshot.snapshot_id,
4112                &snapshot_prefix,
4113            )
4114            .map_err(|err| RedDBError::Internal(err.to_string()))?;
4115            let current_lsn = self
4116                .inner
4117                .db
4118                .replication
4119                .as_ref()
4120                .map(|repl| {
4121                    repl.logical_wal_spool
4122                        .as_ref()
4123                        .map(|spool| spool.current_lsn())
4124                        .unwrap_or_else(|| repl.wal_buffer.current_lsn())
4125                })
4126                .unwrap_or_else(|| self.inner.cdc.current_lsn());
4127            let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
4128            // Hash the local snapshot bytes so the manifest can carry
4129            // the digest for restore-side verification (PLAN.md
4130            // Phase 4). Failure to hash is non-fatal — we still
4131            // publish the manifest, just without a checksum, so a
4132            // future fix can backfill rather than losing the backup.
4133            let snapshot_sha256 =
4134                crate::storage::wal::SnapshotManifest::compute_snapshot_sha256(path)
4135                    .map_err(|err| {
4136                        tracing::warn!(
4137                            target: "reddb::backup",
4138                            error = %err,
4139                            snapshot_id = snapshot.snapshot_id,
4140                            "snapshot hash failed; manifest will lack checksum"
4141                        );
4142                    })
4143                    .ok();
4144            let manifest = crate::storage::wal::SnapshotManifest {
4145                timeline_id: timeline_id.clone(),
4146                snapshot_key: snapshot_key.clone(),
4147                snapshot_id: snapshot.snapshot_id,
4148                snapshot_time: snapshot.created_at_unix_ms as u64,
4149                base_lsn: current_lsn,
4150                schema_version: crate::api::REDDB_FORMAT_VERSION,
4151                format_version: crate::api::REDDB_FORMAT_VERSION,
4152                snapshot_sha256,
4153            };
4154            crate::storage::wal::publish_snapshot_manifest(backend.as_ref(), &manifest)
4155                .map_err(|err| RedDBError::Internal(err.to_string()))?;
4156
4157            // PLAN.md Phase 11.3 — read the head of the WAL hash chain
4158            // so the new segment can link back. `None` means we're
4159            // starting a fresh timeline (after a clean restore or on
4160            // first archive ever); the segment's `prev_hash` will be
4161            // `None` and restore-side validation accepts that only for
4162            // the first segment in `plan.wal_segments`.
4163            let prev_segment_hash = self.config_string("red.config.timeline.last_segment_hash", "");
4164            let prev_hash_arg = if prev_segment_hash.is_empty() {
4165                None
4166            } else {
4167                Some(prev_segment_hash)
4168            };
4169
4170            let archived_lsn = if let Some(primary) = &self.inner.db.replication {
4171                let oldest = primary
4172                    .logical_wal_spool
4173                    .as_ref()
4174                    .and_then(|spool| spool.oldest_lsn().ok().flatten())
4175                    .or_else(|| primary.wal_buffer.oldest_lsn())
4176                    .unwrap_or(last_archived_lsn);
4177                if last_archived_lsn > 0 && last_archived_lsn < oldest.saturating_sub(1) {
4178                    return Err(RedDBError::Internal(format!(
4179                        "logical WAL gap detected: last_archived_lsn={last_archived_lsn}, oldest_available_lsn={oldest}"
4180                    )));
4181                }
4182                let records = if let Some(spool) = &primary.logical_wal_spool {
4183                    spool
4184                        .read_since(last_archived_lsn, usize::MAX)
4185                        .map_err(|err| RedDBError::Internal(err.to_string()))?
4186                } else {
4187                    primary.wal_buffer.read_since(last_archived_lsn, usize::MAX)
4188                };
4189                if let Some(meta) = crate::storage::wal::archive_change_records(
4190                    backend.as_ref(),
4191                    &wal_prefix,
4192                    &records,
4193                    prev_hash_arg,
4194                )
4195                .map_err(|err| RedDBError::Internal(err.to_string()))?
4196                {
4197                    if let Some(spool) = &primary.logical_wal_spool {
4198                        let _ = spool.prune_through(meta.lsn_end);
4199                    }
4200                    // Advance the chain head so the next archive call
4201                    // links to this segment's hash. If the segment has
4202                    // no sha256 (legacy / hashing failed) we leave the
4203                    // head as-is — the next segment then carries the
4204                    // prior chain head, preserving continuity.
4205                    if let Some(sha) = &meta.sha256 {
4206                        self.inner.db.store().set_config_tree(
4207                            "red.config.timeline",
4208                            &crate::json!({ "last_segment_hash": sha }),
4209                        );
4210                    }
4211                    meta.lsn_end
4212                } else {
4213                    last_archived_lsn
4214                }
4215            } else {
4216                last_archived_lsn
4217            };
4218
4219            let head = crate::storage::wal::BackupHead {
4220                timeline_id,
4221                snapshot_key,
4222                snapshot_id: snapshot.snapshot_id,
4223                snapshot_time: snapshot.created_at_unix_ms as u64,
4224                current_lsn,
4225                last_archived_lsn: archived_lsn,
4226                wal_prefix,
4227            };
4228            crate::storage::wal::publish_backup_head(backend.as_ref(), &head_key, &head)
4229                .map_err(|err| RedDBError::Internal(err.to_string()))?;
4230            self.inner.db.store().set_config_tree(
4231                "red.config.timeline",
4232                &crate::json!({
4233                    "last_archived_lsn": archived_lsn,
4234                    "id": head.timeline_id
4235                }),
4236            );
4237
4238            // PLAN.md Phase 2.4 — refresh the unified `MANIFEST.json`
4239            // at the prefix root so external tooling sees a single
4240            // catalog of every snapshot + WAL segment with their
4241            // checksums. Best-effort: a manifest publish failure
4242            // doesn't fail the backup (the per-artifact sidecars
4243            // already give restore-side integrity), but it does log
4244            // so dashboards can flag stale catalogs.
4245            if let Err(err) = crate::storage::wal::publish_unified_manifest_for_prefix(
4246                backend.as_ref(),
4247                &snapshot_prefix,
4248            ) {
4249                tracing::warn!(
4250                    target: "reddb::backup",
4251                    error = %err,
4252                    snapshot_prefix = %snapshot_prefix,
4253                    "unified MANIFEST.json refresh failed; per-artifact sidecars unaffected"
4254                );
4255            }
4256
4257            // PLAN.md Phase 11.4 — when the operator picked a
4258            // commit policy that demands replica durability, block
4259            // until the configured count of replicas has acked the
4260            // archived LSN (or the timeout fires). For backup the
4261            // policy decides the *DR posture* — `local` returns
4262            // immediately, `ack_n` ensures at least N replicas saw
4263            // the new tail before we report success to the
4264            // operator. A `TimedOut` is logged but does NOT fail
4265            // the backup: the local WAL + remote upload are durable
4266            // regardless; the missing acks are reported via
4267            // /metrics and /admin/status so the operator can decide.
4268            match self.commit_policy() {
4269                crate::replication::CommitPolicy::AckN(n) if n > 0 => {
4270                    let timeout = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
4271                        .ok()
4272                        .and_then(|v| v.parse::<u64>().ok())
4273                        .unwrap_or(5_000);
4274                    let outcome = self.await_replica_acks(
4275                        archived_lsn,
4276                        n,
4277                        std::time::Duration::from_millis(timeout),
4278                    );
4279                    match outcome {
4280                        crate::replication::AwaitOutcome::Reached(count) => {
4281                            tracing::debug!(
4282                                target: "reddb::backup",
4283                                archived_lsn,
4284                                n,
4285                                count,
4286                                "ack_n: replicas synced before backup return"
4287                            );
4288                        }
4289                        crate::replication::AwaitOutcome::TimedOut { observed, required } => {
4290                            tracing::warn!(
4291                                target: "reddb::backup",
4292                                archived_lsn,
4293                                observed,
4294                                required,
4295                                timeout_ms = timeout,
4296                                "ack_n: timed out waiting for replicas; backup uploaded but DR posture degraded"
4297                            );
4298                        }
4299                        crate::replication::AwaitOutcome::NotRequired => {}
4300                    }
4301                }
4302                _ => {} // Local / RemoteWal / Quorum: no blocking yet
4303            }
4304
4305            // Issue #148 follow-up — opt-in archive of the L2 Blob Cache
4306            // directory tree. Default off so a standard backup stays
4307            // small; flip via `red.config.backup.include_blob_cache=true`
4308            // when warm-cache restore is required (per
4309            // docs/operations/blob-cache-backup-restore.md §1).
4310            //
4311            // The L2 tree is *derived* state (ADR 0006) — its absence
4312            // never causes data loss; it only affects post-restore
4313            // p99 latency until the cache re-warms. We therefore log
4314            // (not fail) on per-file upload errors so a partial L2
4315            // upload never aborts a healthy snapshot+WAL backup.
4316            if self.config_bool("red.config.backup.include_blob_cache", false) {
4317                let blob_cache_prefix = self.config_string(
4318                    "red.config.backup.blob_cache_prefix",
4319                    &format!("{snapshot_prefix}blob_cache/"),
4320                );
4321                if let Some(l2_path) = self.inner.result_blob_cache.l2_path() {
4322                    match crate::storage::cache::archive_blob_cache_l2(
4323                        backend.as_ref(),
4324                        l2_path,
4325                        &blob_cache_prefix,
4326                    ) {
4327                        Ok(count) => {
4328                            tracing::info!(
4329                                target: "reddb::backup",
4330                                files_uploaded = count,
4331                                blob_cache_prefix = %blob_cache_prefix,
4332                                "include_blob_cache: archived L2 directory"
4333                            );
4334                        }
4335                        Err(err) => {
4336                            tracing::warn!(
4337                                target: "reddb::backup",
4338                                error = %err,
4339                                blob_cache_prefix = %blob_cache_prefix,
4340                                "include_blob_cache: L2 archive failed; backup proceeding (cache is derived state)"
4341                            );
4342                        }
4343                    }
4344                } else {
4345                    tracing::debug!(
4346                        target: "reddb::backup",
4347                        "include_blob_cache=true but no L2 path configured; nothing to archive"
4348                    );
4349                }
4350            }
4351
4352            uploaded = true;
4353        }
4354
4355        Ok(crate::replication::scheduler::BackupResult {
4356            snapshot_id: snapshot.snapshot_id,
4357            uploaded,
4358            duration_ms: started.elapsed().as_millis() as u64,
4359            timestamp: snapshot.created_at_unix_ms as u64,
4360        })
4361    }
4362
4363    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
4364        let mut pool = self
4365            .inner
4366            .pool
4367            .lock()
4368            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
4369        if pool.active >= self.inner.pool_config.max_connections {
4370            return Err(RedDBError::Internal(
4371                "connection pool exhausted".to_string(),
4372            ));
4373        }
4374
4375        let id = if let Some(id) = pool.idle.pop() {
4376            id
4377        } else {
4378            let id = pool.next_id;
4379            pool.next_id += 1;
4380            id
4381        };
4382        pool.active += 1;
4383        pool.total_checkouts += 1;
4384        drop(pool);
4385
4386        Ok(RuntimeConnection {
4387            id,
4388            inner: Arc::clone(&self.inner),
4389        })
4390    }
4391
4392    pub fn checkpoint(&self) -> RedDBResult<()> {
4393        // Local fsync always allowed — losing the lease shouldn't
4394        // prevent us from durably persisting what's already in memory.
4395        // The remote upload is the side-effect that risks clobbering a
4396        // peer's state, so it's behind the lease gate.
4397        self.inner.db.flush_local_only().map_err(|err| {
4398            // Issue #205 — local flush failure is a CheckpointFailed
4399            // operator-grade event. The local-flush path also covers
4400            // the WAL fsync we depend on, so a failure here doubles as
4401            // the WalFsyncFailed signal for the runtime entry point.
4402            let msg = err.to_string();
4403            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
4404                lsn: 0,
4405                error: msg.clone(),
4406            }
4407            .emit_global();
4408            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
4409                path: "<flush_local_only>".to_string(),
4410                error: msg.clone(),
4411            }
4412            .emit_global();
4413            RedDBError::Engine(msg)
4414        })?;
4415        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
4416            tracing::warn!(
4417                target: "reddb::serverless::lease",
4418                error = %err,
4419                "checkpoint: skipping remote upload — lease not held"
4420            );
4421            return Ok(());
4422        }
4423        self.inner
4424            .db
4425            .upload_to_remote_backend()
4426            .map_err(|err| RedDBError::Engine(err.to_string()))
4427    }
4428
4429    /// Guard remote-mutating operations on the writer lease.
4430    /// Returns `Ok(())` when no remote backend is configured (the
4431    /// lease is irrelevant) or the lease state is `NotRequired` /
4432    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
4433    /// `NotHeld`, with an audit-friendly action label so the caller
4434    /// can record the rejection.
4435    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
4436        if self.inner.db.remote_backend.is_none() {
4437            return Ok(());
4438        }
4439        match self.inner.write_gate.lease_state() {
4440            crate::runtime::write_gate::LeaseGateState::NotHeld => {
4441                self.inner.audit_log.record(
4442                    action,
4443                    "system",
4444                    "remote_backend",
4445                    "err: writer lease not held",
4446                    crate::json::Value::Null,
4447                );
4448                Err(RedDBError::ReadOnly(format!(
4449                    "writer lease not held — {action} blocked (serverless fence)"
4450                )))
4451            }
4452            _ => Ok(()),
4453        }
4454    }
4455
4456    pub fn run_maintenance(&self) -> RedDBResult<()> {
4457        self.inner
4458            .db
4459            .run_maintenance()
4460            .map_err(|err| RedDBError::Internal(err.to_string()))
4461    }
4462
4463    pub fn scan_collection(
4464        &self,
4465        collection: &str,
4466        cursor: Option<ScanCursor>,
4467        limit: usize,
4468    ) -> RedDBResult<ScanPage> {
4469        let store = self.inner.db.store();
4470        let manager = store
4471            .get_collection(collection)
4472            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
4473
4474        let mut entities = manager.query_all(|_| true);
4475        entities.sort_by_key(|entity| entity.id.raw());
4476
4477        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
4478        let total = entities.len();
4479        let end = total.min(offset.saturating_add(limit.max(1)));
4480        let items = if offset >= total {
4481            Vec::new()
4482        } else {
4483            entities[offset..end].to_vec()
4484        };
4485        let next = (end < total).then_some(ScanCursor { offset: end });
4486
4487        Ok(ScanPage {
4488            collection: collection.to_string(),
4489            items,
4490            next,
4491            total,
4492        })
4493    }
4494
4495    pub fn catalog(&self) -> CatalogModelSnapshot {
4496        self.inner.db.catalog_model_snapshot()
4497    }
4498
4499    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
4500        self.inner.db.catalog_consistency_report()
4501    }
4502
4503    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
4504        crate::catalog::attention_summary(&self.catalog())
4505    }
4506
4507    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
4508        crate::catalog::collection_attention(&self.catalog())
4509    }
4510
4511    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
4512        crate::catalog::index_attention(&self.catalog())
4513    }
4514
4515    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
4516        crate::catalog::graph_projection_attention(&self.catalog())
4517    }
4518
4519    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
4520        crate::catalog::analytics_job_attention(&self.catalog())
4521    }
4522
4523    pub fn stats(&self) -> RuntimeStats {
4524        let pool = runtime_pool_lock(self);
4525        RuntimeStats {
4526            active_connections: pool.active,
4527            idle_connections: pool.idle.len(),
4528            total_checkouts: pool.total_checkouts,
4529            paged_mode: self.inner.db.is_paged(),
4530            started_at_unix_ms: self.inner.started_at_unix_ms,
4531            store: self.inner.db.stats(),
4532            system: SystemInfo::collect(),
4533            result_blob_cache: self.inner.result_blob_cache.stats(),
4534            kv: self.inner.kv_stats.snapshot(),
4535            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
4536        }
4537    }
4538
4539    pub(crate) fn record_metrics_ingest(
4540        &self,
4541        accepted_samples: u64,
4542        accepted_series: u64,
4543        rejected_samples: u64,
4544        rejected_series: u64,
4545    ) {
4546        self.inner.metrics_ingest_stats.record(
4547            accepted_samples,
4548            accepted_series,
4549            rejected_samples,
4550            rejected_series,
4551        );
4552    }
4553
4554    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
4555        self.inner
4556            .metrics_ingest_stats
4557            .record_cardinality_budget_rejections(rejected_series);
4558    }
4559
4560    pub(crate) fn record_metrics_tenant_activity(
4561        &self,
4562        tenant: &str,
4563        namespace: &str,
4564        operation: &str,
4565    ) {
4566        self.inner
4567            .metrics_tenant_activity_stats
4568            .record(tenant, namespace, operation);
4569    }
4570
4571    pub(crate) fn metrics_tenant_activity_snapshot(
4572        &self,
4573    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
4574        self.inner.metrics_tenant_activity_stats.snapshot()
4575    }
4576
4577    /// Execute a query under a typed scope override without embedding
4578    /// the tenant / user / role values into the SQL string. Use this
4579    /// from transport middleware (HTTP / gRPC / worker loops) where the
4580    /// scope is resolved from auth claims and the SQL is a parameterised
4581    /// template — avoids the string-concat injection risk of building
4582    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4583    /// prepared statements that didn't know about tenancy.
4584    ///
4585    /// Precedence matches the `WITHIN` clause: the passed `scope`
4586    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4587    /// The override is pushed on the thread-local scope stack for the
4588    /// duration of the call and popped on return — pool-shared
4589    /// connections cannot leak it across requests.
4590    pub fn execute_query_with_scope(
4591        &self,
4592        query: &str,
4593        scope: crate::runtime::within_clause::ScopeOverride,
4594    ) -> RedDBResult<RuntimeQueryResult> {
4595        if scope.is_empty() {
4596            return self.execute_query(query);
4597        }
4598        let _scope_guard = ScopeOverrideGuard::install(scope);
4599        self.execute_query(query)
4600    }
4601
4602    /// Issue #205 — single lifecycle exit for slow-query logging.
4603    ///
4604    /// `execute_query_inner` does the real work; this wrapper times it
4605    /// and, if elapsed exceeds the configured threshold, hands the
4606    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
4607    /// SlowQueryLogger. The threshold + sample_pct were captured at
4608    /// SlowQueryLogger construction (runtime startup), so the per-call
4609    /// cost on below-threshold paths is one relaxed atomic load.
4610    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4611        let started = std::time::Instant::now();
4612        let result = self.execute_query_inner(query);
4613        let elapsed_ms = started.elapsed().as_millis() as u64;
4614
4615        // Build EffectiveScope from the same thread-locals frame-build
4616        // consults — keeps the slow-log row consistent with the audit /
4617        // RLS view of "this statement". `ai_scope()` is the canonical
4618        // builder.
4619        let scope = self.ai_scope();
4620        let kind = match result
4621            .as_ref()
4622            .map(|r| r.statement_type)
4623            .unwrap_or("select")
4624        {
4625            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4626            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4627            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4628            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4629            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4630        };
4631        // SQL redaction: pass the raw query through. The slow-query
4632        // logger writes structured JSON so embedded literals stay
4633        // escape-safe at the JSON boundary (proven by
4634        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4635        // PII redaction (e.g. literal masking) is a follow-up.
4636        self.inner
4637            .slow_query_logger
4638            .record(kind, elapsed_ms, query.to_string(), &scope);
4639
4640        result
4641    }
4642
4643    #[inline(never)]
4644    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4645        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4646        //
4647        // Moved above every boot-cost the normal path pays (WITHIN
4648        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
4649        // guard, tracing span, tx_contexts read) because the bench's
4650        // `select_point` scenario was observed at 28× vs PostgreSQL —
4651        // the dominant cost wasn't the entity fetch but the ceremony
4652        // before it. Only fires when there's no ambient transaction
4653        // context or WITHIN override, so the snapshot install we skip
4654        // truly is a no-op for this query.
4655        if !has_scope_override_active()
4656            && !query.trim_start().starts_with("WITHIN")
4657            && !query.trim_start().starts_with("within")
4658            && !self
4659                .inner
4660                .tx_contexts
4661                .read()
4662                .contains_key(&current_connection_id())
4663        {
4664            if let Some(result) = self.try_fast_entity_lookup(query) {
4665                return result;
4666            }
4667        }
4668
4669        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4670        // strip the prefix, push a stack-scoped override, recurse on
4671        // the inner statement, pop on return. Stack lives in a
4672        // thread-local but is balanced by the RAII guard, so a
4673        // pool-shared connection cannot leak the override across
4674        // requests and an early `?` return still pops cleanly.
4675        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4676            Ok(Some((scope, inner))) => {
4677                let _scope_guard = ScopeOverrideGuard::install(scope);
4678                // Re-enter the inner path, NOT `execute_query`, so the
4679                // slow-query lifecycle hook records exactly one row per
4680                // top-level statement (the WITHIN-stripped form would
4681                // double-record).
4682                return self.execute_query_inner(inner);
4683            }
4684            Ok(None) => {}
4685            Err(msg) => return Err(RedDBError::Query(msg)),
4686        }
4687
4688        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4689        // inner statement (WITHOUT executing it) and returns the
4690        // CanonicalLogicalNode tree as rows so the caller can see the
4691        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4692        // is a distinct schema-diff command and continues down the
4693        // regular SQL path.
4694        if let Some(inner) = strip_explain_prefix(query) {
4695            return self.explain_as_rows(query, inner);
4696        }
4697
4698        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4699        // override and return. Outside a transaction the statement is
4700        // an error (matches PG semantics: SET LOCAL only takes effect
4701        // within an active transaction).
4702        if let Some(value) = parse_set_local_tenant(query)? {
4703            let conn_id = current_connection_id();
4704            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4705                return Err(RedDBError::Query(
4706                    "SET LOCAL TENANT requires an active transaction".to_string(),
4707                ));
4708            }
4709            self.inner
4710                .tx_local_tenants
4711                .write()
4712                .insert(conn_id, value.clone());
4713            return Ok(RuntimeQueryResult::ok_message(
4714                query.to_string(),
4715                &match &value {
4716                    Some(id) => format!("local tenant set: {id}"),
4717                    None => "local tenant cleared".to_string(),
4718                },
4719                "set_local_tenant",
4720            ));
4721        }
4722
4723        if super::red_schema::is_system_schema_write(query) {
4724            return Err(RedDBError::Query(
4725                super::red_schema::READ_ONLY_ERROR.to_string(),
4726            ));
4727        }
4728
4729        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4730        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4731
4732        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4733        let _frame_guards = frame.install(self);
4734
4735        // Phase 6 logging: enter a span stamped with conn_id / tenant
4736        // / query_len. Every downstream tracing::info!/warn!/error!
4737        // inherits these fields — no need to thread them manually
4738        // through storage/scan layers. Entered AFTER the WITHIN /
4739        // SET LOCAL TENANT resolution above so the span reflects the
4740        // effective scope for this statement.
4741        let _log_span = crate::telemetry::span::query_span(query).entered();
4742
4743        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4744        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4745            return self.execute_query_expr(rewritten);
4746        }
4747
4748        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4749        if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4750            return result;
4751        }
4752
4753        // ── Result cache: return cached result if still fresh (30s TTL) ──
4754        if let Some(result) = frame.read_result_cache(self) {
4755            return Ok(result);
4756        }
4757
4758        let prepared = frame.prepare_statement(self, execution_query)?;
4759        let mode = prepared.mode;
4760        let expr = prepared.expr;
4761
4762        let statement = query_expr_name(&expr);
4763        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4764
4765        let _lock_guard = frame.prepare_dispatch(self, &expr)?;
4766        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4767
4768        let query_result = match expr {
4769            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4770                // Apply MVCC visibility + RLS gate while materialising the
4771                // graph: every node entity is screened against the source
4772                // collection's policy chain (basic and `Nodes`-targeted)
4773                // and dropped when the caller's tenant / role doesn't
4774                // admit it. Edges are pruned automatically because the
4775                // graph builder skips edges whose endpoints aren't in
4776                // `allowed_nodes`.
4777                let (graph, node_properties, edge_properties) =
4778                    self.materialize_graph_with_rls()?;
4779                let result =
4780                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
4781                        &graph,
4782                        &expr,
4783                        node_properties,
4784                        edge_properties,
4785                    )
4786                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4787
4788                Ok(RuntimeQueryResult {
4789                    query: query.to_string(),
4790                    mode,
4791                    statement,
4792                    engine: "materialized-graph",
4793                    result,
4794                    affected_rows: 0,
4795                    statement_type: "select",
4796                })
4797            }
4798            QueryExpr::Table(table) => {
4799                let table = self.resolve_table_expr_subqueries(
4800                    table,
4801                    &frame as &dyn super::statement_frame::ReadFrame,
4802                )?;
4803                if super::red_schema::is_virtual_table(&table.table) {
4804                    return Ok(RuntimeQueryResult {
4805                        query: query.to_string(),
4806                        mode,
4807                        statement,
4808                        engine: "runtime-red-schema",
4809                        result: super::red_schema::red_query(
4810                            self,
4811                            &table.table,
4812                            &table,
4813                            &frame as &dyn super::statement_frame::ReadFrame,
4814                        )?,
4815                        affected_rows: 0,
4816                        statement_type: "select",
4817                    });
4818                }
4819
4820                if let Some(result) = self.execute_probabilistic_select(&table)? {
4821                    return Ok(RuntimeQueryResult {
4822                        query: query.to_string(),
4823                        mode,
4824                        statement,
4825                        engine: "runtime-probabilistic",
4826                        result,
4827                        affected_rows: 0,
4828                        statement_type: "select",
4829                    });
4830                }
4831
4832                // Foreign-table intercept (Phase 3.2.2 PG parity).
4833                //
4834                // When the referenced table matches a `CREATE FOREIGN TABLE`
4835                // registration, short-circuit into the FDW scan. Phase 3.2
4836                // wrappers don't yet support pushdown, so filters/projections
4837                // apply post-scan via `apply_foreign_table_filters` — good
4838                // enough for correctness; perf work lands in 3.2.3.
4839                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4840                    let records = self
4841                        .inner
4842                        .foreign_tables
4843                        .scan(&table.table)
4844                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4845                    let result = apply_foreign_table_filters(records, &table);
4846                    return Ok(RuntimeQueryResult {
4847                        query: query.to_string(),
4848                        mode,
4849                        statement,
4850                        engine: "runtime-fdw",
4851                        result,
4852                        affected_rows: 0,
4853                        statement_type: "select",
4854                    });
4855                }
4856
4857                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4858                //
4859                // When RLS is enabled on this table, fetch every policy
4860                // that applies to the current (role, SELECT) pair and
4861                // fold them into the query's WHERE clause: policies
4862                // OR-combine (any of them admitting the row is enough),
4863                // then AND into the caller's existing filter.
4864                //
4865                // Anonymous callers (no thread-local identity) pass
4866                // `role = None`; policies with a specific `TO role`
4867                // clause skip, but `TO PUBLIC` policies still apply.
4868                //
4869                // When `inject_rls_filters` returns `None` the table has
4870                // RLS enabled but no policy admits the caller's role —
4871                // short-circuit with an empty result set instead of
4872                // synthesising a contradiction filter.
4873                let Some(table_with_rls) = self.authorize_relational_table_select(
4874                    table,
4875                    &frame as &dyn super::statement_frame::ReadFrame,
4876                )?
4877                else {
4878                    let empty = crate::storage::query::unified::UnifiedResult::empty();
4879                    return Ok(RuntimeQueryResult {
4880                        query: query.to_string(),
4881                        mode,
4882                        statement,
4883                        engine: "runtime-table-rls",
4884                        result: empty,
4885                        affected_rows: 0,
4886                        statement_type: "select",
4887                    });
4888                };
4889                Ok(RuntimeQueryResult {
4890                    query: query.to_string(),
4891                    mode,
4892                    statement,
4893                    engine: "runtime-table",
4894                    result: execute_runtime_table_query(
4895                        &self.inner.db,
4896                        &table_with_rls,
4897                        Some(&self.inner.index_store),
4898                    )?,
4899                    affected_rows: 0,
4900                    statement_type: "select",
4901                })
4902            }
4903            QueryExpr::Join(join) => {
4904                // Fold per-table RLS filters into each `QueryExpr::Table`
4905                // leaf of the join tree before executing. Without this
4906                // the join executor scans both tables raw and ignores
4907                // policies — a `WITHIN TENANT 'x'` against a join of
4908                // two tenant-scoped tables would leak cross-tenant rows.
4909                // When any leaf has RLS enabled and zero matching policy,
4910                // short-circuit to an empty join result instead of
4911                // emitting a contradiction filter.
4912                let join_with_rls = match self.authorize_relational_join_select(
4913                    join,
4914                    &frame as &dyn super::statement_frame::ReadFrame,
4915                )? {
4916                    Some(j) => j,
4917                    None => {
4918                        return Ok(RuntimeQueryResult {
4919                            query: query.to_string(),
4920                            mode,
4921                            statement,
4922                            engine: "runtime-join-rls",
4923                            result: crate::storage::query::unified::UnifiedResult::empty(),
4924                            affected_rows: 0,
4925                            statement_type: "select",
4926                        });
4927                    }
4928                };
4929                Ok(RuntimeQueryResult {
4930                    query: query.to_string(),
4931                    mode,
4932                    statement,
4933                    engine: "runtime-join",
4934                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
4935                    affected_rows: 0,
4936                    statement_type: "select",
4937                })
4938            }
4939            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
4940                query: query.to_string(),
4941                mode,
4942                statement,
4943                engine: "runtime-vector",
4944                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
4945                affected_rows: 0,
4946                statement_type: "select",
4947            }),
4948            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
4949                query: query.to_string(),
4950                mode,
4951                statement,
4952                engine: "runtime-hybrid",
4953                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
4954                affected_rows: 0,
4955                statement_type: "select",
4956            }),
4957            // DML execution
4958            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
4959                Err(RedDBError::Query(
4960                    super::red_schema::READ_ONLY_ERROR.to_string(),
4961                ))
4962            }
4963            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
4964                Err(RedDBError::Query(
4965                    super::red_schema::READ_ONLY_ERROR.to_string(),
4966                ))
4967            }
4968            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
4969                Err(RedDBError::Query(
4970                    super::red_schema::READ_ONLY_ERROR.to_string(),
4971                ))
4972            }
4973            QueryExpr::Insert(ref insert) => self
4974                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
4975                    self.execute_insert(query, insert)
4976                }),
4977            QueryExpr::Update(ref update) => self
4978                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
4979                    self.execute_update(query, update)
4980                }),
4981            QueryExpr::Delete(ref delete) => self
4982                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
4983                    self.execute_delete(query, delete)
4984                }),
4985            // DDL execution
4986            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
4987            QueryExpr::CreateCollection(ref create) => {
4988                self.execute_create_collection(query, create)
4989            }
4990            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
4991            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
4992            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
4993            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
4994            QueryExpr::DropDocument(ref drop_document) => {
4995                self.execute_drop_document(query, drop_document)
4996            }
4997            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
4998            QueryExpr::DropCollection(ref drop_collection) => {
4999                self.execute_drop_collection(query, drop_collection)
5000            }
5001            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
5002            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
5003            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
5004            // Graph analytics commands
5005            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
5006            // Search commands
5007            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
5008            // ASK: RAG query with LLM synthesis
5009            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
5010            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
5011            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
5012            QueryExpr::ProbabilisticCommand(ref cmd) => {
5013                self.execute_probabilistic_command(query, cmd)
5014            }
5015            // Time-series DDL
5016            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
5017            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
5018            // Queue DDL and commands
5019            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
5020            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
5021            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
5022            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
5023            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
5024            QueryExpr::EventsBackfill(ref backfill) => {
5025                self.execute_events_backfill(query, backfill)
5026            }
5027            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
5028                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
5029            ))),
5030            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
5031            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
5032            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
5033            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
5034            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
5035            // SET CONFIG key = value
5036            QueryExpr::SetConfig { ref key, ref value } => {
5037                if key.starts_with("red.secret.") {
5038                    return Err(RedDBError::Query(
5039                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
5040                    ));
5041                }
5042                let store = self.inner.db.store();
5043                let json_val = match value {
5044                    Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
5045                    Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
5046                    Value::Float(n) => crate::serde_json::Value::Number(*n),
5047                    Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
5048                    _ => crate::serde_json::Value::String(value.to_string()),
5049                };
5050                store.set_config_tree(key, &json_val);
5051                update_current_config_value(key, value.clone());
5052                // Config changes can flip runtime behavior mid-session
5053                // (auto_decrypt, auto_encrypt, etc.) — invalidate the
5054                // result cache so subsequent reads re-execute against
5055                // the new config.
5056                self.invalidate_result_cache();
5057                Ok(RuntimeQueryResult::ok_message(
5058                    query.to_string(),
5059                    &format!("config set: {key}"),
5060                    "set",
5061                ))
5062            }
5063            // SET SECRET key = value
5064            QueryExpr::SetSecret { ref key, ref value } => {
5065                if key.starts_with("red.config.") {
5066                    return Err(RedDBError::Query(
5067                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
5068                    ));
5069                }
5070                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5071                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
5072                })?;
5073                if matches!(value, Value::Null) {
5074                    auth_store
5075                        .vault_kv_try_delete(key)
5076                        .map_err(|err| RedDBError::Query(err.to_string()))?;
5077                    update_current_secret_value(key, None);
5078                    self.invalidate_result_cache();
5079                    return Ok(RuntimeQueryResult::ok_message(
5080                        query.to_string(),
5081                        &format!("secret deleted: {key}"),
5082                        "delete_secret",
5083                    ));
5084                }
5085                let value = secret_sql_value_to_string(value)?;
5086                auth_store
5087                    .vault_kv_try_set(key.clone(), value.clone())
5088                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5089                update_current_secret_value(key, Some(value));
5090                self.invalidate_result_cache();
5091                Ok(RuntimeQueryResult::ok_message(
5092                    query.to_string(),
5093                    &format!("secret set: {key}"),
5094                    "set_secret",
5095                ))
5096            }
5097            // DELETE SECRET key
5098            QueryExpr::DeleteSecret { ref key } => {
5099                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5100                    RedDBError::Query(
5101                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
5102                    )
5103                })?;
5104                let deleted = auth_store
5105                    .vault_kv_try_delete(key)
5106                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5107                if deleted {
5108                    update_current_secret_value(key, None);
5109                }
5110                self.invalidate_result_cache();
5111                Ok(RuntimeQueryResult::ok_message(
5112                    query.to_string(),
5113                    &format!("secret deleted: {key}"),
5114                    if deleted {
5115                        "delete_secret"
5116                    } else {
5117                        "delete_secret_not_found"
5118                    },
5119                ))
5120            }
5121            // SHOW SECRET[S] [prefix]
5122            QueryExpr::ShowSecrets { ref prefix } => {
5123                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5124                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
5125                })?;
5126                if !auth_store.is_vault_backed() {
5127                    return Err(RedDBError::Query(
5128                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
5129                    ));
5130                }
5131                let mut keys = auth_store.vault_kv_keys();
5132                keys.sort();
5133                let mut result = UnifiedResult::with_columns(vec![
5134                    "key".into(),
5135                    "value".into(),
5136                    "status".into(),
5137                ]);
5138                for key in keys {
5139                    if let Some(ref pfx) = prefix {
5140                        if !key.starts_with(pfx) {
5141                            continue;
5142                        }
5143                    }
5144                    let mut record = UnifiedRecord::new();
5145                    record.set("key", Value::text(key));
5146                    record.set("value", Value::text("***"));
5147                    record.set("status", Value::text("active"));
5148                    result.push(record);
5149                }
5150                Ok(RuntimeQueryResult {
5151                    query: query.to_string(),
5152                    mode,
5153                    statement: "show_secrets",
5154                    engine: "runtime-secret",
5155                    result,
5156                    affected_rows: 0,
5157                    statement_type: "select",
5158                })
5159            }
5160            // SHOW CONFIG [prefix]
5161            QueryExpr::ShowConfig { ref prefix } => {
5162                let store = self.inner.db.store();
5163                let all_collections = store.list_collections();
5164                if !all_collections.contains(&"red_config".to_string()) {
5165                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5166                    return Ok(RuntimeQueryResult {
5167                        query: query.to_string(),
5168                        mode,
5169                        statement: "show_config",
5170                        engine: "runtime-config",
5171                        result,
5172                        affected_rows: 0,
5173                        statement_type: "select",
5174                    });
5175                }
5176                let manager = store
5177                    .get_collection("red_config")
5178                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
5179                let entities = manager.query_all(|_| true);
5180                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
5181                for entity in entities {
5182                    if let EntityData::Row(ref row) = entity.data {
5183                        if let Some(ref named) = row.named {
5184                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
5185                            let val = named.get("value").cloned().unwrap_or(Value::Null);
5186                            let key_str = match &key_val {
5187                                Value::Text(s) => s.as_ref(),
5188                                _ => continue,
5189                            };
5190                            if let Some(ref pfx) = prefix {
5191                                if !key_str.starts_with(pfx.as_str()) {
5192                                    continue;
5193                                }
5194                            }
5195                            let entity_id = entity.id.raw();
5196                            match latest.get(key_str) {
5197                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
5198                                _ => {
5199                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
5200                                }
5201                            }
5202                        }
5203                    }
5204                }
5205                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5206                for (_, key_val, val) in latest.into_values() {
5207                    let mut record = UnifiedRecord::new();
5208                    record.set("key", key_val);
5209                    record.set("value", val);
5210                    result.push(record);
5211                }
5212                Ok(RuntimeQueryResult {
5213                    query: query.to_string(),
5214                    mode,
5215                    statement: "show_config",
5216                    engine: "runtime-config",
5217                    result,
5218                    affected_rows: 0,
5219                    statement_type: "select",
5220                })
5221            }
5222            // Session-local multi-tenancy handle (Phase 2.5.3).
5223            //
5224            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
5225            // the thread-local; SHOW TENANT returns it. Paired with the
5226            // CURRENT_TENANT() scalar for use in RLS policies.
5227            QueryExpr::SetTenant(ref value) => {
5228                match value {
5229                    Some(id) => set_current_tenant(id.clone()),
5230                    None => clear_current_tenant(),
5231                }
5232                Ok(RuntimeQueryResult::ok_message(
5233                    query.to_string(),
5234                    &match value {
5235                        Some(id) => format!("tenant set: {id}"),
5236                        None => "tenant cleared".to_string(),
5237                    },
5238                    "set_tenant",
5239                ))
5240            }
5241            QueryExpr::ShowTenant => {
5242                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
5243                let mut record = UnifiedRecord::new();
5244                record.set(
5245                    "tenant",
5246                    current_tenant().map(Value::text).unwrap_or(Value::Null),
5247                );
5248                result.push(record);
5249                Ok(RuntimeQueryResult {
5250                    query: query.to_string(),
5251                    mode,
5252                    statement: "show_tenant",
5253                    engine: "runtime-tenant",
5254                    result,
5255                    affected_rows: 0,
5256                    statement_type: "select",
5257                })
5258            }
5259            // Transaction control (Phase 2.3 PG parity).
5260            //
5261            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
5262            // the current connection's id. COMMIT/ROLLBACK release it through
5263            // the `SnapshotManager` so future snapshots see the correct set of
5264            // active/aborted transactions.
5265            //
5266            // Tuple stamping (xmin/xmax) and read-path visibility filtering
5267            // land in Phase 2.3.2 — this dispatch only manages the snapshot
5268            // registry. Statements running outside a TxnContext still behave
5269            // as autocommit (xid=0 → visible to every snapshot).
5270            QueryExpr::TransactionControl(ref ctl) => {
5271                use crate::storage::query::ast::TxnControl;
5272                use crate::storage::transaction::snapshot::{TxnContext, Xid};
5273                use crate::storage::transaction::IsolationLevel;
5274
5275                // Phase 2.3 keys transactions by a thread-local connection id.
5276                // The stdio/gRPC paths wire a real per-connection id later;
5277                // for embedded use (one RedDBRuntime per process-ish caller)
5278                // we fall back to a deterministic placeholder.
5279                let conn_id = current_connection_id();
5280
5281                let (kind, msg) = match ctl {
5282                    TxnControl::Begin => {
5283                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5284                        let xid = mgr.begin();
5285                        let snapshot = mgr.snapshot(xid);
5286                        let ctx = TxnContext {
5287                            xid,
5288                            isolation: IsolationLevel::SnapshotIsolation,
5289                            snapshot,
5290                            savepoints: Vec::new(),
5291                            released_sub_xids: Vec::new(),
5292                        };
5293                        self.inner.tx_contexts.write().insert(conn_id, ctx);
5294                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
5295                    }
5296                    TxnControl::Commit => {
5297                        // SET LOCAL TENANT ends with the transaction.
5298                        self.inner.tx_local_tenants.write().remove(&conn_id);
5299                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5300                        match ctx {
5301                            Some(ctx) => {
5302                                let mut own_xids = std::collections::HashSet::new();
5303                                own_xids.insert(ctx.xid);
5304                                for (_, sub) in &ctx.savepoints {
5305                                    own_xids.insert(*sub);
5306                                }
5307                                for sub in &ctx.released_sub_xids {
5308                                    own_xids.insert(*sub);
5309                                }
5310                                if let Err(err) = self.check_table_row_write_conflicts(
5311                                    conn_id,
5312                                    &ctx.snapshot,
5313                                    &own_xids,
5314                                ) {
5315                                    for (_, sub) in &ctx.savepoints {
5316                                        self.inner.snapshot_manager.rollback(*sub);
5317                                    }
5318                                    for sub in &ctx.released_sub_xids {
5319                                        self.inner.snapshot_manager.rollback(*sub);
5320                                    }
5321                                    self.inner.snapshot_manager.rollback(ctx.xid);
5322                                    self.revive_pending_versioned_updates(conn_id);
5323                                    self.revive_pending_tombstones(conn_id);
5324                                    self.discard_pending_kv_watch_events(conn_id);
5325                                    self.discard_pending_store_wal_actions(conn_id);
5326                                    return Err(err);
5327                                }
5328                                self.restore_pending_write_stamps(conn_id);
5329                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
5330                                    for (_, sub) in &ctx.savepoints {
5331                                        self.inner.snapshot_manager.rollback(*sub);
5332                                    }
5333                                    for sub in &ctx.released_sub_xids {
5334                                        self.inner.snapshot_manager.rollback(*sub);
5335                                    }
5336                                    self.inner.snapshot_manager.rollback(ctx.xid);
5337                                    self.revive_pending_versioned_updates(conn_id);
5338                                    self.revive_pending_tombstones(conn_id);
5339                                    self.discard_pending_kv_watch_events(conn_id);
5340                                    return Err(err);
5341                                }
5342                                // Phase 2.3.2e: commit every open sub-xid
5343                                // so they also become visible. Their
5344                                // work is promoted to the parent txn's
5345                                // result exactly like a RELEASE would
5346                                // have done.
5347                                for (_, sub) in &ctx.savepoints {
5348                                    self.inner.snapshot_manager.commit(*sub);
5349                                }
5350                                for sub in &ctx.released_sub_xids {
5351                                    self.inner.snapshot_manager.commit(*sub);
5352                                }
5353                                self.inner.snapshot_manager.commit(ctx.xid);
5354                                self.finalize_pending_versioned_updates(conn_id);
5355                                self.finalize_pending_tombstones(conn_id);
5356                                self.finalize_pending_kv_watch_events(conn_id);
5357                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
5358                            }
5359                            None => (
5360                                "commit",
5361                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
5362                            ),
5363                        }
5364                    }
5365                    TxnControl::Rollback => {
5366                        self.inner.tx_local_tenants.write().remove(&conn_id);
5367                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5368                        match ctx {
5369                            Some(ctx) => {
5370                                // Phase 2.3.2e: abort every open sub-xid
5371                                // too so their writes stay hidden.
5372                                for (_, sub) in &ctx.savepoints {
5373                                    self.inner.snapshot_manager.rollback(*sub);
5374                                }
5375                                for sub in &ctx.released_sub_xids {
5376                                    self.inner.snapshot_manager.rollback(*sub);
5377                                }
5378                                self.inner.snapshot_manager.rollback(ctx.xid);
5379                                // Phase 2.3.2b: tuples that the txn had
5380                                // xmax-stamped become live again — wipe xmax
5381                                // back to 0 so later snapshots see them.
5382                                self.revive_pending_versioned_updates(conn_id);
5383                                self.revive_pending_tombstones(conn_id);
5384                                self.discard_pending_kv_watch_events(conn_id);
5385                                self.discard_pending_store_wal_actions(conn_id);
5386                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
5387                            }
5388                            None => (
5389                                "rollback",
5390                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
5391                            ),
5392                        }
5393                    }
5394                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
5395                    // SAVEPOINT allocates a fresh xid and pushes it
5396                    // onto the per-txn stack so subsequent writes can
5397                    // be selectively rolled back. RELEASE pops without
5398                    // aborting; ROLLBACK TO aborts the sub-xid (and
5399                    // any nested ones) + revives their tombstones.
5400                    TxnControl::Savepoint(name) => {
5401                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5402                        let mut guard = self.inner.tx_contexts.write();
5403                        match guard.get_mut(&conn_id) {
5404                            Some(ctx) => {
5405                                let sub = mgr.begin();
5406                                ctx.savepoints.push((name.clone(), sub));
5407                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
5408                            }
5409                            None => (
5410                                "savepoint",
5411                                "SAVEPOINT outside transaction — no-op".to_string(),
5412                            ),
5413                        }
5414                    }
5415                    TxnControl::ReleaseSavepoint(name) => {
5416                        let mut guard = self.inner.tx_contexts.write();
5417                        match guard.get_mut(&conn_id) {
5418                            Some(ctx) => {
5419                                let pos = ctx
5420                                    .savepoints
5421                                    .iter()
5422                                    .position(|(n, _)| n == name)
5423                                    .ok_or_else(|| {
5424                                        RedDBError::Internal(format!(
5425                                            "savepoint {name} does not exist"
5426                                        ))
5427                                    })?;
5428                                // RELEASE pops the named savepoint and
5429                                // any nested ones. Their sub-xids move
5430                                // to `released_sub_xids` so they commit
5431                                // (or roll back) alongside the parent
5432                                // xid — PG semantics: released
5433                                // savepoints still contribute their
5434                                // work, but their names are gone.
5435                                let released = ctx.savepoints.len() - pos;
5436                                let popped: Vec<Xid> = ctx
5437                                    .savepoints
5438                                    .split_off(pos)
5439                                    .into_iter()
5440                                    .map(|(_, x)| x)
5441                                    .collect();
5442                                ctx.released_sub_xids.extend(popped);
5443                                (
5444                                    "release_savepoint",
5445                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
5446                                )
5447                            }
5448                            None => (
5449                                "release_savepoint",
5450                                "RELEASE outside transaction — no-op".to_string(),
5451                            ),
5452                        }
5453                    }
5454                    TxnControl::RollbackToSavepoint(name) => {
5455                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5456                        // Splice out the savepoint + nested ones under
5457                        // a narrow lock, then run the snapshot-manager
5458                        // + tombstone side-effects without the tx map
5459                        // held so nothing re-enters.
5460                        let drop_result: Option<(Xid, Vec<Xid>)> = {
5461                            let mut guard = self.inner.tx_contexts.write();
5462                            if let Some(ctx) = guard.get_mut(&conn_id) {
5463                                let pos = ctx
5464                                    .savepoints
5465                                    .iter()
5466                                    .position(|(n, _)| n == name)
5467                                    .ok_or_else(|| {
5468                                        RedDBError::Internal(format!(
5469                                            "savepoint {name} does not exist"
5470                                        ))
5471                                    })?;
5472                                let savepoint_xid = ctx.savepoints[pos].1;
5473                                let aborted: Vec<Xid> = ctx
5474                                    .savepoints
5475                                    .split_off(pos)
5476                                    .into_iter()
5477                                    .map(|(_, x)| x)
5478                                    .collect();
5479                                Some((savepoint_xid, aborted))
5480                            } else {
5481                                None
5482                            }
5483                        };
5484
5485                        match drop_result {
5486                            Some((savepoint_xid, aborted)) => {
5487                                for x in &aborted {
5488                                    mgr.rollback(*x);
5489                                }
5490                                let reverted_updates =
5491                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
5492                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
5493                                (
5494                                    "rollback_to_savepoint",
5495                                    format!(
5496                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
5497                                        aborted.len(),
5498                                    ),
5499                                )
5500                            }
5501                            None => (
5502                                "rollback_to_savepoint",
5503                                "ROLLBACK TO outside transaction — no-op".to_string(),
5504                            ),
5505                        }
5506                    }
5507                };
5508                Ok(RuntimeQueryResult::ok_message(
5509                    query.to_string(),
5510                    &msg,
5511                    kind,
5512                ))
5513            }
5514            // Schema + Sequence DDL (Phase 1.3 PG parity).
5515            //
5516            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5517            // just registers the name in `red_config` under `schema.{name}`.
5518            // Table lookups still happen by collection name; clients using
5519            // `schema.table` qualified names collapse to collection `schema.table`.
5520            //
5521            // Sequences persist a 64-bit counter + metadata (start, increment)
5522            // in `red_config` under `sequence.{name}.*`. Scalar callers
5523            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
5524            // once we have a proper mutating-function dispatch path; for now the
5525            // DDL just establishes the catalog entry so clients don't error.
5526            QueryExpr::CreateSchema(ref q) => {
5527                let store = self.inner.db.store();
5528                let key = format!("schema.{}", q.name);
5529                if store.get_config(&key).is_some() {
5530                    if q.if_not_exists {
5531                        return Ok(RuntimeQueryResult::ok_message(
5532                            query.to_string(),
5533                            &format!("schema {} already exists — skipped", q.name),
5534                            "create_schema",
5535                        ));
5536                    }
5537                    return Err(RedDBError::Internal(format!(
5538                        "schema {} already exists",
5539                        q.name
5540                    )));
5541                }
5542                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5543                Ok(RuntimeQueryResult::ok_message(
5544                    query.to_string(),
5545                    &format!("schema {} created", q.name),
5546                    "create_schema",
5547                ))
5548            }
5549            QueryExpr::DropSchema(ref q) => {
5550                let store = self.inner.db.store();
5551                let key = format!("schema.{}", q.name);
5552                let existed = store.get_config(&key).is_some();
5553                if !existed && !q.if_exists {
5554                    return Err(RedDBError::Internal(format!(
5555                        "schema {} does not exist",
5556                        q.name
5557                    )));
5558                }
5559                // Remove marker from red_config via set to null.
5560                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5561                let suffix = if q.cascade {
5562                    " (CASCADE accepted — tables untouched)"
5563                } else {
5564                    ""
5565                };
5566                Ok(RuntimeQueryResult::ok_message(
5567                    query.to_string(),
5568                    &format!("schema {} dropped{}", q.name, suffix),
5569                    "drop_schema",
5570                ))
5571            }
5572            QueryExpr::CreateSequence(ref q) => {
5573                let store = self.inner.db.store();
5574                let base = format!("sequence.{}", q.name);
5575                let start_key = format!("{base}.start");
5576                let incr_key = format!("{base}.increment");
5577                let curr_key = format!("{base}.current");
5578                if store.get_config(&start_key).is_some() {
5579                    if q.if_not_exists {
5580                        return Ok(RuntimeQueryResult::ok_message(
5581                            query.to_string(),
5582                            &format!("sequence {} already exists — skipped", q.name),
5583                            "create_sequence",
5584                        ));
5585                    }
5586                    return Err(RedDBError::Internal(format!(
5587                        "sequence {} already exists",
5588                        q.name
5589                    )));
5590                }
5591                // Persist start + increment, and set current so the first
5592                // nextval returns `start`.
5593                let initial_current = q.start - q.increment;
5594                store.set_config_tree(
5595                    &start_key,
5596                    &crate::serde_json::Value::Number(q.start as f64),
5597                );
5598                store.set_config_tree(
5599                    &incr_key,
5600                    &crate::serde_json::Value::Number(q.increment as f64),
5601                );
5602                store.set_config_tree(
5603                    &curr_key,
5604                    &crate::serde_json::Value::Number(initial_current as f64),
5605                );
5606                Ok(RuntimeQueryResult::ok_message(
5607                    query.to_string(),
5608                    &format!(
5609                        "sequence {} created (start={}, increment={})",
5610                        q.name, q.start, q.increment
5611                    ),
5612                    "create_sequence",
5613                ))
5614            }
5615            QueryExpr::DropSequence(ref q) => {
5616                let store = self.inner.db.store();
5617                let base = format!("sequence.{}", q.name);
5618                let existed = store.get_config(&format!("{base}.start")).is_some();
5619                if !existed && !q.if_exists {
5620                    return Err(RedDBError::Internal(format!(
5621                        "sequence {} does not exist",
5622                        q.name
5623                    )));
5624                }
5625                for k in ["start", "increment", "current"] {
5626                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5627                }
5628                Ok(RuntimeQueryResult::ok_message(
5629                    query.to_string(),
5630                    &format!("sequence {} dropped", q.name),
5631                    "drop_sequence",
5632                ))
5633            }
5634            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5635            //
5636            // The view definition is stored in-memory on RuntimeInner (not
5637            // persisted). SELECTs that reference the view name will substitute
5638            // the stored `QueryExpr` via `resolve_view_reference` during
5639            // planning (same entry point used by table-name resolution).
5640            //
5641            // Materialized views additionally allocate a slot in
5642            // `MaterializedViewCache`; a REFRESH repopulates that slot.
5643            QueryExpr::CreateView(ref q) => {
5644                let mut views = self.inner.views.write();
5645                if views.contains_key(&q.name) && !q.or_replace {
5646                    if q.if_not_exists {
5647                        return Ok(RuntimeQueryResult::ok_message(
5648                            query.to_string(),
5649                            &format!("view {} already exists — skipped", q.name),
5650                            "create_view",
5651                        ));
5652                    }
5653                    return Err(RedDBError::Internal(format!(
5654                        "view {} already exists",
5655                        q.name
5656                    )));
5657                }
5658                views.insert(q.name.clone(), Arc::new(q.clone()));
5659                drop(views);
5660
5661                // Materialized view: register cache slot (data is empty until REFRESH).
5662                if q.materialized {
5663                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5664                    let refresh = match q.refresh_every_ms {
5665                        Some(ms) => {
5666                            RefreshPolicy::Periodic(std::time::Duration::from_millis(ms))
5667                        }
5668                        None => RefreshPolicy::Manual,
5669                    };
5670                    let def = MaterializedViewDef {
5671                        name: q.name.clone(),
5672                        query: format!("<parsed view {}>", q.name),
5673                        dependencies: collect_table_refs(&q.query),
5674                        refresh,
5675                        retention_duration_ms: q.retention_duration_ms,
5676                    };
5677                    self.inner.materialized_views.write().register(def);
5678                }
5679                // Plan cache may have cached a plan that didn't know about this
5680                // view — invalidate so future references pick up the new binding.
5681                // Result cache gets flushed too: OR REPLACE must not serve a
5682                // prior execution of the obsolete body.
5683                self.invalidate_plan_cache();
5684                self.invalidate_result_cache();
5685
5686                Ok(RuntimeQueryResult::ok_message(
5687                    query.to_string(),
5688                    &format!(
5689                        "{}view {} created",
5690                        if q.materialized { "materialized " } else { "" },
5691                        q.name
5692                    ),
5693                    "create_view",
5694                ))
5695            }
5696            QueryExpr::DropView(ref q) => {
5697                let mut views = self.inner.views.write();
5698                let existed = views.remove(&q.name).is_some();
5699                drop(views);
5700                if q.materialized || existed {
5701                    // Try the materialised cache too — silent if absent.
5702                    self.inner.materialized_views.write().remove(&q.name);
5703                }
5704                // Drop any plan / result cache entries that baked the
5705                // view body into their QueryExpr.
5706                self.invalidate_plan_cache();
5707                self.invalidate_result_cache();
5708                if !existed && !q.if_exists {
5709                    return Err(RedDBError::Internal(format!(
5710                        "view {} does not exist",
5711                        q.name
5712                    )));
5713                }
5714                self.invalidate_plan_cache();
5715                Ok(RuntimeQueryResult::ok_message(
5716                    query.to_string(),
5717                    &format!("view {} dropped", q.name),
5718                    "drop_view",
5719                ))
5720            }
5721            QueryExpr::RefreshMaterializedView(ref q) => {
5722                // Look up the view definition, execute its underlying query,
5723                // and stash the serialized result in the materialised cache.
5724                let view = {
5725                    let views = self.inner.views.read();
5726                    views.get(&q.name).cloned()
5727                };
5728                let view = match view {
5729                    Some(v) => v,
5730                    None => {
5731                        return Err(RedDBError::Internal(format!(
5732                            "view {} does not exist",
5733                            q.name
5734                        )))
5735                    }
5736                };
5737                if !view.materialized {
5738                    return Err(RedDBError::Internal(format!(
5739                        "view {} is not materialized — REFRESH requires \
5740                         CREATE MATERIALIZED VIEW",
5741                        q.name
5742                    )));
5743                }
5744                // Execute the underlying query fresh.
5745                let started = std::time::Instant::now();
5746                let now_ms = std::time::SystemTime::now()
5747                    .duration_since(std::time::UNIX_EPOCH)
5748                    .map(|d| d.as_millis() as u64)
5749                    .unwrap_or(0);
5750                match self.execute_query_expr((*view.query).clone()) {
5751                    Ok(inner_result) => {
5752                        let duration_ms = started.elapsed().as_millis() as u64;
5753                        let row_count = inner_result.result.records.len() as u64;
5754                        let serialized = format!("{:?}", inner_result.result);
5755                        self.inner
5756                            .materialized_views
5757                            .write()
5758                            .record_refresh_success(
5759                                &q.name,
5760                                serialized.into_bytes(),
5761                                row_count,
5762                                duration_ms,
5763                                now_ms,
5764                            );
5765                        Ok(RuntimeQueryResult::ok_message(
5766                            query.to_string(),
5767                            &format!("materialized view {} refreshed", q.name),
5768                            "refresh_materialized_view",
5769                        ))
5770                    }
5771                    Err(err) => {
5772                        let duration_ms = started.elapsed().as_millis() as u64;
5773                        let msg = err.to_string();
5774                        self.inner
5775                            .materialized_views
5776                            .write()
5777                            .record_refresh_failure(
5778                                &q.name,
5779                                msg.clone(),
5780                                duration_ms,
5781                                now_ms,
5782                            );
5783                        Err(err)
5784                    }
5785                }
5786            }
5787            // Row Level Security (Phase 2.5 PG parity).
5788            //
5789            // Policies live in an in-memory registry keyed by (table, name).
5790            // Enforcement (AND-ing the policy's USING clause into every
5791            // query's WHERE for the table) arrives in Phase 2.5.2 via the
5792            // filter compiler; this dispatch only manages the catalog.
5793            QueryExpr::CreatePolicy(ref q) => {
5794                let key = (q.table.clone(), q.name.clone());
5795                self.inner
5796                    .rls_policies
5797                    .write()
5798                    .insert(key, Arc::new(q.clone()));
5799                self.invalidate_plan_cache();
5800                // Issue #120 — surface policy names in the
5801                // schema-vocabulary so AskPipeline (#121) can resolve
5802                // a policy reference back to its table.
5803                self.schema_vocabulary_apply(
5804                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
5805                        collection: q.table.clone(),
5806                        policy: q.name.clone(),
5807                    },
5808                );
5809                Ok(RuntimeQueryResult::ok_message(
5810                    query.to_string(),
5811                    &format!("policy {} on {} created", q.name, q.table),
5812                    "create_policy",
5813                ))
5814            }
5815            QueryExpr::DropPolicy(ref q) => {
5816                let removed = self
5817                    .inner
5818                    .rls_policies
5819                    .write()
5820                    .remove(&(q.table.clone(), q.name.clone()))
5821                    .is_some();
5822                if !removed && !q.if_exists {
5823                    return Err(RedDBError::Internal(format!(
5824                        "policy {} on {} does not exist",
5825                        q.name, q.table
5826                    )));
5827                }
5828                self.invalidate_plan_cache();
5829                // Issue #120 — keep the schema-vocabulary policy
5830                // entry in sync.
5831                self.schema_vocabulary_apply(
5832                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
5833                        collection: q.table.clone(),
5834                        policy: q.name.clone(),
5835                    },
5836                );
5837                Ok(RuntimeQueryResult::ok_message(
5838                    query.to_string(),
5839                    &format!("policy {} on {} dropped", q.name, q.table),
5840                    "drop_policy",
5841                ))
5842            }
5843            // Foreign Data Wrappers (Phase 3.2 PG parity).
5844            //
5845            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
5846            // `ForeignTableRegistry`. The read path consults that registry
5847            // before dispatching a SELECT — when the table name matches a
5848            // registered foreign table, we forward the scan to the wrapper
5849            // and skip the normal collection lookup.
5850            //
5851            // Phase 3.2 is in-memory only; persistence across restarts is a
5852            // 3.2.2 follow-up that mirrors the view registry pattern.
5853            QueryExpr::CreateServer(ref q) => {
5854                use crate::storage::fdw::FdwOptions;
5855                let registry = Arc::clone(&self.inner.foreign_tables);
5856                if registry.server(&q.name).is_some() {
5857                    if q.if_not_exists {
5858                        return Ok(RuntimeQueryResult::ok_message(
5859                            query.to_string(),
5860                            &format!("server {} already exists — skipped", q.name),
5861                            "create_server",
5862                        ));
5863                    }
5864                    return Err(RedDBError::Internal(format!(
5865                        "server {} already exists",
5866                        q.name
5867                    )));
5868                }
5869                let mut opts = FdwOptions::new();
5870                for (k, v) in &q.options {
5871                    opts.values.insert(k.clone(), v.clone());
5872                }
5873                registry
5874                    .create_server(&q.name, &q.wrapper, opts)
5875                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5876                Ok(RuntimeQueryResult::ok_message(
5877                    query.to_string(),
5878                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
5879                    "create_server",
5880                ))
5881            }
5882            QueryExpr::DropServer(ref q) => {
5883                let existed = self.inner.foreign_tables.drop_server(&q.name);
5884                if !existed && !q.if_exists {
5885                    return Err(RedDBError::Internal(format!(
5886                        "server {} does not exist",
5887                        q.name
5888                    )));
5889                }
5890                Ok(RuntimeQueryResult::ok_message(
5891                    query.to_string(),
5892                    &format!(
5893                        "server {} dropped{}",
5894                        q.name,
5895                        if q.cascade { " (cascade)" } else { "" }
5896                    ),
5897                    "drop_server",
5898                ))
5899            }
5900            QueryExpr::CreateForeignTable(ref q) => {
5901                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
5902                let registry = Arc::clone(&self.inner.foreign_tables);
5903                if registry.foreign_table(&q.name).is_some() {
5904                    if q.if_not_exists {
5905                        return Ok(RuntimeQueryResult::ok_message(
5906                            query.to_string(),
5907                            &format!("foreign table {} already exists — skipped", q.name),
5908                            "create_foreign_table",
5909                        ));
5910                    }
5911                    return Err(RedDBError::Internal(format!(
5912                        "foreign table {} already exists",
5913                        q.name
5914                    )));
5915                }
5916                let mut opts = FdwOptions::new();
5917                for (k, v) in &q.options {
5918                    opts.values.insert(k.clone(), v.clone());
5919                }
5920                let columns: Vec<ForeignColumn> = q
5921                    .columns
5922                    .iter()
5923                    .map(|c| ForeignColumn {
5924                        name: c.name.clone(),
5925                        data_type: c.data_type.clone(),
5926                        not_null: c.not_null,
5927                    })
5928                    .collect();
5929                registry
5930                    .create_foreign_table(ForeignTable {
5931                        name: q.name.clone(),
5932                        server_name: q.server.clone(),
5933                        columns,
5934                        options: opts,
5935                    })
5936                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5937                self.invalidate_plan_cache();
5938                Ok(RuntimeQueryResult::ok_message(
5939                    query.to_string(),
5940                    &format!("foreign table {} created (server {})", q.name, q.server),
5941                    "create_foreign_table",
5942                ))
5943            }
5944            QueryExpr::DropForeignTable(ref q) => {
5945                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
5946                if !existed && !q.if_exists {
5947                    return Err(RedDBError::Internal(format!(
5948                        "foreign table {} does not exist",
5949                        q.name
5950                    )));
5951                }
5952                self.invalidate_plan_cache();
5953                Ok(RuntimeQueryResult::ok_message(
5954                    query.to_string(),
5955                    &format!("foreign table {} dropped", q.name),
5956                    "drop_foreign_table",
5957                ))
5958            }
5959            // COPY table FROM 'path' (Phase 1.5 PG parity).
5960            //
5961            // Stream CSV rows through the shared `CsvImporter`. The collection
5962            // is auto-created on first insert (via `insert_auto`-style path);
5963            // VACUUM/ANALYZE afterwards is up to the caller.
5964            QueryExpr::CopyFrom(ref q) => {
5965                use crate::storage::import::{CsvConfig, CsvImporter};
5966                let store = self.inner.db.store();
5967                let cfg = CsvConfig {
5968                    collection: q.table.clone(),
5969                    has_header: q.has_header,
5970                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
5971                    ..CsvConfig::default()
5972                };
5973                let importer = CsvImporter::new(cfg);
5974                let stats = importer
5975                    .import_file(&q.path, store.as_ref())
5976                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
5977                // Tables are written → invalidate cached plans / result cache.
5978                self.note_table_write(&q.table);
5979                Ok(RuntimeQueryResult::ok_message(
5980                    query.to_string(),
5981                    &format!(
5982                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
5983                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
5984                    ),
5985                    "copy_from",
5986                ))
5987            }
5988            // Maintenance commands (Phase 1.2 PG parity).
5989            //
5990            // - VACUUM [FULL] [table]: refreshes planner stats for the target
5991            //   collection(s) and — when FULL — triggers a full pager persist
5992            //   (flushes dirty pages + fsync). Also invalidates the result cache
5993            //   so subsequent reads re-execute against the freshly compacted
5994            //   storage. RedDB's segment/btree GC runs continuously via the
5995            //   background lifecycle; explicit space reclamation for sealed
5996            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
5997            // - ANALYZE [table]: reruns `analyze_collection` +
5998            //   `persist_table_stats` via `refresh_table_planner_stats` so the
5999            //   planner has fresh histograms, distinct estimates, null counts.
6000            //
6001            // Both commands accept an optional target; omitting the target
6002            // iterates every collection in the store.
6003            QueryExpr::MaintenanceCommand(ref cmd) => {
6004                use crate::storage::query::ast::MaintenanceCommand as Mc;
6005                let store = self.inner.db.store();
6006                let (kind, msg) = match cmd {
6007                    Mc::Analyze { target } => {
6008                        let targets: Vec<String> = match target {
6009                            Some(t) => vec![t.clone()],
6010                            None => store.list_collections(),
6011                        };
6012                        for t in &targets {
6013                            self.refresh_table_planner_stats(t);
6014                        }
6015                        (
6016                            "analyze",
6017                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
6018                        )
6019                    }
6020                    Mc::Vacuum { target, full } => {
6021                        let targets: Vec<String> = match target {
6022                            Some(t) => vec![t.clone()],
6023                            None => store.list_collections(),
6024                        };
6025                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
6026                        let mut vacuum_stats =
6027                            crate::storage::unified::store::MvccVacuumStats::default();
6028                        for t in &targets {
6029                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
6030                                RedDBError::Internal(format!(
6031                                    "VACUUM MVCC history failed for {t}: {e}"
6032                                ))
6033                            })?;
6034                            if stats.reclaimed_versions > 0 {
6035                                self.rebuild_runtime_indexes_for_table(t)?;
6036                            }
6037                            vacuum_stats.add(&stats);
6038                        }
6039                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
6040                        // Stats refresh covers every target (same as ANALYZE).
6041                        for t in &targets {
6042                            self.refresh_table_planner_stats(t);
6043                        }
6044                        // FULL forces a pager persist (dirty-page flush + fsync).
6045                        // Regular VACUUM relies on the background writer / segment
6046                        // lifecycle so the command is non-blocking.
6047                        let persisted = if *full {
6048                            match store.persist() {
6049                                Ok(()) => true,
6050                                Err(e) => {
6051                                    return Err(RedDBError::Internal(format!(
6052                                        "VACUUM FULL persist failed: {e:?}"
6053                                    )));
6054                                }
6055                            }
6056                        } else {
6057                            false
6058                        };
6059                        // Result cache depended on pre-vacuum state.
6060                        self.invalidate_result_cache();
6061                        (
6062                            "vacuum",
6063                            format!(
6064                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
6065                                if *full { " FULL" } else { "" },
6066                                targets.len(),
6067                                vacuum_stats.scanned_versions,
6068                                vacuum_stats.retained_versions,
6069                                vacuum_stats.reclaimed_versions,
6070                                vacuum_stats.retained_history_versions,
6071                                vacuum_stats.reclaimed_history_versions,
6072                                vacuum_stats.retained_tombstones,
6073                                vacuum_stats.reclaimed_tombstones,
6074                                if persisted {
6075                                    " (pages flushed to disk)"
6076                                } else {
6077                                    ""
6078                                }
6079                            ),
6080                        )
6081                    }
6082                };
6083                Ok(RuntimeQueryResult::ok_message(
6084                    query.to_string(),
6085                    &msg,
6086                    kind,
6087                ))
6088            }
6089            // GRANT / REVOKE / ALTER USER (RBAC milestone).
6090            //
6091            // These hit the AuthStore directly. The privilege-check
6092            // gate at the top of `execute_query_expr` already decided
6093            // whether the caller may even run the statement; here we
6094            // just translate the AST into AuthStore calls.
6095            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
6096            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
6097            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
6098            QueryExpr::CreateIamPolicy { ref id, ref json } => {
6099                self.execute_create_iam_policy(query, id, json)
6100            }
6101            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
6102            QueryExpr::AttachPolicy {
6103                ref policy_id,
6104                ref principal,
6105            } => self.execute_attach_policy(query, policy_id, principal),
6106            QueryExpr::DetachPolicy {
6107                ref policy_id,
6108                ref principal,
6109            } => self.execute_detach_policy(query, policy_id, principal),
6110            QueryExpr::ShowPolicies { ref filter } => {
6111                self.execute_show_policies(query, filter.as_ref())
6112            }
6113            QueryExpr::ShowEffectivePermissions {
6114                ref user,
6115                ref resource,
6116            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
6117            QueryExpr::SimulatePolicy {
6118                ref user,
6119                ref action,
6120                ref resource,
6121            } => self.execute_simulate_policy(query, user, action, resource),
6122            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
6123            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
6124            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
6125            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
6126        };
6127
6128        // Decrypt Value::Secret columns in-place before caching, so
6129        // cached results match the post-decrypt shape and repeat
6130        // queries skip the per-row AES-GCM pass.
6131        let mut query_result = query_result;
6132        if let Ok(ref mut result) = query_result {
6133            if result.statement_type == "select" {
6134                self.apply_secret_decryption(result);
6135            }
6136        }
6137
6138        // Cache SELECT results for 30s.
6139        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
6140        // Large multi-row results (range scans, filtered scans) are rarely
6141        // repeated with the same literal values so the cache hit rate is near
6142        // zero while the clone cost (100 records × ~16 fields each) is high.
6143        // Aggregations (1 row) and point lookups (1 row) still benefit.
6144        if let Ok(ref result) = query_result {
6145            frame.write_result_cache(self, result, result_cache_scopes);
6146        }
6147
6148        query_result
6149    }
6150
6151    /// Snapshot of every registered materialized view's runtime
6152    /// state — feeds the `red.materialized_views` virtual table.
6153    /// Issue #583 slice 10.
6154    pub fn materialized_view_metadata(
6155        &self,
6156    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
6157        self.inner.materialized_views.read().metadata()
6158    }
6159
6160    /// Drive scheduled refreshes for materialized views with a
6161    /// `REFRESH EVERY <duration>` clause. Called from the background
6162    /// scheduler thread (and from unit tests with a fake clock via
6163    /// `claim_due_at`). Each invocation atomically claims the set of
6164    /// due views (so two concurrent ticks never double-fire the same
6165    /// view) and runs each refresh through the standard execution
6166    /// path — failures are captured in `last_error` and the prior
6167    /// content stays intact. Issue #583 slice 10.
6168    /// Snapshot of every tracked retention sweeper state — feeds the
6169    /// three extra columns on `red.retention`. Issue #584 slice 12.
6170    pub(crate) fn retention_sweeper_snapshot(
6171        &self,
6172    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
6173        self.inner.retention_sweeper.read().snapshot()
6174    }
6175
6176    /// Drive one tick of the retention sweeper. Iterates collections
6177    /// with a retention policy set, physically deletes at most
6178    /// `batch_size` expired rows per collection, and records the
6179    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
6180    /// `red.retention` exposes. Called from the background sweeper
6181    /// thread; safe to invoke directly from tests with a small batch
6182    /// size to drain rows deterministically. Issue #584 slice 12.
6183    ///
6184    /// Deletes are issued as `DELETE FROM <collection> WHERE
6185    /// <ts_column> < <cutoff>` through the standard `execute_query`
6186    /// chokepoint so WAL participation and snapshot guards apply
6187    /// exactly as for a user-issued DELETE — replicas replay the
6188    /// sweeper's deletes via the same WAL stream with no special
6189    /// handling on the replication side.
6190    ///
6191    /// Batching is enforced by tightening the cutoff: if more than
6192    /// `batch_size` rows are expired, the cutoff is dropped to the
6193    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
6194    /// matches roughly `batch_size` rows; the remainder is reported
6195    /// as `current_rows_pending_sweep_estimate` and drained on the
6196    /// next tick.
6197    pub fn sweep_retention_tick(&self, batch_size: usize) {
6198        if batch_size == 0 {
6199            return;
6200        }
6201        let now_ms = std::time::SystemTime::now()
6202            .duration_since(std::time::UNIX_EPOCH)
6203            .map(|d| d.as_millis() as u64)
6204            .unwrap_or(0);
6205
6206        let store = self.inner.db.store();
6207        let collections = store.list_collections();
6208        for name in collections {
6209            let Some(contract) = self.inner.db.collection_contract(&name) else {
6210                continue;
6211            };
6212            let Some(retention_ms) = contract.retention_duration_ms else {
6213                continue;
6214            };
6215            let Some(ts_column) =
6216                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
6217            else {
6218                continue;
6219            };
6220            let Some(manager) = store.get_collection(&name) else {
6221                continue;
6222            };
6223            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
6224
6225            // Single pass: collect expired timestamps. We keep the
6226            // full Vec rather than a bounded heap because the partial
6227            // sort below is the simplest correct way to find the
6228            // batch-th oldest; for the slice's "1000-row default
6229            // batch" target this is bounded enough for production
6230            // operation, and the alternative (in-place heap of size
6231            // batch+1) is a follow-up optimisation.
6232            let mut expired_ts: Vec<i64> = Vec::new();
6233            manager.for_each_entity(|entity| {
6234                let ts = match ts_column.as_str() {
6235                    "created_at" => Some(entity.created_at as i64),
6236                    "updated_at" => Some(entity.updated_at as i64),
6237                    other => entity
6238                        .data
6239                        .as_row()
6240                        .and_then(|row| row.get_field(other))
6241                        .and_then(|v| match v {
6242                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
6243                            crate::storage::schema::Value::Timestamp(t) => {
6244                                Some(t.saturating_mul(1_000))
6245                            }
6246                            crate::storage::schema::Value::BigInt(t) => Some(*t),
6247                            crate::storage::schema::Value::UnsignedInteger(t) => {
6248                                i64::try_from(*t).ok()
6249                            }
6250                            crate::storage::schema::Value::Integer(t) => Some(*t as i64),
6251                            _ => None,
6252                        }),
6253                };
6254                if let Some(t) = ts {
6255                    if t < cutoff {
6256                        expired_ts.push(t);
6257                    }
6258                }
6259                true
6260            });
6261
6262            let total_expired = expired_ts.len() as u64;
6263            if total_expired == 0 {
6264                self.inner
6265                    .retention_sweeper
6266                    .write()
6267                    .record_tick(&name, 0, 0, now_ms);
6268                continue;
6269            }
6270
6271            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
6272                (cutoff, 0u64)
6273            } else {
6274                // Tighten the cutoff to the (batch_size)-th oldest
6275                // expired timestamp + 1 so DELETE matches roughly
6276                // `batch_size` rows.
6277                expired_ts.sort_unstable();
6278                let nth = expired_ts[batch_size - 1];
6279                (
6280                    nth.saturating_add(1),
6281                    total_expired.saturating_sub(batch_size as u64),
6282                )
6283            };
6284
6285            let stmt = format!(
6286                "DELETE FROM {} WHERE {} < {}",
6287                name, ts_column, effective_cutoff
6288            );
6289            let deleted = match self.execute_query(&stmt) {
6290                Ok(r) => r.affected_rows,
6291                Err(_) => 0,
6292            };
6293
6294            self.inner
6295                .retention_sweeper
6296                .write()
6297                .record_tick(&name, deleted, pending, now_ms);
6298        }
6299    }
6300
6301    pub fn refresh_due_materialized_views(&self) {
6302        let due = {
6303            let mut cache = self.inner.materialized_views.write();
6304            cache.claim_due_at(std::time::Instant::now())
6305        };
6306        for name in due {
6307            // Round-trip through `execute_query` (rather than the
6308            // prepared-statement `execute_query_expr` fast path, which
6309            // explicitly rejects DDL/maintenance statements). Failures
6310            // are captured inside the RefreshMaterializedView handler
6311            // via `record_refresh_failure`; the scheduler ignores the
6312            // Result so one bad view doesn't halt the loop.
6313            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
6314            let _ = self.execute_query(&stmt);
6315        }
6316    }
6317
6318    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
6319    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
6320    /// calls pay zero parse + cache overhead.
6321    ///
6322    /// Applies secret decryption on SELECT results, identical to `execute_query`.
6323    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
6324        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
6325        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
6326        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
6327        // whose `tq.table` matches a registered view with the view's
6328        // underlying query. Safe to call even when no views are registered.
6329        let expr = self.rewrite_view_refs(expr);
6330
6331        self.validate_model_operations_before_auth(&expr)?;
6332        // Granular RBAC privilege check. Runs before dispatch so a
6333        // denied caller never reaches storage. Fail-closed: any error
6334        // resolving the action / resource produces PermissionDenied.
6335        if let Err(err) = self.check_query_privilege(&expr) {
6336            return Err(RedDBError::Query(format!("permission denied: {err}")));
6337        }
6338
6339        let statement = query_expr_name(&expr);
6340        let mode = detect_mode(statement);
6341        let query_str = statement;
6342
6343        let result = self.dispatch_expr(expr, query_str, mode)?;
6344        let mut r = result;
6345        if r.statement_type == "select" {
6346            self.apply_secret_decryption(&mut r);
6347        }
6348        Ok(r)
6349    }
6350
6351    pub(super) fn validate_model_operations_before_auth(
6352        &self,
6353        expr: &QueryExpr,
6354    ) -> RedDBResult<()> {
6355        use crate::catalog::CollectionModel;
6356        use crate::runtime::ddl::polymorphic_resolver;
6357        use crate::storage::query::ast::KvCommand;
6358
6359        let system_schema_target = match expr {
6360            QueryExpr::DropTable(q) => Some(q.name.as_str()),
6361            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
6362            QueryExpr::DropVector(q) => Some(q.name.as_str()),
6363            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
6364            QueryExpr::DropKv(q) => Some(q.name.as_str()),
6365            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
6366            QueryExpr::Truncate(q) => Some(q.name.as_str()),
6367            _ => None,
6368        };
6369        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
6370            return Err(RedDBError::Query("system schema is read-only".to_string()));
6371        }
6372
6373        let expected = match expr {
6374            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
6375            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
6376            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
6377            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
6378            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
6379            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
6380            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
6381            QueryExpr::KvCommand(cmd) => {
6382                let (collection, model) = match cmd {
6383                    KvCommand::Put {
6384                        collection, model, ..
6385                    }
6386                    | KvCommand::Get {
6387                        collection, model, ..
6388                    }
6389                    | KvCommand::Incr {
6390                        collection, model, ..
6391                    }
6392                    | KvCommand::Cas {
6393                        collection, model, ..
6394                    }
6395                    | KvCommand::Delete {
6396                        collection, model, ..
6397                    } => (collection.as_str(), *model),
6398                    KvCommand::Rotate { collection, .. }
6399                    | KvCommand::History { collection, .. }
6400                    | KvCommand::List { collection, .. }
6401                    | KvCommand::Purge { collection, .. } => {
6402                        (collection.as_str(), CollectionModel::Vault)
6403                    }
6404                    KvCommand::InvalidateTags { collection, .. } => {
6405                        (collection.as_str(), CollectionModel::Kv)
6406                    }
6407                    KvCommand::Watch {
6408                        collection, model, ..
6409                    } => (collection.as_str(), *model),
6410                    KvCommand::Unseal { collection, .. } => {
6411                        (collection.as_str(), CollectionModel::Vault)
6412                    }
6413                };
6414                Some((collection, model))
6415            }
6416            QueryExpr::ConfigCommand(cmd) => {
6417                self.validate_config_command_before_auth(cmd)?;
6418                None
6419            }
6420            _ => None,
6421        };
6422
6423        let Some((name, expected_model)) = expected else {
6424            return Ok(());
6425        };
6426        let snapshot = self.inner.db.catalog_model_snapshot();
6427        let Some(actual_model) = snapshot
6428            .collections
6429            .iter()
6430            .find(|collection| collection.name == name)
6431            .map(|collection| collection.declared_model.unwrap_or(collection.model))
6432        else {
6433            return Ok(());
6434        };
6435        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
6436    }
6437
6438    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
6439    /// `tq.table` matches a registered view name with the view's stored
6440    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
6441    /// resolves correctly. Pure operation — no side effects.
6442    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
6443        // Fast path: no views registered → return original expression.
6444        if self.inner.views.read().is_empty() {
6445            return expr;
6446        }
6447        self.rewrite_view_refs_inner(expr)
6448    }
6449
6450    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
6451        use crate::storage::query::ast::{Filter, TableSource};
6452        match expr {
6453            QueryExpr::Table(mut tq) => {
6454                // 1. If the TableSource is a subquery, recurse into it so
6455                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
6456                //    The legacy `table` field (set to a synthetic
6457                //    "__subq_NNNN" sentinel) stays as-is so callers that
6458                //    read it keep compiling.
6459                if let Some(TableSource::Subquery(body)) = tq.source.take() {
6460                    tq.source = Some(TableSource::Subquery(Box::new(
6461                        self.rewrite_view_refs_inner(*body),
6462                    )));
6463                    return QueryExpr::Table(tq);
6464                }
6465
6466                // 2. Restore the source field (took it above for match).
6467                // When the source was `None` or `TableSource::Name(_)`, the
6468                // real lookup key is `tq.table` — check the view registry.
6469                let maybe_view = {
6470                    let views = self.inner.views.read();
6471                    views.get(&tq.table).cloned()
6472                };
6473                let Some(view) = maybe_view else {
6474                    return QueryExpr::Table(tq);
6475                };
6476
6477                // Recurse into the view body — views may reference other
6478                // views. The recursion yields the final QueryExpr we need
6479                // to merge the outer's filter / limit / offset into.
6480                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
6481
6482                // Phase 5: when the body is a Table we merge the outer
6483                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
6484                // views filter recursively. Non-table bodies (Search,
6485                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
6486                // with an outer Table query today — return the body
6487                // verbatim; outer predicates are lost. Full projection
6488                // merge lands in Phase 5.2.
6489                match inner_expr {
6490                    QueryExpr::Table(mut inner_tq) => {
6491                        if let Some(outer_filter) = tq.filter.take() {
6492                            inner_tq.filter = Some(match inner_tq.filter.take() {
6493                                Some(existing) => {
6494                                    Filter::And(Box::new(existing), Box::new(outer_filter))
6495                                }
6496                                None => outer_filter,
6497                            });
6498                        }
6499                        if let Some(outer_limit) = tq.limit {
6500                            inner_tq.limit = Some(match inner_tq.limit {
6501                                Some(existing) => existing.min(outer_limit),
6502                                None => outer_limit,
6503                            });
6504                        }
6505                        if let Some(outer_offset) = tq.offset {
6506                            inner_tq.offset = Some(match inner_tq.offset {
6507                                Some(existing) => existing + outer_offset,
6508                                None => outer_offset,
6509                            });
6510                        }
6511                        QueryExpr::Table(inner_tq)
6512                    }
6513                    other => other,
6514                }
6515            }
6516            QueryExpr::Join(mut jq) => {
6517                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
6518                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
6519                QueryExpr::Join(jq)
6520            }
6521            // Other variants don't carry nested QueryExpr that can reference
6522            // a view by table name. Return as-is.
6523            other => other,
6524        }
6525    }
6526
6527    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
6528    /// Shared by `execute_query` (after parse/cache) and `execute_query_expr`
6529    /// (direct call from prepared-statement handler).
6530    fn authorize_relational_table_select(
6531        &self,
6532        mut table: TableQuery,
6533        frame: &dyn super::statement_frame::ReadFrame,
6534    ) -> RedDBResult<Option<TableQuery>> {
6535        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6536            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6537            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6538            return Ok(Some(table));
6539        }
6540
6541        self.check_table_column_projection_authz(&table, frame)?;
6542
6543        if self.inner.rls_enabled_tables.read().contains(&table.table) {
6544            return Ok(inject_rls_filters(self, frame, table));
6545        }
6546
6547        Ok(Some(table))
6548    }
6549
6550    fn authorize_relational_join_select(
6551        &self,
6552        mut join: JoinQuery,
6553        frame: &dyn super::statement_frame::ReadFrame,
6554    ) -> RedDBResult<Option<JoinQuery>> {
6555        self.check_join_column_projection_authz(&join, frame)?;
6556        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
6557        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
6558        Ok(inject_rls_into_join(self, frame, join))
6559    }
6560
6561    fn authorize_relational_join_child(
6562        &self,
6563        expr: QueryExpr,
6564        frame: &dyn super::statement_frame::ReadFrame,
6565    ) -> RedDBResult<QueryExpr> {
6566        match expr {
6567            QueryExpr::Table(mut table) => {
6568                if let Some(TableSource::Subquery(inner)) = table.source.take() {
6569                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6570                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6571                }
6572                Ok(QueryExpr::Table(table))
6573            }
6574            QueryExpr::Join(join) => self
6575                .authorize_relational_join_select(join, frame)?
6576                .map(QueryExpr::Join)
6577                .ok_or_else(|| {
6578                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6579                }),
6580            other => Ok(other),
6581        }
6582    }
6583
6584    fn authorize_relational_select_expr(
6585        &self,
6586        expr: QueryExpr,
6587        frame: &dyn super::statement_frame::ReadFrame,
6588    ) -> RedDBResult<QueryExpr> {
6589        match expr {
6590            QueryExpr::Table(table) => self
6591                .authorize_relational_table_select(table, frame)?
6592                .map(QueryExpr::Table)
6593                .ok_or_else(|| {
6594                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6595                }),
6596            QueryExpr::Join(join) => self
6597                .authorize_relational_join_select(join, frame)?
6598                .map(QueryExpr::Join)
6599                .ok_or_else(|| {
6600                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6601                }),
6602            other => Ok(other),
6603        }
6604    }
6605
6606    fn check_table_column_projection_authz(
6607        &self,
6608        table: &TableQuery,
6609        frame: &dyn super::statement_frame::ReadFrame,
6610    ) -> RedDBResult<()> {
6611        let Some((username, role)) = frame.identity() else {
6612            return Ok(());
6613        };
6614        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6615            return Ok(());
6616        };
6617
6618        let columns = self.resolved_table_projection_columns(table)?;
6619        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6620        let principal = UserId::from_parts(frame.effective_scope(), username);
6621        let ctx = runtime_iam_context(role, frame.effective_scope());
6622        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6623        if outcome.allowed() {
6624            return Ok(());
6625        }
6626
6627        if let Some(denied) = outcome.first_denied_column() {
6628            return Err(RedDBError::Query(format!(
6629                "permission denied: principal=`{username}` cannot select column `{}`",
6630                denied.resource.name
6631            )));
6632        }
6633        Err(RedDBError::Query(format!(
6634            "permission denied: principal=`{username}` cannot select table `{}`",
6635            table.table
6636        )))
6637    }
6638
6639    fn check_join_column_projection_authz(
6640        &self,
6641        join: &JoinQuery,
6642        frame: &dyn super::statement_frame::ReadFrame,
6643    ) -> RedDBResult<()> {
6644        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6645        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6646        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6647
6648        for (table, columns) in by_table {
6649            let query = TableQuery {
6650                table,
6651                source: None,
6652                alias: None,
6653                select_items: Vec::new(),
6654                columns: columns.into_iter().map(Projection::Column).collect(),
6655                where_expr: None,
6656                filter: None,
6657                group_by_exprs: Vec::new(),
6658                group_by: Vec::new(),
6659                having_expr: None,
6660                having: None,
6661                order_by: Vec::new(),
6662                limit: None,
6663                limit_param: None,
6664                offset: None,
6665                offset_param: None,
6666                expand: None,
6667                as_of: None,
6668                sessionize: None,
6669            };
6670            self.check_table_column_projection_authz(&query, frame)?;
6671        }
6672        Ok(())
6673    }
6674
6675    fn collect_join_projection_columns(
6676        &self,
6677        join: &JoinQuery,
6678        projections: &[Projection],
6679        out: &mut HashMap<String, BTreeSet<String>>,
6680    ) -> RedDBResult<()> {
6681        let left = table_side_context(join.left.as_ref());
6682        let right = table_side_context(join.right.as_ref());
6683
6684        if projections
6685            .iter()
6686            .any(|projection| matches!(projection, Projection::All))
6687        {
6688            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6689                out.entry(side.table.clone())
6690                    .or_default()
6691                    .extend(self.table_all_projection_columns(&side.table)?);
6692            }
6693            return Ok(());
6694        }
6695
6696        for projection in projections {
6697            collect_projection_columns_for_join_side(
6698                projection,
6699                left.as_ref(),
6700                right.as_ref(),
6701                out,
6702            )?;
6703        }
6704        Ok(())
6705    }
6706
6707    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6708        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6709        if projections
6710            .iter()
6711            .any(|projection| matches!(projection, Projection::All))
6712        {
6713            return self.table_all_projection_columns(&table.table);
6714        }
6715
6716        let mut columns = BTreeSet::new();
6717        for projection in &projections {
6718            collect_projection_columns_for_table(
6719                projection,
6720                &table.table,
6721                table.alias.as_deref(),
6722                &mut columns,
6723            );
6724        }
6725        Ok(columns.into_iter().collect())
6726    }
6727
6728    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
6729        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
6730            let columns: Vec<String> = contract
6731                .declared_columns
6732                .iter()
6733                .map(|column| column.name.clone())
6734                .collect();
6735            if !columns.is_empty() {
6736                return Ok(columns);
6737            }
6738        }
6739
6740        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
6741        Ok(records
6742            .first()
6743            .map(|record| {
6744                record
6745                    .column_names()
6746                    .into_iter()
6747                    .map(|column| column.to_string())
6748                    .collect()
6749            })
6750            .unwrap_or_default())
6751    }
6752
6753    fn resolve_table_expr_subqueries(
6754        &self,
6755        mut table: TableQuery,
6756        frame: &dyn super::statement_frame::ReadFrame,
6757    ) -> RedDBResult<TableQuery> {
6758        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6759            let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
6760            table.source = Some(TableSource::Subquery(Box::new(inner)));
6761        }
6762
6763        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
6764        for item in &mut table.select_items {
6765            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
6766                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6767            }
6768        }
6769        if let Some(where_expr) = table.where_expr.take() {
6770            table.where_expr =
6771                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
6772            table.filter = None;
6773        }
6774        if let Some(having_expr) = table.having_expr.take() {
6775            table.having_expr =
6776                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
6777            table.having = None;
6778        }
6779        for expr in &mut table.group_by_exprs {
6780            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6781        }
6782        for clause in &mut table.order_by {
6783            if let Some(expr) = clause.expr.take() {
6784                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
6785            }
6786        }
6787        Ok(table)
6788    }
6789
6790    fn resolve_select_expr_subqueries(
6791        &self,
6792        expr: QueryExpr,
6793        frame: &dyn super::statement_frame::ReadFrame,
6794    ) -> RedDBResult<QueryExpr> {
6795        match expr {
6796            QueryExpr::Table(table) => self
6797                .resolve_table_expr_subqueries(table, frame)
6798                .map(QueryExpr::Table),
6799            QueryExpr::Join(mut join) => {
6800                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
6801                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
6802                Ok(QueryExpr::Join(join))
6803            }
6804            other => Ok(other),
6805        }
6806    }
6807
6808    fn resolve_expr_subqueries(
6809        &self,
6810        expr: crate::storage::query::ast::Expr,
6811        outer_scopes: &[String],
6812        frame: &dyn super::statement_frame::ReadFrame,
6813    ) -> RedDBResult<crate::storage::query::ast::Expr> {
6814        use crate::storage::query::ast::Expr;
6815
6816        match expr {
6817            Expr::Subquery { query, span } => {
6818                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
6819                if values.len() > 1 {
6820                    return Err(RedDBError::Query(
6821                        "scalar subquery returned more than one row".to_string(),
6822                    ));
6823                }
6824                Ok(Expr::Literal {
6825                    value: values.into_iter().next().unwrap_or(Value::Null),
6826                    span,
6827                })
6828            }
6829            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
6830                op,
6831                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
6832                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
6833                span,
6834            }),
6835            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
6836                op,
6837                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6838                span,
6839            }),
6840            Expr::Cast {
6841                inner,
6842                target,
6843                span,
6844            } => Ok(Expr::Cast {
6845                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
6846                target,
6847                span,
6848            }),
6849            Expr::FunctionCall { name, args, span } => {
6850                let args = args
6851                    .into_iter()
6852                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
6853                    .collect::<RedDBResult<Vec<_>>>()?;
6854                Ok(Expr::FunctionCall { name, args, span })
6855            }
6856            Expr::Case {
6857                branches,
6858                else_,
6859                span,
6860            } => {
6861                let branches = branches
6862                    .into_iter()
6863                    .map(|(cond, value)| {
6864                        Ok((
6865                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
6866                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
6867                        ))
6868                    })
6869                    .collect::<RedDBResult<Vec<_>>>()?;
6870                let else_ = else_
6871                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
6872                    .transpose()?
6873                    .map(Box::new);
6874                Ok(Expr::Case {
6875                    branches,
6876                    else_,
6877                    span,
6878                })
6879            }
6880            Expr::IsNull {
6881                operand,
6882                negated,
6883                span,
6884            } => Ok(Expr::IsNull {
6885                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6886                negated,
6887                span,
6888            }),
6889            Expr::InList {
6890                target,
6891                values,
6892                negated,
6893                span,
6894            } => {
6895                let target =
6896                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
6897                let mut resolved = Vec::new();
6898                for value in values {
6899                    if let Expr::Subquery { query, .. } = value {
6900                        resolved.extend(
6901                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
6902                                .into_iter()
6903                                .map(Expr::lit),
6904                        );
6905                    } else {
6906                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
6907                    }
6908                }
6909                Ok(Expr::InList {
6910                    target,
6911                    values: resolved,
6912                    negated,
6913                    span,
6914                })
6915            }
6916            Expr::Between {
6917                target,
6918                low,
6919                high,
6920                negated,
6921                span,
6922            } => Ok(Expr::Between {
6923                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
6924                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
6925                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
6926                negated,
6927                span,
6928            }),
6929            other => Ok(other),
6930        }
6931    }
6932
6933    fn execute_expr_subquery_values(
6934        &self,
6935        subquery: crate::storage::query::ast::ExprSubquery,
6936        outer_scopes: &[String],
6937        frame: &dyn super::statement_frame::ReadFrame,
6938    ) -> RedDBResult<Vec<Value>> {
6939        let query = *subquery.query;
6940        if query_references_outer_scope(&query, outer_scopes) {
6941            return Err(RedDBError::Query(
6942                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
6943            ));
6944        }
6945        let query = self.rewrite_view_refs(query);
6946        let query = self.resolve_select_expr_subqueries(query, frame)?;
6947        let query = self.authorize_relational_select_expr(query, frame)?;
6948        let result = match query {
6949            QueryExpr::Table(table) => {
6950                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
6951            }
6952            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
6953            other => {
6954                return Err(RedDBError::Query(format!(
6955                    "expression subquery must be a SELECT query, got {}",
6956                    query_expr_name(&other)
6957                )))
6958            }
6959        };
6960        first_column_values(result)
6961    }
6962
6963    fn dispatch_expr(
6964        &self,
6965        expr: QueryExpr,
6966        query_str: &str,
6967        mode: QueryMode,
6968    ) -> RedDBResult<RuntimeQueryResult> {
6969        let statement = query_expr_name(&expr);
6970        match expr {
6971            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
6972                // Graph queries are not cacheable as prepared statements.
6973                Err(RedDBError::Query(
6974                    "graph queries cannot be used as prepared statements".to_string(),
6975                ))
6976            }
6977            QueryExpr::Table(table) => {
6978                let scope = self.ai_scope();
6979                let table = self.resolve_table_expr_subqueries(
6980                    table,
6981                    &scope as &dyn super::statement_frame::ReadFrame,
6982                )?;
6983                if super::red_schema::is_virtual_table(&table.table) {
6984                    return Ok(RuntimeQueryResult {
6985                        query: query_str.to_string(),
6986                        mode,
6987                        statement,
6988                        engine: "runtime-red-schema",
6989                        result: super::red_schema::red_query(
6990                            self,
6991                            &table.table,
6992                            &table,
6993                            &scope as &dyn super::statement_frame::ReadFrame,
6994                        )?,
6995                        affected_rows: 0,
6996                        statement_type: "select",
6997                    });
6998                }
6999                let Some(table_with_rls) = self.authorize_relational_table_select(
7000                    table,
7001                    &scope as &dyn super::statement_frame::ReadFrame,
7002                )?
7003                else {
7004                    return Ok(RuntimeQueryResult {
7005                        query: query_str.to_string(),
7006                        mode,
7007                        statement,
7008                        engine: "runtime-table-rls",
7009                        result: crate::storage::query::unified::UnifiedResult::empty(),
7010                        affected_rows: 0,
7011                        statement_type: "select",
7012                    });
7013                };
7014                Ok(RuntimeQueryResult {
7015                    query: query_str.to_string(),
7016                    mode,
7017                    statement,
7018                    engine: "runtime-table",
7019                    result: execute_runtime_table_query(
7020                        &self.inner.db,
7021                        &table_with_rls,
7022                        Some(&self.inner.index_store),
7023                    )?,
7024                    affected_rows: 0,
7025                    statement_type: "select",
7026                })
7027            }
7028            QueryExpr::Join(join) => {
7029                let scope = self.ai_scope();
7030                let Some(join_with_rls) = self.authorize_relational_join_select(
7031                    join,
7032                    &scope as &dyn super::statement_frame::ReadFrame,
7033                )?
7034                else {
7035                    return Ok(RuntimeQueryResult {
7036                        query: query_str.to_string(),
7037                        mode,
7038                        statement,
7039                        engine: "runtime-join-rls",
7040                        result: crate::storage::query::unified::UnifiedResult::empty(),
7041                        affected_rows: 0,
7042                        statement_type: "select",
7043                    });
7044                };
7045                Ok(RuntimeQueryResult {
7046                    query: query_str.to_string(),
7047                    mode,
7048                    statement,
7049                    engine: "runtime-join",
7050                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
7051                    affected_rows: 0,
7052                    statement_type: "select",
7053                })
7054            }
7055            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
7056                query: query_str.to_string(),
7057                mode,
7058                statement,
7059                engine: "runtime-vector",
7060                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
7061                affected_rows: 0,
7062                statement_type: "select",
7063            }),
7064            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
7065                query: query_str.to_string(),
7066                mode,
7067                statement,
7068                engine: "runtime-hybrid",
7069                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
7070                affected_rows: 0,
7071                statement_type: "select",
7072            }),
7073            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
7074                Err(RedDBError::Query(
7075                    super::red_schema::READ_ONLY_ERROR.to_string(),
7076                ))
7077            }
7078            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
7079                Err(RedDBError::Query(
7080                    super::red_schema::READ_ONLY_ERROR.to_string(),
7081                ))
7082            }
7083            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
7084                Err(RedDBError::Query(
7085                    super::red_schema::READ_ONLY_ERROR.to_string(),
7086                ))
7087            }
7088            QueryExpr::Insert(ref insert) => self
7089                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
7090                    self.execute_insert(query_str, insert)
7091                }),
7092            QueryExpr::Update(ref update) => self
7093                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
7094                    self.execute_update(query_str, update)
7095                }),
7096            QueryExpr::Delete(ref delete) => self
7097                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
7098                    self.execute_delete(query_str, delete)
7099                }),
7100            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
7101            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
7102            _ => Err(RedDBError::Query(format!(
7103                "prepared-statement execution does not support {statement} statements"
7104            ))),
7105        }
7106    }
7107
7108    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
7109    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
7110    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
7111        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
7112        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
7113        let q = query.trim();
7114        if !q.starts_with("SELECT") && !q.starts_with("select") {
7115            return None;
7116        }
7117
7118        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
7119        let where_pos = q
7120            .find("WHERE _entity_id")
7121            .or_else(|| q.find("where _entity_id"))?;
7122        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
7123        let after_eq = after_field.strip_prefix('=')?.trim_start();
7124
7125        // Parse the entity ID number
7126        let id_str = after_eq.trim();
7127        let entity_id: u64 = id_str.parse().ok()?;
7128
7129        // Extract table name: between "FROM " and " WHERE"
7130        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
7131        let table = q[from_pos..where_pos].trim();
7132        if table.is_empty()
7133            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
7134        {
7135            return None; // complex query, fall through
7136        }
7137        let table_name = table.split_whitespace().next()?;
7138
7139        // Direct entity lookup — skips SQL parse, plan cache, result
7140        // cache, view rewriter, RLS gate. Safe because the gating in
7141        // `execute_query` guarantees no scope override / no
7142        // transaction context is active. MVCC visibility is still
7143        // honoured against the current snapshot.
7144        let store = self.inner.db.store();
7145        let entity = store
7146            .get(
7147                table_name,
7148                crate::storage::unified::EntityId::new(entity_id),
7149            )
7150            .filter(entity_visible_under_current_snapshot);
7151
7152        let count = if entity.is_some() { 1u64 } else { 0 };
7153
7154        // Materialize a record so downstream consumers that walk
7155        // `result.records` (embedded runtime API, decrypt pass, CLI)
7156        // see the row. Previously only `pre_serialized_json` was
7157        // filled, which caused those consumers to see zero rows and
7158        // skewed benchmarks.
7159        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
7160            .as_ref()
7161            .and_then(|e| runtime_table_record_from_entity(e.clone()))
7162            .into_iter()
7163            .collect();
7164
7165        let json = match entity {
7166            Some(ref e) => execute_runtime_serialize_single_entity(e),
7167            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
7168                .to_string(),
7169        };
7170
7171        Some(Ok(RuntimeQueryResult {
7172            query: query.to_string(),
7173            mode: crate::storage::query::modes::QueryMode::Sql,
7174            statement: "select",
7175            engine: "fast-entity-lookup",
7176            result: crate::storage::query::unified::UnifiedResult {
7177                columns: Vec::new(),
7178                records,
7179                stats: crate::storage::query::unified::QueryStats {
7180                    rows_scanned: count,
7181                    ..Default::default()
7182                },
7183                pre_serialized_json: Some(json),
7184            },
7185            affected_rows: 0,
7186            statement_type: "select",
7187        }))
7188    }
7189
7190    fn result_cache_backend(&self) -> RuntimeResultCacheBackend {
7191        match self
7192            .config_string(RESULT_CACHE_BACKEND_KEY, RESULT_CACHE_DEFAULT_BACKEND)
7193            .as_str()
7194        {
7195            "blob_cache" => RuntimeResultCacheBackend::BlobCache,
7196            "shadow" => RuntimeResultCacheBackend::Shadow,
7197            _ => RuntimeResultCacheBackend::Legacy,
7198        }
7199    }
7200
7201    pub(super) fn get_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
7202        match self.result_cache_backend() {
7203            RuntimeResultCacheBackend::Legacy => self.get_legacy_result_cache_entry(key),
7204            RuntimeResultCacheBackend::BlobCache => self.get_blob_result_cache_entry(key),
7205            RuntimeResultCacheBackend::Shadow => {
7206                let legacy = self.get_legacy_result_cache_entry(key);
7207                let blob = self.get_blob_result_cache_entry(key);
7208                if let (Some(ref legacy), Some(ref blob)) = (&legacy, &blob) {
7209                    if result_cache_fingerprint(legacy) != result_cache_fingerprint(blob) {
7210                        self.inner
7211                            .result_cache_shadow_divergences
7212                            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
7213                        tracing::warn!(
7214                            key,
7215                            metric = crate::runtime::METRIC_CACHE_SHADOW_DIVERGENCE_TOTAL,
7216                            "result cache shadow backend diverged from legacy"
7217                        );
7218                    }
7219                }
7220                legacy
7221            }
7222        }
7223    }
7224
7225    fn get_legacy_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
7226        let cache = self.inner.result_cache.read();
7227        cache.0.get(key).and_then(|entry| {
7228            if entry.cached_at.elapsed().as_secs() < RESULT_CACHE_TTL_SECS {
7229                Some(entry.result.clone())
7230            } else {
7231                None
7232            }
7233        })
7234    }
7235
7236    fn get_blob_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
7237        let hit = self
7238            .inner
7239            .result_blob_cache
7240            .get(RESULT_CACHE_BLOB_NAMESPACE, key)?;
7241        {
7242            let cache = self.inner.result_blob_entries.read();
7243            if let Some(entry) = cache.0.get(key) {
7244                return Some(entry.result.clone());
7245            }
7246        }
7247
7248        let (result, scopes) = decode_result_cache_payload(hit.value())?;
7249        let mut cache = self.inner.result_blob_entries.write();
7250        let (ref mut map, ref mut order) = *cache;
7251        if !map.contains_key(key) {
7252            order.push_back(key.to_string());
7253        }
7254        map.insert(
7255            key.to_string(),
7256            RuntimeResultCacheEntry {
7257                result: result.clone(),
7258                cached_at: std::time::Instant::now(),
7259                scopes,
7260            },
7261        );
7262        trim_result_cache(map, order);
7263        Some(result)
7264    }
7265
7266    pub(super) fn put_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
7267        match self.result_cache_backend() {
7268            RuntimeResultCacheBackend::Legacy => self.put_legacy_result_cache_entry(key, entry),
7269            RuntimeResultCacheBackend::BlobCache => self.put_blob_result_cache_entry(key, entry),
7270            RuntimeResultCacheBackend::Shadow => {
7271                self.put_legacy_result_cache_entry(key, entry.clone());
7272                self.put_blob_result_cache_entry(key, entry);
7273            }
7274        }
7275    }
7276
7277    fn put_legacy_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
7278        let mut cache = self.inner.result_cache.write();
7279        let (ref mut map, ref mut order) = *cache;
7280        if !map.contains_key(key) {
7281            order.push_back(key.to_string());
7282        }
7283        map.insert(key.to_string(), entry);
7284        trim_result_cache(map, order);
7285    }
7286
7287    fn put_blob_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
7288        let policy = crate::storage::cache::BlobCachePolicy::default()
7289            .ttl_ms(RESULT_CACHE_TTL_SECS * 1000)
7290            .priority(200);
7291        let dependencies = entry.scopes.iter().cloned().collect::<Vec<_>>();
7292        let bytes = encode_result_cache_payload(&entry)
7293            .unwrap_or_else(|| result_cache_fingerprint(&entry.result).into_bytes());
7294        let put = crate::storage::cache::BlobCachePut::new(bytes)
7295            .with_dependencies(dependencies)
7296            .with_policy(policy);
7297        if self
7298            .inner
7299            .result_blob_cache
7300            .put(RESULT_CACHE_BLOB_NAMESPACE, key, put)
7301            .is_err()
7302        {
7303            return;
7304        }
7305
7306        let mut cache = self.inner.result_blob_entries.write();
7307        let (ref mut map, ref mut order) = *cache;
7308        if !map.contains_key(key) {
7309            order.push_back(key.to_string());
7310        }
7311        map.insert(key.to_string(), entry);
7312        trim_result_cache(map, order);
7313    }
7314
7315    pub fn result_cache_shadow_divergences(&self) -> u64 {
7316        self.inner
7317            .result_cache_shadow_divergences
7318            .load(std::sync::atomic::Ordering::Relaxed)
7319    }
7320
7321    /// Invalidate the result cache (call after any write operation).
7322    /// Full clear — use for DDL (DROP TABLE, schema changes) or when table is unknown.
7323    pub fn invalidate_result_cache(&self) {
7324        let mut cache = self.inner.result_cache.write();
7325        cache.0.clear();
7326        cache.1.clear();
7327        let mut blob_entries = self.inner.result_blob_entries.write();
7328        blob_entries.0.clear();
7329        blob_entries.1.clear();
7330        self.inner
7331            .result_blob_cache
7332            .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
7333        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
7334        ask_entries.0.clear();
7335        ask_entries.1.clear();
7336        self.inner
7337            .result_blob_cache
7338            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
7339    }
7340
7341    /// Invalidate only result cache entries that declared a dependency on `table`.
7342    /// Cheaper than a full clear: unrelated tables keep their cached results.
7343    pub(crate) fn invalidate_result_cache_for_table(&self, table: &str) {
7344        // Hot-path probe both backends before taking write locks. The blob
7345        // backend is node-local, same as the legacy result cache.
7346        let legacy_has_match = {
7347            let cache = self.inner.result_cache.read();
7348            let (ref map, _) = *cache;
7349            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
7350        };
7351        let blob_has_match = {
7352            let cache = self.inner.result_blob_entries.read();
7353            let (ref map, _) = *cache;
7354            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
7355        };
7356        if legacy_has_match {
7357            let mut cache = self.inner.result_cache.write();
7358            let (ref mut map, ref mut order) = *cache;
7359            map.retain(|_, entry| !entry.scopes.contains(table));
7360            order.retain(|key| map.contains_key(key));
7361        }
7362
7363        if matches!(
7364            self.result_cache_backend(),
7365            RuntimeResultCacheBackend::BlobCache | RuntimeResultCacheBackend::Shadow
7366        ) {
7367            let mut blob_entries = self.inner.result_blob_entries.write();
7368            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
7369            blob_map.clear();
7370            blob_order.clear();
7371            self.inner
7372                .result_blob_cache
7373                .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
7374        } else if blob_has_match {
7375            let mut blob_entries = self.inner.result_blob_entries.write();
7376            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
7377            blob_map.retain(|_, entry| !entry.scopes.contains(table));
7378            blob_order.retain(|key| blob_map.contains_key(key));
7379        }
7380        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
7381        ask_entries.0.clear();
7382        ask_entries.1.clear();
7383        self.inner
7384            .result_blob_cache
7385            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
7386    }
7387
7388    pub(crate) fn invalidate_plan_cache(&self) {
7389        self.inner.query_cache.write().clear();
7390        self.inner
7391            .ddl_epoch
7392            .fetch_add(1, std::sync::atomic::Ordering::Release);
7393    }
7394
7395    /// Read the monotonic DDL epoch counter. Bumped by every
7396    /// `invalidate_plan_cache` call so prepared-statement holders can
7397    /// detect schema drift between PREPARE and EXECUTE.
7398    pub fn ddl_epoch(&self) -> u64 {
7399        self.inner
7400            .ddl_epoch
7401            .load(std::sync::atomic::Ordering::Acquire)
7402    }
7403
7404    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
7405        let store = self.inner.db.store();
7406        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7407        self.invalidate_plan_cache();
7408    }
7409
7410    /// Replay `tenant_tables.*.column` keys from red_config at boot so
7411    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
7412    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
7413    /// collection, picks the keys matching the tenant-marker shape,
7414    /// and calls `register_tenant_table` for each.
7415    ///
7416    /// Safe no-op when `red_config` doesn't exist (first boot on a
7417    /// fresh datadir).
7418    pub(crate) fn rehydrate_tenant_tables(&self) {
7419        let store = self.inner.db.store();
7420        let Some(manager) = store.get_collection("red_config") else {
7421            return;
7422        };
7423        // Replay in insertion order (SegmentManager iteration). Multiple
7424        // toggles on the same table leave several rows behind — the
7425        // last one processed wins because each register/unregister
7426        // call overwrites the in-memory state.
7427        for entity in manager.query_all(|_| true) {
7428            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
7429                continue;
7430            };
7431            let Some(named) = &row.named else { continue };
7432            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
7433                continue;
7434            };
7435            // Shape: tenant_tables.{table}.column
7436            let Some(rest) = key.strip_prefix("tenant_tables.") else {
7437                continue;
7438            };
7439            let Some((table, suffix)) = rest.rsplit_once('.') else {
7440                // Issue #205 — a `tenant_tables.*` row that doesn't
7441                // split cleanly is a schema-shape regression: the
7442                // metadata writer must always emit the `.column`
7443                // suffix, so reaching this branch means an upgrade
7444                // with incompatible state or external tampering.
7445                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7446                    collection: "red_config".to_string(),
7447                    detail: format!("malformed tenant_tables key: {key}"),
7448                }
7449                .emit_global();
7450                continue;
7451            };
7452            if suffix != "column" {
7453                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7454                    collection: "red_config".to_string(),
7455                    detail: format!("unexpected tenant_tables suffix: {key}"),
7456                }
7457                .emit_global();
7458                continue;
7459            }
7460            match named.get("value") {
7461                Some(crate::storage::schema::Value::Text(column)) => {
7462                    self.register_tenant_table(table, column);
7463                }
7464                // Null / missing value = DISABLE TENANCY marker.
7465                Some(crate::storage::schema::Value::Null) | None => {
7466                    self.unregister_tenant_table(table);
7467                }
7468                _ => {}
7469            }
7470        }
7471    }
7472
7473    pub(crate) fn rehydrate_declared_column_schemas(&self) {
7474        let store = self.inner.db.store();
7475        for contract in self.inner.db.collection_contracts() {
7476            let columns: Vec<String> = contract
7477                .declared_columns
7478                .iter()
7479                .map(|column| column.name.clone())
7480                .collect();
7481            let Some(manager) = store.get_collection(&contract.name) else {
7482                continue;
7483            };
7484            manager.set_column_schema_if_empty(columns);
7485        }
7486    }
7487
7488    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
7489    /// in-memory column mapping, the implicit RLS policy, and enables
7490    /// row-level security on the table. Idempotent — re-registering
7491    /// the same `(table, column)` replaces the prior auto-policy.
7492    pub fn register_tenant_table(&self, table: &str, column: &str) {
7493        use crate::storage::query::ast::{
7494            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
7495        };
7496        self.inner
7497            .tenant_tables
7498            .write()
7499            .insert(table.to_string(), column.to_string());
7500
7501        // Build the policy: col = CURRENT_TENANT()
7502        // Uses CompareExpr so the comparison happens at runtime against
7503        // the thread-local tenant value read by the CURRENT_TENANT
7504        // scalar. Spans are synthetic — there's no source location for
7505        // an auto-generated policy.
7506        let lhs = Expr::Column {
7507            field: FieldRef::TableColumn {
7508                table: table.to_string(),
7509                column: column.to_string(),
7510            },
7511            span: Span::synthetic(),
7512        };
7513        let rhs = Expr::FunctionCall {
7514            name: "CURRENT_TENANT".to_string(),
7515            args: Vec::new(),
7516            span: Span::synthetic(),
7517        };
7518        let policy_filter = Filter::CompareExpr {
7519            lhs,
7520            op: CompareOp::Eq,
7521            rhs,
7522        };
7523
7524        let policy = CreatePolicyQuery {
7525            name: "__tenant_iso".to_string(),
7526            table: table.to_string(),
7527            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
7528            role: None,   // None = every role
7529            using: Box::new(policy_filter),
7530            // Auto-tenancy defaults to Table targets. Collections of
7531            // other kinds (graph / vector / queue / timeseries) that
7532            // opt in via `ALTER ... ENABLE TENANCY` should use the
7533            // matching kind — but for now we keep the auto-policy
7534            // kind-agnostic so the evaluator can apply it to any
7535            // entity living in the collection.
7536            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
7537        };
7538
7539        // Replace any prior auto-policy for this table (column rename).
7540        self.inner.rls_policies.write().insert(
7541            (table.to_string(), "__tenant_iso".to_string()),
7542            Arc::new(policy),
7543        );
7544        self.inner
7545            .rls_enabled_tables
7546            .write()
7547            .insert(table.to_string());
7548
7549        // Auto-build a hash index on the tenant column. Every read/write
7550        // against a tenant-scoped table carries an implicit
7551        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
7552        // index on that column is on the hot path of every query. Without
7553        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
7554        self.ensure_tenant_index(table, column);
7555    }
7556
7557    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
7558    /// Skipped when:
7559    ///   * the column is dotted (nested path — flat secondary indices
7560    ///     don't cover those today; RLS still works via the policy)
7561    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
7562    ///   * the user already registered an index whose first column matches
7563    ///     (avoids redundant duplicates of a user-defined composite)
7564    fn ensure_tenant_index(&self, table: &str, column: &str) {
7565        if column.contains('.') {
7566            return;
7567        }
7568        let index_name = format!("__tenant_idx_{table}");
7569        let registry = self.inner.index_store.list_indices(table);
7570        if registry.iter().any(|idx| idx.name == index_name) {
7571            return;
7572        }
7573        if registry
7574            .iter()
7575            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
7576        {
7577            return;
7578        }
7579
7580        let store = self.inner.db.store();
7581        let Some(manager) = store.get_collection(table) else {
7582            return;
7583        };
7584        let entities = manager.query_all(|_| true);
7585        let entity_fields: Vec<(
7586            crate::storage::unified::EntityId,
7587            Vec<(String, crate::storage::schema::Value)>,
7588        )> = entities
7589            .iter()
7590            .map(|e| {
7591                let fields = match &e.data {
7592                    crate::storage::EntityData::Row(row) => {
7593                        if let Some(ref named) = row.named {
7594                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
7595                        } else if let Some(ref schema) = row.schema {
7596                            schema
7597                                .iter()
7598                                .zip(row.columns.iter())
7599                                .map(|(k, v)| (k.clone(), v.clone()))
7600                                .collect()
7601                        } else {
7602                            Vec::new()
7603                        }
7604                    }
7605                    crate::storage::EntityData::Node(node) => node
7606                        .properties
7607                        .iter()
7608                        .map(|(k, v)| (k.clone(), v.clone()))
7609                        .collect(),
7610                    _ => Vec::new(),
7611                };
7612                (e.id, fields)
7613            })
7614            .collect();
7615
7616        let columns = vec![column.to_string()];
7617        if self
7618            .inner
7619            .index_store
7620            .create_index(
7621                &index_name,
7622                table,
7623                &columns,
7624                super::index_store::IndexMethodKind::Hash,
7625                false,
7626                &entity_fields,
7627            )
7628            .is_err()
7629        {
7630            return;
7631        }
7632        self.inner
7633            .index_store
7634            .register(super::index_store::RegisteredIndex {
7635                name: index_name,
7636                collection: table.to_string(),
7637                columns,
7638                method: super::index_store::IndexMethodKind::Hash,
7639                unique: false,
7640            });
7641        self.invalidate_plan_cache();
7642    }
7643
7644    /// Drop the auto-generated tenant index, if one exists. Called from
7645    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
7646    fn drop_tenant_index(&self, table: &str) {
7647        let index_name = format!("__tenant_idx_{table}");
7648        self.inner.index_store.drop_index(&index_name, table);
7649    }
7650
7651    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
7652    /// Used by the INSERT auto-fill path to know which column to
7653    /// populate with `current_tenant()` when the user didn't name it.
7654    pub fn tenant_column(&self, table: &str) -> Option<String> {
7655        self.inner.tenant_tables.read().get(table).cloned()
7656    }
7657
7658    /// Remove a table's tenant registration (Phase 2.5.4). Called by
7659    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
7660    /// but leaves any user-installed explicit policies intact.
7661    pub fn unregister_tenant_table(&self, table: &str) {
7662        self.inner.tenant_tables.write().remove(table);
7663        self.inner
7664            .rls_policies
7665            .write()
7666            .remove(&(table.to_string(), "__tenant_iso".to_string()));
7667        self.drop_tenant_index(table);
7668        // Only clear RLS enablement if no other policies remain.
7669        let has_other_policies = self
7670            .inner
7671            .rls_policies
7672            .read()
7673            .keys()
7674            .any(|(t, _)| t == table);
7675        if !has_other_policies {
7676            self.inner.rls_enabled_tables.write().remove(table);
7677        }
7678    }
7679
7680    /// Record that the running transaction has marked `id` in `collection`
7681    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
7682    /// xid that was written into `xmax` — either the parent txn xid or
7683    /// the innermost savepoint sub-xid. Savepoint rollback filters by
7684    /// this xid to revive only its own tombstones.
7685    pub(crate) fn record_pending_tombstone(
7686        &self,
7687        conn_id: u64,
7688        collection: &str,
7689        id: crate::storage::unified::entity::EntityId,
7690        stamper_xid: crate::storage::transaction::snapshot::Xid,
7691        previous_xmax: crate::storage::transaction::snapshot::Xid,
7692    ) {
7693        self.inner
7694            .pending_tombstones
7695            .write()
7696            .entry(conn_id)
7697            .or_default()
7698            .push((collection.to_string(), id, stamper_xid, previous_xmax));
7699    }
7700
7701    pub(crate) fn record_pending_versioned_update(
7702        &self,
7703        conn_id: u64,
7704        collection: &str,
7705        old_id: crate::storage::unified::entity::EntityId,
7706        new_id: crate::storage::unified::entity::EntityId,
7707        stamper_xid: crate::storage::transaction::snapshot::Xid,
7708        previous_xmax: crate::storage::transaction::snapshot::Xid,
7709    ) {
7710        self.inner
7711            .pending_versioned_updates
7712            .write()
7713            .entry(conn_id)
7714            .or_default()
7715            .push((
7716                collection.to_string(),
7717                old_id,
7718                new_id,
7719                stamper_xid,
7720                previous_xmax,
7721            ));
7722    }
7723
7724    fn with_deferred_store_wal_if_transaction<T>(
7725        &self,
7726        f: impl FnOnce() -> RedDBResult<T>,
7727    ) -> RedDBResult<T> {
7728        let conn_id = current_connection_id();
7729        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
7730            return f();
7731        }
7732
7733        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
7734        let result = f();
7735        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
7736        match result {
7737            Ok(value) => {
7738                self.record_pending_store_wal_actions(conn_id, captured);
7739                Ok(value)
7740            }
7741            Err(err) => Err(err),
7742        }
7743    }
7744
7745    fn with_deferred_store_wal_for_dml<T>(
7746        &self,
7747        capture_autocommit_events: bool,
7748        f: impl FnOnce() -> RedDBResult<T>,
7749    ) -> RedDBResult<T> {
7750        let conn_id = current_connection_id();
7751        if self.inner.tx_contexts.read().contains_key(&conn_id) {
7752            return self.with_deferred_store_wal_if_transaction(f);
7753        }
7754        if !capture_autocommit_events {
7755            return f();
7756        }
7757
7758        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
7759        let result = f();
7760        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
7761        self.inner
7762            .db
7763            .store()
7764            .append_deferred_store_wal_actions(captured)
7765            .map_err(|err| RedDBError::Internal(err.to_string()))?;
7766        result
7767    }
7768
7769    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
7770        !query.suppress_events
7771            && self.collection_has_event_subscriptions_for_operation(
7772                &query.table,
7773                crate::catalog::SubscriptionOperation::Insert,
7774            )
7775    }
7776
7777    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
7778        !query.suppress_events
7779            && self.collection_has_event_subscriptions_for_operation(
7780                &query.table,
7781                crate::catalog::SubscriptionOperation::Update,
7782            )
7783    }
7784
7785    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
7786        !query.suppress_events
7787            && self.collection_has_event_subscriptions_for_operation(
7788                &query.table,
7789                crate::catalog::SubscriptionOperation::Delete,
7790            )
7791    }
7792
7793    fn collection_has_event_subscriptions_for_operation(
7794        &self,
7795        collection: &str,
7796        operation: crate::catalog::SubscriptionOperation,
7797    ) -> bool {
7798        let Some(contract) = self.db().collection_contract_arc(collection) else {
7799            return false;
7800        };
7801        contract.subscriptions.iter().any(|subscription| {
7802            subscription.enabled
7803                && (subscription.ops_filter.is_empty()
7804                    || subscription.ops_filter.contains(&operation))
7805        })
7806    }
7807
7808    fn record_pending_store_wal_actions(
7809        &self,
7810        conn_id: u64,
7811        actions: crate::storage::unified::DeferredStoreWalActions,
7812    ) {
7813        if actions.is_empty() {
7814            return;
7815        }
7816        let mut guard = self.inner.pending_store_wal_actions.write();
7817        guard.entry(conn_id).or_default().extend(actions);
7818    }
7819
7820    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
7821        let Some(actions) = self
7822            .inner
7823            .pending_store_wal_actions
7824            .write()
7825            .remove(&conn_id)
7826        else {
7827            return Ok(());
7828        };
7829        self.inner
7830            .db
7831            .store()
7832            .append_deferred_store_wal_actions(actions)
7833            .map_err(|err| RedDBError::Internal(err.to_string()))
7834    }
7835
7836    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
7837        self.inner
7838            .pending_store_wal_actions
7839            .write()
7840            .remove(&conn_id);
7841    }
7842
7843    fn xid_conflicts_with_snapshot(
7844        &self,
7845        xid: crate::storage::transaction::snapshot::Xid,
7846        snapshot: &crate::storage::transaction::snapshot::Snapshot,
7847        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
7848    ) -> bool {
7849        xid != 0
7850            && !own_xids.contains(&xid)
7851            && !self.inner.snapshot_manager.is_aborted(xid)
7852            && !self.inner.snapshot_manager.is_active(xid)
7853            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
7854    }
7855
7856    fn conflict_error(
7857        collection: &str,
7858        logical_id: crate::storage::unified::entity::EntityId,
7859        xid: crate::storage::transaction::snapshot::Xid,
7860    ) -> RedDBError {
7861        RedDBError::Query(format!(
7862            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
7863            logical_id.raw()
7864        ))
7865    }
7866
7867    fn check_logical_row_conflict(
7868        &self,
7869        collection: &str,
7870        logical_id: crate::storage::unified::entity::EntityId,
7871        excluded_ids: &[crate::storage::unified::entity::EntityId],
7872        snapshot: &crate::storage::transaction::snapshot::Snapshot,
7873        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
7874    ) -> RedDBResult<()> {
7875        let store = self.inner.db.store();
7876        let Some(manager) = store.get_collection(collection) else {
7877            return Ok(());
7878        };
7879
7880        for candidate in manager.query_all(|_| true) {
7881            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
7882                continue;
7883            }
7884            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
7885                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
7886            }
7887            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
7888                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
7889            }
7890        }
7891        Ok(())
7892    }
7893
7894    pub(crate) fn check_table_row_write_conflicts(
7895        &self,
7896        conn_id: u64,
7897        snapshot: &crate::storage::transaction::snapshot::Snapshot,
7898        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
7899    ) -> RedDBResult<()> {
7900        let versioned_updates = self
7901            .inner
7902            .pending_versioned_updates
7903            .read()
7904            .get(&conn_id)
7905            .cloned()
7906            .unwrap_or_default();
7907        let tombstones = self
7908            .inner
7909            .pending_tombstones
7910            .read()
7911            .get(&conn_id)
7912            .cloned()
7913            .unwrap_or_default();
7914
7915        let store = self.inner.db.store();
7916        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
7917            let Some(manager) = store.get_collection(&collection) else {
7918                continue;
7919            };
7920            let Some(old) = manager.get(old_id) else {
7921                continue;
7922            };
7923            let logical_id = old.logical_id();
7924            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
7925                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
7926            }
7927            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
7928                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
7929            }
7930            self.check_logical_row_conflict(
7931                &collection,
7932                logical_id,
7933                &[old_id, new_id],
7934                snapshot,
7935                own_xids,
7936            )?;
7937        }
7938
7939        for (collection, id, xid, previous_xmax) in tombstones {
7940            let Some(manager) = store.get_collection(&collection) else {
7941                continue;
7942            };
7943            let Some(entity) = manager.get(id) else {
7944                continue;
7945            };
7946            let logical_id = entity.logical_id();
7947            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
7948                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
7949            }
7950            if entity.xmax != xid
7951                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
7952            {
7953                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
7954            }
7955            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
7956        }
7957
7958        Ok(())
7959    }
7960
7961    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
7962        let versioned_updates = self
7963            .inner
7964            .pending_versioned_updates
7965            .read()
7966            .get(&conn_id)
7967            .cloned()
7968            .unwrap_or_default();
7969        let tombstones = self
7970            .inner
7971            .pending_tombstones
7972            .read()
7973            .get(&conn_id)
7974            .cloned()
7975            .unwrap_or_default();
7976
7977        let store = self.inner.db.store();
7978        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
7979            if let Some(manager) = store.get_collection(&collection) {
7980                if let Some(mut entity) = manager.get(old_id) {
7981                    entity.set_xmax(xid);
7982                    let _ = manager.update(entity);
7983                }
7984            }
7985        }
7986        for (collection, id, xid, _previous_xmax) in tombstones {
7987            if let Some(manager) = store.get_collection(&collection) {
7988                if let Some(mut entity) = manager.get(id) {
7989                    entity.set_xmax(xid);
7990                    let _ = manager.update(entity);
7991                }
7992            }
7993        }
7994    }
7995
7996    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
7997        self.inner
7998            .pending_versioned_updates
7999            .write()
8000            .remove(&conn_id);
8001    }
8002
8003    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
8004        let Some(pending) = self
8005            .inner
8006            .pending_versioned_updates
8007            .write()
8008            .remove(&conn_id)
8009        else {
8010            return;
8011        };
8012
8013        let store = self.inner.db.store();
8014        for (collection, old_id, new_id, xid, previous_xmax) in pending {
8015            if let Some(manager) = store.get_collection(&collection) {
8016                if let Some(mut old) = manager.get(old_id) {
8017                    if old.xmax == xid {
8018                        old.set_xmax(previous_xmax);
8019                        let _ = manager.update(old);
8020                    }
8021                }
8022            }
8023            let _ = store.delete_batch(&collection, &[new_id]);
8024        }
8025    }
8026
8027    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8028        let mut guard = self.inner.pending_versioned_updates.write();
8029        let Some(pending) = guard.get_mut(&conn_id) else {
8030            return 0;
8031        };
8032
8033        let store = self.inner.db.store();
8034        let mut reverted = 0usize;
8035        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
8036            if *xid < stamper_xid {
8037                return true;
8038            }
8039            if let Some(manager) = store.get_collection(collection) {
8040                if let Some(mut old) = manager.get(*old_id) {
8041                    if old.xmax == *xid {
8042                        old.set_xmax(*previous_xmax);
8043                        let _ = manager.update(old);
8044                    }
8045                }
8046            }
8047            let _ = store.delete_batch(collection, &[*new_id]);
8048            reverted += 1;
8049            false
8050        });
8051        if pending.is_empty() {
8052            guard.remove(&conn_id);
8053        }
8054        reverted
8055    }
8056
8057    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
8058    /// delete marker; commit only drops the rollback journal and emits
8059    /// side effects. Physical reclamation is left for VACUUM so old
8060    /// snapshots can still resolve the pre-delete row version.
8061    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
8062        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8063            return;
8064        };
8065        if pending.is_empty() {
8066            return;
8067        }
8068
8069        let store = self.inner.db.store();
8070        for (collection, id, _xid, _previous_xmax) in pending {
8071            store.context_index().remove_entity(id);
8072            self.cdc_emit(
8073                crate::replication::cdc::ChangeOperation::Delete,
8074                &collection,
8075                id.raw(),
8076                "entity",
8077            );
8078        }
8079    }
8080
8081    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
8082    /// become visible again to future snapshots. Best-effort: a row
8083    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
8084    /// never reclaims tuples whose xmax is still referenced by any
8085    /// active snapshot, so this case is only reachable via external
8086    /// storage corruption.
8087    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
8088        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8089            return;
8090        };
8091
8092        let store = self.inner.db.store();
8093        for (collection, id, xid, previous_xmax) in pending {
8094            let Some(manager) = store.get_collection(&collection) else {
8095                continue;
8096            };
8097            if let Some(mut entity) = manager.get(id) {
8098                if entity.xmax == xid {
8099                    entity.set_xmax(previous_xmax);
8100                    let _ = manager.update(entity);
8101                }
8102            }
8103        }
8104    }
8105
8106    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
8107        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
8108            return;
8109        };
8110        for event in pending {
8111            self.cdc_emit_kv(
8112                event.op,
8113                &event.collection,
8114                &event.key,
8115                0,
8116                event.before,
8117                event.after,
8118            );
8119        }
8120    }
8121
8122    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
8123        self.inner.pending_kv_watch_events.write().remove(&conn_id);
8124    }
8125
8126    /// Materialise the entire graph store while applying MVCC visibility
8127    /// AND per-collection RLS to each candidate node and edge. Mirrors
8128    /// `materialize_graph` but routes every entity through the same
8129    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
8130    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
8131    /// edges). Returns the filtered `GraphStore` plus the
8132    /// `node_id → properties` map the executor needs for `RETURN n.*`
8133    /// projections.
8134    fn materialize_graph_with_rls(
8135        &self,
8136    ) -> RedDBResult<(
8137        crate::storage::engine::GraphStore,
8138        std::collections::HashMap<
8139            String,
8140            std::collections::HashMap<String, crate::storage::schema::Value>,
8141        >,
8142        crate::storage::query::unified::EdgeProperties,
8143    )> {
8144        use crate::storage::engine::GraphStore;
8145        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
8146        use crate::storage::unified::entity::{EntityData, EntityKind};
8147        use std::collections::{HashMap, HashSet};
8148
8149        let store = self.inner.db.store();
8150        let snap_ctx = capture_current_snapshot();
8151        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
8152
8153        let graph = GraphStore::new();
8154        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
8155            HashMap::new();
8156        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
8157        let mut allowed_nodes: HashSet<String> = HashSet::new();
8158
8159        // Per-collection cached compiled filters — Nodes-kind for
8160        // first pass, Edges-kind for the second. None entries mean
8161        // "RLS enabled, zero matching policy → deny all of this kind".
8162        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8163            HashMap::new();
8164        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8165            HashMap::new();
8166
8167        let collections = store.list_collections();
8168
8169        // First pass — gather nodes.
8170        for collection in &collections {
8171            let Some(manager) = store.get_collection(collection) else {
8172                continue;
8173            };
8174            let entities = manager.query_all(|_| true);
8175            for entity in entities {
8176                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8177                    continue;
8178                }
8179                let EntityKind::GraphNode(ref node) = entity.kind else {
8180                    continue;
8181                };
8182                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
8183                    continue;
8184                }
8185                let id_str = entity.id.raw().to_string();
8186                graph
8187                    .add_node_with_label(
8188                        &id_str,
8189                        &node.label,
8190                        &super::graph_node_label(&node.node_type),
8191                    )
8192                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8193                allowed_nodes.insert(id_str.clone());
8194                if let EntityData::Node(node_data) = &entity.data {
8195                    node_properties.insert(id_str, node_data.properties.clone());
8196                }
8197            }
8198        }
8199
8200        // Second pass — gather edges. An edge appears only when both
8201        // endpoint nodes survived the RLS pass AND the edge itself
8202        // passes its own RLS gate.
8203        for collection in &collections {
8204            let Some(manager) = store.get_collection(collection) else {
8205                continue;
8206            };
8207            let entities = manager.query_all(|_| true);
8208            for entity in entities {
8209                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8210                    continue;
8211                }
8212                let EntityKind::GraphEdge(ref edge) = entity.kind else {
8213                    continue;
8214                };
8215                if !allowed_nodes.contains(&edge.from_node)
8216                    || !allowed_nodes.contains(&edge.to_node)
8217                {
8218                    continue;
8219                }
8220                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
8221                    continue;
8222                }
8223                let weight = match &entity.data {
8224                    EntityData::Edge(e) => e.weight,
8225                    _ => edge.weight as f32 / 1000.0,
8226                };
8227                let edge_label = super::graph_edge_label(&edge.label);
8228                graph
8229                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
8230                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8231                if let EntityData::Edge(edge_data) = &entity.data {
8232                    edge_properties.insert(
8233                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
8234                        edge_data.properties.clone(),
8235                    );
8236                }
8237            }
8238        }
8239
8240        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
8241        // are used inside the helper closures via the per-kind helpers
8242        // declared at the bottom of this file.
8243        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
8244
8245        Ok((graph, node_properties, edge_properties))
8246    }
8247
8248    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
8249    /// freshly-inserted entity when the current connection holds an
8250    /// open transaction. Used by graph / vector / queue / timeseries
8251    /// write paths that go through the DevX builder API (`db.node(...)
8252    /// .save()` and friends) — those live in the storage crate and
8253    /// can't reach `current_xid()` without crossing layers, so the
8254    /// application layer calls this helper right after `save()` to
8255    /// finalise the MVCC stamp.
8256    ///
8257    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
8258    /// write, so the non-transactional hot path stays untouched.
8259    ///
8260    /// Best-effort: if the collection or entity disappears between
8261    /// the save and the stamp (concurrent DROP), we silently skip.
8262    pub(crate) fn stamp_xmin_if_in_txn(
8263        &self,
8264        collection: &str,
8265        id: crate::storage::unified::entity::EntityId,
8266    ) {
8267        let Some(xid) = self.current_xid() else {
8268            return;
8269        };
8270        let store = self.inner.db.store();
8271        let Some(manager) = store.get_collection(collection) else {
8272            return;
8273        };
8274        if let Some(mut entity) = manager.get(id) {
8275            entity.set_xmin(xid);
8276            let _ = manager.update(entity);
8277        }
8278    }
8279
8280    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
8281    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
8282    /// pending entries with `xid < stamper_xid` stay queued because
8283    /// they belong to the enclosing scope — they'll either flush on
8284    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
8285    ///
8286    /// Returns the number of tuples whose `xmax` was wiped back to 0.
8287    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8288        let mut guard = self.inner.pending_tombstones.write();
8289        let Some(pending) = guard.get_mut(&conn_id) else {
8290            return 0;
8291        };
8292
8293        let store = self.inner.db.store();
8294        let mut revived = 0usize;
8295        pending.retain(|(collection, id, xid, previous_xmax)| {
8296            if *xid < stamper_xid {
8297                // Stamped before the savepoint — keep in queue.
8298                return true;
8299            }
8300            if let Some(manager) = store.get_collection(collection) {
8301                if let Some(mut entity) = manager.get(*id) {
8302                    if entity.xmax == *xid {
8303                        entity.set_xmax(*previous_xmax);
8304                        let _ = manager.update(entity);
8305                        revived += 1;
8306                    }
8307                }
8308            }
8309            false
8310        });
8311        if pending.is_empty() {
8312            guard.remove(&conn_id);
8313        }
8314        revived
8315    }
8316
8317    /// Return the snapshot the current connection should use for visibility
8318    /// checks (Phase 2.3 PG parity).
8319    ///
8320    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
8321    ///   the snapshot stored in its `TxnContext`.
8322    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
8323    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
8324    ///   visible so this degrades to "see everything committed".
8325    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
8326        let conn_id = current_connection_id();
8327        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
8328            return ctx.snapshot;
8329        }
8330        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
8331        // every already-committed xid (which is strictly less) passes the
8332        // `xmin <= snap.xid` gate, while concurrently-active xids land in
8333        // the `in_progress` set and stay hidden until they commit. Using
8334        // xid=0 would incorrectly hide every MVCC-stamped tuple.
8335        let high_water = self.inner.snapshot_manager.peek_next_xid();
8336        self.inner.snapshot_manager.snapshot(high_water)
8337    }
8338
8339    /// Xid of the current connection's active transaction, or `None` when
8340    /// running outside a BEGIN/COMMIT block. Write paths call this to
8341    /// decide whether to stamp `xmin`/`xmax` on tuples.
8342    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
8343    /// sub-xid so new writes can be selectively rolled back. Otherwise
8344    /// the parent txn's xid is returned, matching pre-savepoint
8345    /// behaviour. Callers that need the enclosing *transaction* xid
8346    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
8347    /// directly.
8348    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
8349        let conn_id = current_connection_id();
8350        self.inner
8351            .tx_contexts
8352            .read()
8353            .get(&conn_id)
8354            .map(|ctx| ctx.writer_xid())
8355    }
8356
8357    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
8358    /// the oldest-active xid when reclaiming dead tuples.
8359    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
8360        Arc::clone(&self.inner.snapshot_manager)
8361    }
8362
8363    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
8364        let manager = &self.inner.snapshot_manager;
8365        let next_xid = manager.peek_next_xid();
8366        let mut cutoff = next_xid;
8367        if let Some(oldest_active) = manager.oldest_active_xid() {
8368            cutoff = cutoff.min(oldest_active);
8369        }
8370        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
8371            cutoff = cutoff.min(oldest_pinned);
8372        }
8373        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
8374        if retention_xids > 0 {
8375            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
8376        }
8377        cutoff
8378    }
8379
8380    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
8381        let registered = self.inner.index_store.list_indices(table);
8382        if registered.is_empty() {
8383            return Ok(());
8384        }
8385        let store = self.inner.db.store();
8386        let Some(manager) = store.get_collection(table) else {
8387            return Ok(());
8388        };
8389        let entity_fields = manager
8390            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
8391            .into_iter()
8392            .map(|entity| (entity.id, table_row_index_fields(&entity)))
8393            .collect::<Vec<_>>();
8394
8395        for index in registered {
8396            self.inner.index_store.drop_index(&index.name, table);
8397            self.inner
8398                .index_store
8399                .create_index(
8400                    &index.name,
8401                    table,
8402                    &index.columns,
8403                    index.method,
8404                    index.unique,
8405                    &entity_fields,
8406                )
8407                .map_err(RedDBError::Internal)?;
8408            self.inner.index_store.register(index);
8409        }
8410        self.invalidate_plan_cache();
8411        Ok(())
8412    }
8413
8414    /// Own-tx xids (parent + open/released savepoints) for the current
8415    /// connection. Transports + tests that build a `SnapshotContext`
8416    /// manually (outside the `execute_query` scope) need this set so
8417    /// the writer's own uncommitted tuples stay visible to self.
8418    pub fn current_txn_own_xids(
8419        &self,
8420    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
8421        let mut set = std::collections::HashSet::new();
8422        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
8423            set.insert(ctx.xid);
8424            for (_, sub) in &ctx.savepoints {
8425                set.insert(*sub);
8426            }
8427            for sub in &ctx.released_sub_xids {
8428                set.insert(*sub);
8429            }
8430        }
8431        set
8432    }
8433
8434    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
8435    ///
8436    /// Callers use this to check whether a table name is a registered
8437    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
8438    /// scan it (`registry.scan(name)`). The read-path rewriter consults
8439    /// this before dispatching into native-collection lookup.
8440    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
8441        Arc::clone(&self.inner.foreign_tables)
8442    }
8443
8444    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
8445    pub fn is_rls_enabled(&self, table: &str) -> bool {
8446        self.inner.rls_enabled_tables.read().contains(table)
8447    }
8448
8449    /// Collect the USING predicates that apply to this `(table, role, action)`.
8450    ///
8451    /// Returned filters should be OR-combined (a row passes RLS when *any*
8452    /// matching policy accepts it) and then AND-ed into the query's WHERE.
8453    /// When the table has RLS disabled this returns an empty Vec — callers
8454    /// can fast-path back to the unfiltered read.
8455    pub fn matching_rls_policies(
8456        &self,
8457        table: &str,
8458        role: Option<&str>,
8459        action: crate::storage::query::ast::PolicyAction,
8460    ) -> Vec<crate::storage::query::ast::Filter> {
8461        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
8462        // callers that don't name a kind only see Table-scoped
8463        // policies (which is what execute SELECT / UPDATE / DELETE
8464        // expect).
8465        self.matching_rls_policies_for_kind(
8466            table,
8467            role,
8468            action,
8469            crate::storage::query::ast::PolicyTargetKind::Table,
8470        )
8471    }
8472
8473    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
8474    ///
8475    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
8476    /// `Vectors`, queue consumers request `Messages`, and timeseries
8477    /// range scans request `Points`. Policies tagged with a
8478    /// different kind are skipped so a graph-scoped policy doesn't
8479    /// accidentally gate a table SELECT on the same collection.
8480    pub fn matching_rls_policies_for_kind(
8481        &self,
8482        table: &str,
8483        role: Option<&str>,
8484        action: crate::storage::query::ast::PolicyAction,
8485        kind: crate::storage::query::ast::PolicyTargetKind,
8486    ) -> Vec<crate::storage::query::ast::Filter> {
8487        if !self.is_rls_enabled(table) {
8488            return Vec::new();
8489        }
8490        let policies = self.inner.rls_policies.read();
8491        policies
8492            .iter()
8493            .filter_map(|((t, _), p)| {
8494                if t != table {
8495                    return None;
8496                }
8497                // Kind gate — Table policies also apply to every
8498                // other kind *iff* the policy predicate evaluates
8499                // against entity fields that exist uniformly; the
8500                // caller's kind filter is the stricter check, so
8501                // match literally. Auto-tenancy policies stamp
8502                // Table and the caller passes the concrete kind —
8503                // we allow Table policies to apply cross-kind for
8504                // backwards compat.
8505                if p.target_kind != kind
8506                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
8507                {
8508                    return None;
8509                }
8510                // Action gate — `None` means "ALL" actions.
8511                if let Some(a) = p.action {
8512                    if a != action {
8513                        return None;
8514                    }
8515                }
8516                // Role gate — `None` means "any role".
8517                if let Some(p_role) = p.role.as_deref() {
8518                    match role {
8519                        Some(r) if r == p_role => {}
8520                        _ => return None,
8521                    }
8522                }
8523                Some((*p.using).clone())
8524            })
8525            .collect()
8526    }
8527
8528    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
8529        let store = self.inner.db.store();
8530        if let Some(stats) =
8531            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
8532        {
8533            crate::storage::query::planner::stats_catalog::persist_table_stats(
8534                store.as_ref(),
8535                &stats,
8536            );
8537        } else {
8538            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
8539        }
8540        self.invalidate_plan_cache();
8541    }
8542
8543    pub(crate) fn note_table_write(&self, table: &str) {
8544        // Skip the write lock when the table is already marked
8545        // dirty. With single-row UPDATEs in a loop this used to
8546        // grab the planner_dirty_tables write lock N times even
8547        // though the first call already flipped the flag.
8548        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
8549        if !already_dirty {
8550            self.inner
8551                .planner_dirty_tables
8552                .write()
8553                .insert(table.to_string());
8554        }
8555        self.invalidate_result_cache_for_table(table);
8556    }
8557
8558    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
8559    /// `RuntimeQueryResult` so callers over the SQL interface see the
8560    /// plan tree in the same shape a SELECT produces.
8561    ///
8562    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
8563    /// Nodes are walked depth-first; `depth` counts from 0 at the
8564    /// root so a text renderer can indent without re-walking.
8565    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
8566        let explain = self.explain_query(inner_sql)?;
8567
8568        let columns = vec![
8569            "op".to_string(),
8570            "source".to_string(),
8571            "est_rows".to_string(),
8572            "est_cost".to_string(),
8573            "depth".to_string(),
8574        ];
8575
8576        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
8577
8578        // Prepend `CteScan` markers when the query carried a leading
8579        // WITH clause. The CTE bodies are already inlined into the
8580        // main plan tree, but operators reading EXPLAIN need to see
8581        // which named CTEs were resolved — without this row the plan
8582        // would look indistinguishable from a hand-inlined query.
8583        for name in &explain.cte_materializations {
8584            use std::sync::Arc;
8585            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
8586            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
8587            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
8588            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
8589            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
8590            rec.set_arc(Arc::from("depth"), Value::Integer(0));
8591            records.push(rec);
8592        }
8593
8594        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
8595
8596        let result = crate::storage::query::unified::UnifiedResult {
8597            columns,
8598            records,
8599            stats: Default::default(),
8600            pre_serialized_json: None,
8601        };
8602
8603        Ok(RuntimeQueryResult {
8604            query: raw_query.to_string(),
8605            mode: explain.mode,
8606            statement: "explain",
8607            engine: "runtime-explain",
8608            result,
8609            affected_rows: 0,
8610            statement_type: "select",
8611        })
8612    }
8613
8614    // -----------------------------------------------------------------
8615    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
8616    // -----------------------------------------------------------------
8617
8618    /// Project a `QueryExpr` to the (action, resource) pair the
8619    /// privilege engine cares about. Returns `Ok(())` for statements
8620    /// that don't touch user data (transaction control, SHOW, SET, etc.).
8621    pub(super) fn check_query_privilege(
8622        &self,
8623        expr: &crate::storage::query::ast::QueryExpr,
8624    ) -> Result<(), String> {
8625        use crate::auth::privileges::{Action, AuthzContext, Resource};
8626        use crate::auth::UserId;
8627        use crate::storage::query::ast::QueryExpr;
8628
8629        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
8630        // The bootstrap path itself goes through `execute_query` so this
8631        // is the only sensible default; once auth is wired, the gate
8632        // becomes active.
8633        let auth_store = match self.inner.auth_store.read().clone() {
8634            Some(s) => s,
8635            None => return Ok(()),
8636        };
8637
8638        // Resolve principal + role from the thread-local identity.
8639        // Anonymous (no identity) is allowed to read the bootstrap path
8640        // only when auth_store says so; we treat missing identity as
8641        // platform-admin-equivalent here so embedded test harnesses
8642        // continue to work without setting an identity.
8643        let (username, role) = match current_auth_identity() {
8644            Some(p) => p,
8645            None => return Ok(()),
8646        };
8647        let tenant = current_tenant();
8648
8649        let ctx = AuthzContext {
8650            principal: &username,
8651            effective_role: role,
8652            tenant: tenant.as_deref(),
8653        };
8654        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
8655
8656        // Map QueryExpr → (Action, Resource).
8657        let (action, resource) = match expr {
8658            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
8659            QueryExpr::QueueSelect(q) => (Action::Select, Resource::table_from_name(&q.queue)),
8660            QueryExpr::Graph(g) => {
8661                if auth_store.iam_authorization_enabled() {
8662                    self.check_graph_property_projection_privilege(
8663                        &auth_store,
8664                        &principal_id,
8665                        role,
8666                        tenant.as_deref(),
8667                        g,
8668                    )?;
8669                    return Ok(());
8670                }
8671                return Ok(());
8672            }
8673            QueryExpr::Vector(v) => {
8674                if auth_store.iam_authorization_enabled() {
8675                    self.check_table_like_column_projection_privilege(
8676                        &auth_store,
8677                        &principal_id,
8678                        role,
8679                        tenant.as_deref(),
8680                        &v.collection,
8681                        &["content".to_string()],
8682                    )?;
8683                    return Ok(());
8684                }
8685                return Ok(());
8686            }
8687            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
8688            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
8689            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
8690            // Joins inherit the read privilege from any constituent
8691            // table — for now we emit a single Select on the database
8692            // (admins bypass; non-admins need a Database/Schema grant).
8693            QueryExpr::Join(_) => (Action::Select, Resource::Database),
8694            // GRANT / REVOKE / ALTER USER are authority statements;
8695            // require Admin (the helper methods enforce).
8696            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
8697                return if role == crate::auth::Role::Admin {
8698                    Ok(())
8699                } else {
8700                    Err(format!(
8701                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
8702                        username, role
8703                    ))
8704                };
8705            }
8706            QueryExpr::CreateIamPolicy { id, .. } => {
8707                return self.check_policy_management_privilege(
8708                    &auth_store,
8709                    &principal_id,
8710                    role,
8711                    tenant.as_deref(),
8712                    "policy:put",
8713                    "policy",
8714                    id,
8715                );
8716            }
8717            QueryExpr::DropIamPolicy { id } => {
8718                return self.check_policy_management_privilege(
8719                    &auth_store,
8720                    &principal_id,
8721                    role,
8722                    tenant.as_deref(),
8723                    "policy:drop",
8724                    "policy",
8725                    id,
8726                );
8727            }
8728            QueryExpr::AttachPolicy { policy_id, .. } => {
8729                return self.check_policy_management_privilege(
8730                    &auth_store,
8731                    &principal_id,
8732                    role,
8733                    tenant.as_deref(),
8734                    "policy:attach",
8735                    "policy",
8736                    policy_id,
8737                );
8738            }
8739            QueryExpr::DetachPolicy { policy_id, .. } => {
8740                return self.check_policy_management_privilege(
8741                    &auth_store,
8742                    &principal_id,
8743                    role,
8744                    tenant.as_deref(),
8745                    "policy:detach",
8746                    "policy",
8747                    policy_id,
8748                );
8749            }
8750            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
8751                return Ok(());
8752            }
8753            QueryExpr::SimulatePolicy { .. } => {
8754                return self.check_policy_management_privilege(
8755                    &auth_store,
8756                    &principal_id,
8757                    role,
8758                    tenant.as_deref(),
8759                    "policy:simulate",
8760                    "policy",
8761                    "*",
8762                );
8763            }
8764            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
8765            // when IAM mode is active. Other DDL stays role-only for now.
8766            QueryExpr::DropTable(q) => {
8767                return self.check_ddl_collection_privilege(
8768                    &auth_store,
8769                    &principal_id,
8770                    role,
8771                    tenant.as_deref(),
8772                    &username,
8773                    "drop",
8774                    &q.name,
8775                );
8776            }
8777            QueryExpr::DropGraph(q) => {
8778                return self.check_ddl_collection_privilege(
8779                    &auth_store,
8780                    &principal_id,
8781                    role,
8782                    tenant.as_deref(),
8783                    &username,
8784                    "drop",
8785                    &q.name,
8786                );
8787            }
8788            QueryExpr::DropVector(q) => {
8789                return self.check_ddl_collection_privilege(
8790                    &auth_store,
8791                    &principal_id,
8792                    role,
8793                    tenant.as_deref(),
8794                    &username,
8795                    "drop",
8796                    &q.name,
8797                );
8798            }
8799            QueryExpr::DropDocument(q) => {
8800                return self.check_ddl_collection_privilege(
8801                    &auth_store,
8802                    &principal_id,
8803                    role,
8804                    tenant.as_deref(),
8805                    &username,
8806                    "drop",
8807                    &q.name,
8808                );
8809            }
8810            QueryExpr::DropKv(q) => {
8811                return self.check_ddl_collection_privilege(
8812                    &auth_store,
8813                    &principal_id,
8814                    role,
8815                    tenant.as_deref(),
8816                    &username,
8817                    "drop",
8818                    &q.name,
8819                );
8820            }
8821            QueryExpr::DropCollection(q) => {
8822                return self.check_ddl_collection_privilege(
8823                    &auth_store,
8824                    &principal_id,
8825                    role,
8826                    tenant.as_deref(),
8827                    &username,
8828                    "drop",
8829                    &q.name,
8830                );
8831            }
8832            QueryExpr::Truncate(q) => {
8833                return self.check_ddl_collection_privilege(
8834                    &auth_store,
8835                    &principal_id,
8836                    role,
8837                    tenant.as_deref(),
8838                    &username,
8839                    "truncate",
8840                    &q.name,
8841                );
8842            }
8843            // Remaining DDL — gate on Write role. Fine-grained grants TBD.
8844            QueryExpr::CreateTable(_)
8845            | QueryExpr::CreateCollection(_)
8846            | QueryExpr::CreateVector(_)
8847            | QueryExpr::AlterTable(_)
8848            | QueryExpr::CreateIndex(_)
8849            | QueryExpr::DropIndex(_)
8850            | QueryExpr::CreateSchema(_)
8851            | QueryExpr::DropSchema(_)
8852            | QueryExpr::CreateSequence(_)
8853            | QueryExpr::DropSequence(_)
8854            | QueryExpr::CreateView(_)
8855            | QueryExpr::DropView(_)
8856            | QueryExpr::RefreshMaterializedView(_)
8857            | QueryExpr::CreatePolicy(_)
8858            | QueryExpr::DropPolicy(_)
8859            | QueryExpr::CreateServer(_)
8860            | QueryExpr::DropServer(_)
8861            | QueryExpr::CreateForeignTable(_)
8862            | QueryExpr::DropForeignTable(_)
8863            | QueryExpr::CreateTimeSeries(_)
8864            | QueryExpr::DropTimeSeries(_)
8865            | QueryExpr::CreateQueue(_)
8866            | QueryExpr::AlterQueue(_)
8867            | QueryExpr::DropQueue(_)
8868            | QueryExpr::CreateTree(_)
8869            | QueryExpr::DropTree(_) => {
8870                return if role >= crate::auth::Role::Write {
8871                    Ok(())
8872                } else {
8873                    Err(format!(
8874                        "principal=`{}` role=`{:?}` cannot issue DDL",
8875                        username, role
8876                    ))
8877                };
8878            }
8879            // Migration DDL — CREATE MIGRATION requires Write role (schema author).
8880            QueryExpr::CreateMigration(_) => {
8881                return if role >= crate::auth::Role::Write {
8882                    Ok(())
8883                } else {
8884                    Err(format!(
8885                        "principal=`{}` role=`{:?}` cannot issue CREATE MIGRATION",
8886                        username, role
8887                    ))
8888                };
8889            }
8890            // APPLY / ROLLBACK change data and schema — require Admin.
8891            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
8892                return if role == crate::auth::Role::Admin {
8893                    Ok(())
8894                } else {
8895                    Err(format!(
8896                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
8897                        username, role
8898                    ))
8899                };
8900            }
8901            // EXPLAIN MIGRATION is read-only — any authenticated principal.
8902            QueryExpr::ExplainMigration(_) => return Ok(()),
8903            // Everything else (SET, SHOW, transaction control, graph
8904            // commands, queue/tree commands, MaintenanceCommand …)
8905            // is allowed for any authenticated principal.
8906            _ => return Ok(()),
8907        };
8908
8909        if auth_store.iam_authorization_enabled() {
8910            let iam_action = legacy_action_to_iam(action);
8911            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
8912            let iam_ctx = runtime_iam_context(role, tenant.as_deref());
8913            if !auth_store.check_policy_authz(&principal_id, iam_action, &iam_resource, &iam_ctx) {
8914                return Err(format!(
8915                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
8916                    username, iam_action, iam_resource.kind, iam_resource.name
8917                ));
8918            }
8919
8920            if let QueryExpr::Table(table) = expr {
8921                self.check_table_column_projection_privilege(
8922                    &auth_store,
8923                    &principal_id,
8924                    &iam_ctx,
8925                    table,
8926                )?;
8927            }
8928
8929            if let QueryExpr::Update(update) = expr {
8930                let columns = update_set_target_columns(update);
8931                if !columns.is_empty() {
8932                    let request = column_access_request_for_table_update(&update.table, columns);
8933                    let outcome =
8934                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
8935                    if let Some(denied) = outcome.first_denied_column() {
8936                        return Err(format!(
8937                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
8938                            username, iam_action, denied.resource.kind, denied.resource.name
8939                        ));
8940                    }
8941                    if !outcome.allowed() {
8942                        return Err(format!(
8943                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
8944                            username,
8945                            iam_action,
8946                            outcome.table_resource.kind,
8947                            outcome.table_resource.name
8948                        ));
8949                    }
8950                }
8951
8952                if let Some(columns) = update_returning_columns_for_policy(self, update) {
8953                    let request = column_access_request_for_table_select(&update.table, columns);
8954                    let outcome =
8955                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
8956                    if let Some(denied) = outcome.first_denied_column() {
8957                        return Err(format!(
8958                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
8959                            username, denied.resource.kind, denied.resource.name
8960                        ));
8961                    }
8962                    if !outcome.allowed() {
8963                        return Err(format!(
8964                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
8965                            username, outcome.table_resource.kind, outcome.table_resource.name
8966                        ));
8967                    }
8968                }
8969            }
8970
8971            Ok(())
8972        } else {
8973            auth_store
8974                .check_grant(&ctx, action, &resource)
8975                .map_err(|e| e.to_string())
8976        }
8977    }
8978
8979    fn check_table_column_projection_privilege(
8980        &self,
8981        auth_store: &Arc<crate::auth::store::AuthStore>,
8982        principal: &crate::auth::UserId,
8983        ctx: &crate::auth::policies::EvalContext,
8984        table: &crate::storage::query::ast::TableQuery,
8985    ) -> Result<(), String> {
8986        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
8987
8988        let columns = requested_table_columns_for_policy(table);
8989        if columns.is_empty() {
8990            return Ok(());
8991        }
8992
8993        let request = ColumnAccessRequest::select(table.table.clone(), columns);
8994        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
8995        if outcome.allowed() {
8996            return Ok(());
8997        }
8998
8999        if !matches!(
9000            outcome.table_decision,
9001            crate::auth::policies::Decision::Allow { .. }
9002                | crate::auth::policies::Decision::AdminBypass
9003        ) {
9004            return Err(format!(
9005                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9006                principal, outcome.table_resource.kind, outcome.table_resource.name
9007            ));
9008        }
9009
9010        let denied = outcome
9011            .first_denied_column()
9012            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
9013        match denied {
9014            Some(decision) => Err(format!(
9015                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9016                principal, decision.resource.kind, decision.resource.name
9017            )),
9018            None => Ok(()),
9019        }
9020    }
9021
9022    fn check_graph_property_projection_privilege(
9023        &self,
9024        auth_store: &Arc<crate::auth::store::AuthStore>,
9025        principal: &crate::auth::UserId,
9026        role: crate::auth::Role,
9027        tenant: Option<&str>,
9028        query: &crate::storage::query::ast::GraphQuery,
9029    ) -> Result<(), String> {
9030        let columns = explicit_graph_projection_properties(query);
9031        if columns.is_empty() {
9032            return Ok(());
9033        }
9034        self.check_table_like_column_projection_privilege(
9035            auth_store, principal, role, tenant, "graph", &columns,
9036        )
9037    }
9038
9039    fn check_table_like_column_projection_privilege(
9040        &self,
9041        auth_store: &Arc<crate::auth::store::AuthStore>,
9042        principal: &crate::auth::UserId,
9043        role: crate::auth::Role,
9044        tenant: Option<&str>,
9045        table: &str,
9046        columns: &[String],
9047    ) -> Result<(), String> {
9048        let iam_ctx = runtime_iam_context(role, tenant);
9049        let request =
9050            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
9051        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
9052        if outcome.allowed() {
9053            return Ok(());
9054        }
9055        let denied = outcome
9056            .first_denied_column()
9057            .map(|d| d.resource.name.clone())
9058            .unwrap_or_else(|| format!("{table}.<unknown>"));
9059        Err(format!(
9060            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
9061            principal, denied
9062        ))
9063    }
9064
9065    fn check_policy_management_privilege(
9066        &self,
9067        auth_store: &Arc<crate::auth::store::AuthStore>,
9068        principal: &crate::auth::UserId,
9069        role: crate::auth::Role,
9070        tenant: Option<&str>,
9071        action: &str,
9072        resource_kind: &str,
9073        resource_name: &str,
9074    ) -> Result<(), String> {
9075        if !auth_store.iam_authorization_enabled() {
9076            return if role == crate::auth::Role::Admin {
9077                Ok(())
9078            } else {
9079                Err(format!(
9080                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9081                    principal, role
9082                ))
9083            };
9084        }
9085
9086        let mut resource = crate::auth::policies::ResourceRef::new(
9087            resource_kind.to_string(),
9088            resource_name.to_string(),
9089        );
9090        if let Some(t) = tenant {
9091            resource = resource.with_tenant(t.to_string());
9092        }
9093        let ctx = runtime_iam_context(role, tenant);
9094        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
9095            Ok(())
9096        } else {
9097            Err(format!(
9098                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
9099                principal, action, resource.kind, resource.name
9100            ))
9101        }
9102    }
9103
9104    /// IAM privilege check for DROP / TRUNCATE on a named collection.
9105    ///
9106    /// In legacy mode (IAM not enabled): requires Write role.
9107    /// In IAM mode: requires an explicit `drop` / `truncate` policy on
9108    /// `collection:<name>` (Admin role auto-passes via AdminBypass).
9109    /// Records an audit log entry for both allow and deny outcomes.
9110    fn check_ddl_collection_privilege(
9111        &self,
9112        auth_store: &Arc<crate::auth::store::AuthStore>,
9113        principal: &crate::auth::UserId,
9114        role: crate::auth::Role,
9115        tenant: Option<&str>,
9116        username: &str,
9117        action: &str,
9118        collection: &str,
9119    ) -> Result<(), String> {
9120        if role < crate::auth::Role::Write {
9121            let msg = format!(
9122                "principal=`{}` role=`{:?}` cannot issue DDL",
9123                username, role
9124            );
9125            self.inner.audit_log.record(
9126                action,
9127                username,
9128                collection,
9129                "denied",
9130                crate::json::Value::Null,
9131            );
9132            return Err(msg);
9133        }
9134
9135        if !auth_store.iam_authorization_enabled() {
9136            self.inner.audit_log.record(
9137                action,
9138                username,
9139                collection,
9140                "ok",
9141                crate::json::Value::Null,
9142            );
9143            return Ok(());
9144        }
9145
9146        let resource_name = collection.to_string();
9147        let mut resource = crate::auth::policies::ResourceRef::new(
9148            "collection".to_string(),
9149            resource_name.clone(),
9150        );
9151        if let Some(t) = tenant {
9152            resource = resource.with_tenant(t.to_string());
9153        }
9154        let ctx = runtime_iam_context(role, tenant);
9155        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
9156            self.inner.audit_log.record(
9157                action,
9158                username,
9159                &resource_name,
9160                "ok",
9161                crate::json::Value::Null,
9162            );
9163            Ok(())
9164        } else {
9165            self.inner.audit_log.record(
9166                action,
9167                username,
9168                &resource_name,
9169                "denied",
9170                crate::json::Value::Null,
9171            );
9172            Err(format!(
9173                "principal=`{}` action=`{}` resource=`collection:{}` denied by IAM policy",
9174                username, action, resource_name
9175            ))
9176        }
9177    }
9178
9179    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
9180    fn execute_grant_statement(
9181        &self,
9182        query: &str,
9183        stmt: &crate::storage::query::ast::GrantStmt,
9184    ) -> RedDBResult<RuntimeQueryResult> {
9185        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
9186        use crate::auth::UserId;
9187        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
9188
9189        let auth_store = self
9190            .inner
9191            .auth_store
9192            .read()
9193            .clone()
9194            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9195
9196        // Granter identity + role.
9197        let (gname, grole) = current_auth_identity().ok_or_else(|| {
9198            RedDBError::Query("GRANT requires an authenticated principal".to_string())
9199        })?;
9200        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
9201        let granter_role = grole;
9202
9203        // Build the action set.
9204        let mut actions: Vec<Action> = Vec::new();
9205        if stmt.all {
9206            actions.push(Action::All);
9207        } else {
9208            for kw in &stmt.actions {
9209                let a = Action::from_keyword(kw).ok_or_else(|| {
9210                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
9211                })?;
9212                actions.push(a);
9213            }
9214        }
9215
9216        // Audit emit (printed; structured emission is Agent #4's lane).
9217        let mut applied = 0usize;
9218        for obj in &stmt.objects {
9219            let resource = match stmt.object_kind {
9220                GrantObjectKind::Table => Resource::Table {
9221                    schema: obj.schema.clone(),
9222                    table: obj.name.clone(),
9223                },
9224                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
9225                GrantObjectKind::Database => Resource::Database,
9226                GrantObjectKind::Function => Resource::Function {
9227                    schema: obj.schema.clone(),
9228                    name: obj.name.clone(),
9229                },
9230            };
9231            for principal in &stmt.principals {
9232                let p = match principal {
9233                    GrantPrincipalRef::Public => GrantPrincipal::Public,
9234                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
9235                    GrantPrincipalRef::User { tenant, name } => {
9236                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
9237                    }
9238                };
9239                // Tenant of the grant follows the granter's tenant
9240                // (cross-tenant guard inside `AuthStore::grant`).
9241                let tenant = granter.tenant.clone();
9242                auth_store
9243                    .grant(
9244                        &granter,
9245                        granter_role,
9246                        p.clone(),
9247                        resource.clone(),
9248                        actions.clone(),
9249                        stmt.with_grant_option,
9250                        tenant.clone(),
9251                    )
9252                    .map_err(|e| RedDBError::Query(e.to_string()))?;
9253
9254                // IAM policy translation: every GRANT also lands as a
9255                // synthetic `_grant_<id>` policy attached to the
9256                // principal so the new evaluator sees it.
9257                if let Some(policy) =
9258                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
9259                {
9260                    let pid = policy.id.clone();
9261                    auth_store
9262                        .put_policy_internal(policy)
9263                        .map_err(|e| RedDBError::Query(e.to_string()))?;
9264                    let attachment = match &p {
9265                        GrantPrincipal::User(uid) => {
9266                            crate::auth::store::PrincipalRef::User(uid.clone())
9267                        }
9268                        GrantPrincipal::Group(group) => {
9269                            crate::auth::store::PrincipalRef::Group(group.clone())
9270                        }
9271                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
9272                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
9273                        ),
9274                    };
9275                    auth_store
9276                        .attach_policy(attachment, &pid)
9277                        .map_err(|e| RedDBError::Query(e.to_string()))?;
9278                }
9279                applied += 1;
9280                tracing::info!(
9281                    target: "audit",
9282                    principal = %granter,
9283                    action = "grant",
9284                    "GRANT applied"
9285                );
9286            }
9287        }
9288
9289        self.invalidate_result_cache();
9290        Ok(RuntimeQueryResult::ok_message(
9291            query.to_string(),
9292            &format!("GRANT applied to {} target(s)", applied),
9293            "grant",
9294        ))
9295    }
9296
9297    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
9298    fn execute_revoke_statement(
9299        &self,
9300        query: &str,
9301        stmt: &crate::storage::query::ast::RevokeStmt,
9302    ) -> RedDBResult<RuntimeQueryResult> {
9303        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
9304        use crate::auth::UserId;
9305        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
9306
9307        let auth_store = self
9308            .inner
9309            .auth_store
9310            .read()
9311            .clone()
9312            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9313
9314        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
9315            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
9316        })?;
9317        let granter_role = grole;
9318
9319        let actions: Vec<Action> = if stmt.all {
9320            vec![Action::All]
9321        } else {
9322            stmt.actions
9323                .iter()
9324                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
9325                .collect()
9326        };
9327
9328        let mut total_removed = 0usize;
9329        for obj in &stmt.objects {
9330            let resource = match stmt.object_kind {
9331                GrantObjectKind::Table => Resource::Table {
9332                    schema: obj.schema.clone(),
9333                    table: obj.name.clone(),
9334                },
9335                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
9336                GrantObjectKind::Database => Resource::Database,
9337                GrantObjectKind::Function => Resource::Function {
9338                    schema: obj.schema.clone(),
9339                    name: obj.name.clone(),
9340                },
9341            };
9342            for principal in &stmt.principals {
9343                let p = match principal {
9344                    GrantPrincipalRef::Public => GrantPrincipal::Public,
9345                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
9346                    GrantPrincipalRef::User { tenant, name } => {
9347                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
9348                    }
9349                };
9350                let removed = auth_store
9351                    .revoke(granter_role, &p, &resource, &actions)
9352                    .map_err(|e| RedDBError::Query(e.to_string()))?;
9353                let _removed_policies =
9354                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
9355                total_removed += removed;
9356            }
9357        }
9358
9359        self.invalidate_result_cache();
9360        Ok(RuntimeQueryResult::ok_message(
9361            query.to_string(),
9362            &format!("REVOKE removed {} grant(s)", total_removed),
9363            "revoke",
9364        ))
9365    }
9366
9367    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
9368    fn execute_alter_user_statement(
9369        &self,
9370        query: &str,
9371        stmt: &crate::storage::query::ast::AlterUserStmt,
9372    ) -> RedDBResult<RuntimeQueryResult> {
9373        use crate::auth::privileges::UserAttributes;
9374        use crate::auth::UserId;
9375        use crate::storage::query::ast::AlterUserAttribute;
9376
9377        let auth_store = self
9378            .inner
9379            .auth_store
9380            .read()
9381            .clone()
9382            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9383
9384        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
9385            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
9386        })?;
9387        if grole != crate::auth::Role::Admin {
9388            return Err(RedDBError::Query(
9389                "ALTER USER requires Admin role".to_string(),
9390            ));
9391        }
9392
9393        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
9394
9395        // Apply attributes incrementally — each one reads the current
9396        // record, mutates the relevant field, writes back.
9397        let mut attrs = auth_store.user_attributes(&target);
9398        let mut enable_change: Option<bool> = None;
9399
9400        for a in &stmt.attributes {
9401            match a {
9402                AlterUserAttribute::ValidUntil(ts) => {
9403                    // Parse ISO-ish timestamp → ms since epoch. Fall
9404                    // back to integer-ms parsing for callers that pass
9405                    // `'1234567890123'`.
9406                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
9407                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
9408                    })?;
9409                    attrs.valid_until = Some(ms);
9410                }
9411                AlterUserAttribute::ConnectionLimit(n) => {
9412                    if *n < 0 {
9413                        return Err(RedDBError::Query(
9414                            "CONNECTION LIMIT must be non-negative".to_string(),
9415                        ));
9416                    }
9417                    attrs.connection_limit = Some(*n as u32);
9418                }
9419                AlterUserAttribute::SetSearchPath(p) => {
9420                    attrs.search_path = Some(p.clone());
9421                }
9422                AlterUserAttribute::AddGroup(g) => {
9423                    if !attrs.groups.iter().any(|existing| existing == g) {
9424                        attrs.groups.push(g.clone());
9425                        attrs.groups.sort();
9426                    }
9427                }
9428                AlterUserAttribute::DropGroup(g) => {
9429                    attrs.groups.retain(|existing| existing != g);
9430                }
9431                AlterUserAttribute::Enable => enable_change = Some(true),
9432                AlterUserAttribute::Disable => enable_change = Some(false),
9433                AlterUserAttribute::Password(_) => {
9434                    // Out of scope — accept the AST but no-op so the
9435                    // parser stays compatible with future password
9436                    // rotation work.
9437                }
9438            }
9439        }
9440
9441        auth_store
9442            .set_user_attributes(&target, attrs)
9443            .map_err(|e| RedDBError::Query(e.to_string()))?;
9444        if let Some(en) = enable_change {
9445            auth_store
9446                .set_user_enabled(&target, en)
9447                .map_err(|e| RedDBError::Query(e.to_string()))?;
9448        }
9449        self.invalidate_result_cache();
9450        tracing::info!(
9451            target: "audit",
9452            principal = %target,
9453            action = "alter_user",
9454            "ALTER USER applied"
9455        );
9456
9457        Ok(RuntimeQueryResult::ok_message(
9458            query.to_string(),
9459            &format!("ALTER USER {} applied", target),
9460            "alter_user",
9461        ))
9462    }
9463
9464    // -----------------------------------------------------------------
9465    // IAM policy executors
9466    // -----------------------------------------------------------------
9467
9468    fn execute_create_iam_policy(
9469        &self,
9470        query: &str,
9471        id: &str,
9472        json: &str,
9473    ) -> RedDBResult<RuntimeQueryResult> {
9474        use crate::auth::policies::Policy;
9475
9476        let auth_store = self
9477            .inner
9478            .auth_store
9479            .read()
9480            .clone()
9481            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9482
9483        // Parse + validate. The kernel rejects oversize / bad shape /
9484        // bad action keywords. If the supplied id differs from the JSON
9485        // id, override it with the SQL-provided id (the JSON id is
9486        // optional context — the SQL DDL form is authoritative).
9487        let mut policy = Policy::from_json_str(json)
9488            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
9489        if policy.id != id {
9490            policy.id = id.to_string();
9491        }
9492        let pid = policy.id.clone();
9493        auth_store
9494            .put_policy(policy)
9495            .map_err(|e| RedDBError::Query(e.to_string()))?;
9496
9497        let principal = current_auth_identity()
9498            .map(|(u, _)| u)
9499            .unwrap_or_else(|| "anonymous".into());
9500        tracing::info!(
9501            target: "audit",
9502            principal = %principal,
9503            action = "iam:policy.put",
9504            matched_policy_id = %pid,
9505            "CREATE POLICY applied"
9506        );
9507        self.inner.audit_log.record(
9508            "iam/policy.put",
9509            &principal,
9510            &pid,
9511            "ok",
9512            crate::json::Value::Null,
9513        );
9514
9515        self.invalidate_result_cache();
9516        Ok(RuntimeQueryResult::ok_message(
9517            query.to_string(),
9518            &format!("policy `{pid}` stored"),
9519            "create_iam_policy",
9520        ))
9521    }
9522
9523    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
9524        let auth_store = self
9525            .inner
9526            .auth_store
9527            .read()
9528            .clone()
9529            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9530        auth_store
9531            .delete_policy(id)
9532            .map_err(|e| RedDBError::Query(e.to_string()))?;
9533
9534        let principal = current_auth_identity()
9535            .map(|(u, _)| u)
9536            .unwrap_or_else(|| "anonymous".into());
9537        tracing::info!(
9538            target: "audit",
9539            principal = %principal,
9540            action = "iam:policy.drop",
9541            matched_policy_id = %id,
9542            "DROP POLICY applied"
9543        );
9544        self.inner.audit_log.record(
9545            "iam/policy.drop",
9546            &principal,
9547            id,
9548            "ok",
9549            crate::json::Value::Null,
9550        );
9551
9552        self.invalidate_result_cache();
9553        Ok(RuntimeQueryResult::ok_message(
9554            query.to_string(),
9555            &format!("policy `{id}` dropped"),
9556            "drop_iam_policy",
9557        ))
9558    }
9559
9560    fn execute_attach_policy(
9561        &self,
9562        query: &str,
9563        policy_id: &str,
9564        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9565    ) -> RedDBResult<RuntimeQueryResult> {
9566        use crate::auth::store::PrincipalRef;
9567        use crate::auth::UserId;
9568        use crate::storage::query::ast::PolicyPrincipalRef;
9569
9570        let auth_store = self
9571            .inner
9572            .auth_store
9573            .read()
9574            .clone()
9575            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9576        let p = match principal {
9577            PolicyPrincipalRef::User(u) => {
9578                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9579            }
9580            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9581        };
9582        let pretty_target = principal_label(principal);
9583        auth_store
9584            .attach_policy(p, policy_id)
9585            .map_err(|e| RedDBError::Query(e.to_string()))?;
9586
9587        let principal_str = current_auth_identity()
9588            .map(|(u, _)| u)
9589            .unwrap_or_else(|| "anonymous".into());
9590        tracing::info!(
9591            target: "audit",
9592            principal = %principal_str,
9593            action = "iam:policy.attach",
9594            matched_policy_id = %policy_id,
9595            target = %pretty_target,
9596            "ATTACH POLICY applied"
9597        );
9598        self.inner.audit_log.record(
9599            "iam/policy.attach",
9600            &principal_str,
9601            &pretty_target,
9602            "ok",
9603            crate::json::Value::Null,
9604        );
9605
9606        self.invalidate_result_cache();
9607        Ok(RuntimeQueryResult::ok_message(
9608            query.to_string(),
9609            &format!("policy `{policy_id}` attached to {pretty_target}"),
9610            "attach_policy",
9611        ))
9612    }
9613
9614    fn execute_detach_policy(
9615        &self,
9616        query: &str,
9617        policy_id: &str,
9618        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9619    ) -> RedDBResult<RuntimeQueryResult> {
9620        use crate::auth::store::PrincipalRef;
9621        use crate::auth::UserId;
9622        use crate::storage::query::ast::PolicyPrincipalRef;
9623
9624        let auth_store = self
9625            .inner
9626            .auth_store
9627            .read()
9628            .clone()
9629            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9630        let p = match principal {
9631            PolicyPrincipalRef::User(u) => {
9632                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9633            }
9634            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9635        };
9636        let pretty_target = principal_label(principal);
9637        auth_store
9638            .detach_policy(p, policy_id)
9639            .map_err(|e| RedDBError::Query(e.to_string()))?;
9640
9641        let principal_str = current_auth_identity()
9642            .map(|(u, _)| u)
9643            .unwrap_or_else(|| "anonymous".into());
9644        tracing::info!(
9645            target: "audit",
9646            principal = %principal_str,
9647            action = "iam:policy.detach",
9648            matched_policy_id = %policy_id,
9649            target = %pretty_target,
9650            "DETACH POLICY applied"
9651        );
9652        self.inner.audit_log.record(
9653            "iam/policy.detach",
9654            &principal_str,
9655            &pretty_target,
9656            "ok",
9657            crate::json::Value::Null,
9658        );
9659
9660        self.invalidate_result_cache();
9661        Ok(RuntimeQueryResult::ok_message(
9662            query.to_string(),
9663            &format!("policy `{policy_id}` detached from {pretty_target}"),
9664            "detach_policy",
9665        ))
9666    }
9667
9668    fn execute_show_policies(
9669        &self,
9670        query: &str,
9671        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
9672    ) -> RedDBResult<RuntimeQueryResult> {
9673        use crate::auth::UserId;
9674        use crate::storage::query::ast::PolicyPrincipalRef;
9675        use crate::storage::query::unified::UnifiedRecord;
9676        use crate::storage::schema::Value as SchemaValue;
9677        use std::sync::Arc;
9678
9679        let auth_store = self
9680            .inner
9681            .auth_store
9682            .read()
9683            .clone()
9684            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9685
9686        let pols = match filter {
9687            None => auth_store.list_policies(),
9688            Some(PolicyPrincipalRef::User(u)) => {
9689                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
9690                auth_store.effective_policies(&id)
9691            }
9692            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
9693        };
9694
9695        let mut records = Vec::with_capacity(pols.len());
9696        for p in pols.iter() {
9697            let mut rec = UnifiedRecord::default();
9698            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
9699            rec.set_arc(
9700                Arc::from("statements"),
9701                SchemaValue::Integer(p.statements.len() as i64),
9702            );
9703            rec.set_arc(
9704                Arc::from("tenant"),
9705                p.tenant
9706                    .as_deref()
9707                    .map(|t| SchemaValue::text(t.to_string()))
9708                    .unwrap_or(SchemaValue::Null),
9709            );
9710            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
9711            records.push(rec);
9712        }
9713        let mut result = crate::storage::query::unified::UnifiedResult::empty();
9714        result.records = records;
9715        Ok(RuntimeQueryResult {
9716            query: query.to_string(),
9717            mode: crate::storage::query::modes::QueryMode::Sql,
9718            statement: "show_policies",
9719            engine: "iam-policies",
9720            result,
9721            affected_rows: 0,
9722            statement_type: "select",
9723        })
9724    }
9725
9726    fn execute_show_effective_permissions(
9727        &self,
9728        query: &str,
9729        user: &crate::storage::query::ast::PolicyUserRef,
9730        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
9731    ) -> RedDBResult<RuntimeQueryResult> {
9732        use crate::auth::UserId;
9733        use crate::storage::query::unified::UnifiedRecord;
9734        use crate::storage::schema::Value as SchemaValue;
9735        use std::sync::Arc;
9736
9737        let auth_store = self
9738            .inner
9739            .auth_store
9740            .read()
9741            .clone()
9742            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9743        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
9744        let pols = auth_store.effective_policies(&id);
9745
9746        // Show one row per (policy, statement) tuple, plus any
9747        // resource-level filter passed by the caller.
9748        let mut records = Vec::new();
9749        for p in pols.iter() {
9750            for (idx, st) in p.statements.iter().enumerate() {
9751                if let Some(_r) = resource {
9752                    // Naive filter: render statement targets to strings
9753                    // and skip if no match. Conservative default = include
9754                    // (the simulator handles fine-grained matching).
9755                }
9756                let mut rec = UnifiedRecord::default();
9757                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
9758                rec.set_arc(
9759                    Arc::from("statement_index"),
9760                    SchemaValue::Integer(idx as i64),
9761                );
9762                rec.set_arc(
9763                    Arc::from("sid"),
9764                    st.sid
9765                        .as_deref()
9766                        .map(|s| SchemaValue::text(s.to_string()))
9767                        .unwrap_or(SchemaValue::Null),
9768                );
9769                rec.set_arc(
9770                    Arc::from("effect"),
9771                    SchemaValue::text(match st.effect {
9772                        crate::auth::policies::Effect::Allow => "allow",
9773                        crate::auth::policies::Effect::Deny => "deny",
9774                    }),
9775                );
9776                rec.set_arc(
9777                    Arc::from("actions"),
9778                    SchemaValue::Integer(st.actions.len() as i64),
9779                );
9780                rec.set_arc(
9781                    Arc::from("resources"),
9782                    SchemaValue::Integer(st.resources.len() as i64),
9783                );
9784                records.push(rec);
9785            }
9786        }
9787        let mut result = crate::storage::query::unified::UnifiedResult::empty();
9788        result.records = records;
9789        Ok(RuntimeQueryResult {
9790            query: query.to_string(),
9791            mode: crate::storage::query::modes::QueryMode::Sql,
9792            statement: "show_effective_permissions",
9793            engine: "iam-policies",
9794            result,
9795            affected_rows: 0,
9796            statement_type: "select",
9797        })
9798    }
9799
    /// Runs a policy simulation for `user` performing `action` on
    /// `resource` and returns the verdict as a single record.
    ///
    /// The simulation is delegated to the auth store's `simulate` with a
    /// default `SimCtx`. Every call also emits a `tracing` info event on
    /// the `audit` target and records an `iam/policy.simulate` entry in
    /// the runtime audit log.
    ///
    /// The returned record carries `decision`, `matched_policy_id`,
    /// `matched_sid`, `reason` and `trail_len` (the number of entries in
    /// the simulation trail).
    ///
    /// # Errors
    /// Returns `RedDBError::Query` when no auth store is configured.
    fn execute_simulate_policy(
        &self,
        query: &str,
        user: &crate::storage::query::ast::PolicyUserRef,
        action: &str,
        resource: &crate::storage::query::ast::PolicyResourceRef,
    ) -> RedDBResult<RuntimeQueryResult> {
        use crate::auth::policies::ResourceRef;
        use crate::auth::store::SimCtx;
        use crate::auth::UserId;
        use crate::storage::query::unified::UnifiedRecord;
        use crate::storage::schema::Value as SchemaValue;
        use std::sync::Arc;

        let auth_store = self
            .inner
            .auth_store
            .read()
            .clone()
            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
        // `id` is the simulated subject, not the caller issuing the statement.
        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());

        // The audit principal is the caller's identity for the current
        // statement, falling back to "anonymous" when none is set.
        let principal_str = current_auth_identity()
            .map(|(u, _)| u)
            .unwrap_or_else(|| "anonymous".into());
        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
        tracing::info!(
            target: "audit",
            principal = %principal_str,
            action = "iam:policy.simulate",
            decision = %decision_str,
            matched_policy_id = ?matched_pid,
            matched_sid = ?matched_sid,
            "SIMULATE issued"
        );
        // NOTE(review): the entry is recorded with "ok" whatever the
        // simulated decision was — presumably it reflects that the
        // statement ran, not the verdict; confirm against the audit log API.
        self.inner.audit_log.record(
            "iam/policy.simulate",
            &principal_str,
            &id.to_string(),
            "ok",
            crate::json::Value::Null,
        );

        let mut rec = UnifiedRecord::default();
        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
        rec.set_arc(
            Arc::from("matched_policy_id"),
            matched_pid
                .map(SchemaValue::text)
                .unwrap_or(SchemaValue::Null),
        );
        rec.set_arc(
            Arc::from("matched_sid"),
            matched_sid
                .map(SchemaValue::text)
                .unwrap_or(SchemaValue::Null),
        );
        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
        rec.set_arc(
            Arc::from("trail_len"),
            SchemaValue::Integer(outcome.trail.len() as i64),
        );
        let mut result = crate::storage::query::unified::UnifiedResult::empty();
        result.records = vec![rec];
        Ok(RuntimeQueryResult {
            query: query.to_string(),
            mode: crate::storage::query::modes::QueryMode::Sql,
            statement: "simulate_policy",
            engine: "iam-policies",
            result,
            affected_rows: 0,
            statement_type: "select",
        })
    }
9876}
9877
9878/// Translate a parsed GRANT into a synthetic IAM policy whose id
9879/// starts with `_grant_<unique>`. PUBLIC is represented as an
9880/// implicit IAM group; legacy GROUP grants are still rejected by the
9881/// grant store and are not translated here.
9882fn grant_to_iam_policy(
9883    principal: &crate::auth::privileges::GrantPrincipal,
9884    resource: &crate::auth::privileges::Resource,
9885    actions: &[crate::auth::privileges::Action],
9886    tenant: Option<&str>,
9887) -> Option<crate::auth::policies::Policy> {
9888    use crate::auth::policies::{
9889        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
9890    };
9891    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
9892
9893    if matches!(principal, GrantPrincipal::Group(_)) {
9894        return None;
9895    }
9896
9897    let now = crate::auth::now_ms();
9898    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
9899
9900    let resource_str = match resource {
9901        Resource::Database => "table:*".to_string(),
9902        Resource::Schema(s) => format!("table:{s}.*"),
9903        Resource::Table { schema, table } => match schema {
9904            Some(s) => format!("table:{s}.{table}"),
9905            None => format!("table:{table}"),
9906        },
9907        Resource::Function { schema, name } => match schema {
9908            Some(s) => format!("function:{s}.{name}"),
9909            None => format!("function:{name}"),
9910        },
9911    };
9912
9913    // Compile actions — fall back to `*` only when the grant included
9914    // `Action::All`. Map every other action keyword to its lowercase
9915    // form so it lines up with the kernel's allowlist.
9916    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
9917        vec![ActionPattern::Wildcard]
9918    } else {
9919        actions
9920            .iter()
9921            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
9922            .collect()
9923    };
9924    if action_patterns.is_empty() {
9925        return None;
9926    }
9927
9928    // Inline resource compilation matching the kernel's `compile_resource`:
9929    //   * `*` → wildcard
9930    //   * contains `*` → glob
9931    //   * `kind:name` → exact
9932    let resource_patterns = if resource_str == "*" {
9933        vec![ResourcePattern::Wildcard]
9934    } else if resource_str.contains('*') {
9935        vec![ResourcePattern::Glob(resource_str.clone())]
9936    } else if let Some((kind, name)) = resource_str.split_once(':') {
9937        vec![ResourcePattern::Exact {
9938            kind: kind.to_string(),
9939            name: name.to_string(),
9940        }]
9941    } else {
9942        vec![ResourcePattern::Wildcard]
9943    };
9944
9945    let policy = Policy {
9946        id,
9947        version: 1,
9948        tenant: tenant.map(|t| t.to_string()),
9949        created_at: now,
9950        updated_at: now,
9951        statements: vec![Statement {
9952            sid: None,
9953            effect: Effect::Allow,
9954            actions: action_patterns,
9955            resources: resource_patterns,
9956            condition: None,
9957        }],
9958    };
9959    if policy.validate().is_err() {
9960        return None;
9961    }
9962    Some(policy)
9963}
9964
9965fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
9966    use crate::auth::privileges::Action;
9967    match action {
9968        Action::Select => "select",
9969        Action::Insert => "insert",
9970        Action::Update => "update",
9971        Action::Delete => "delete",
9972        Action::Truncate => "truncate",
9973        Action::References => "references",
9974        Action::Execute => "execute",
9975        Action::Usage => "usage",
9976        Action::All => "*",
9977    }
9978}
9979
9980fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
9981    let mut columns = Vec::new();
9982    for (column, _) in &query.assignment_exprs {
9983        if !columns.iter().any(|seen| seen == column) {
9984            columns.push(column.clone());
9985        }
9986    }
9987    columns
9988}
9989
9990fn column_access_request_for_table_update(
9991    table_name: &str,
9992    columns: Vec<String>,
9993) -> crate::auth::ColumnAccessRequest {
9994    match table_name.split_once('.') {
9995        Some((schema, table)) => {
9996            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
9997                .with_schema(schema.to_string())
9998        }
9999        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
10000    }
10001}
10002
10003fn column_access_request_for_table_select(
10004    table_name: &str,
10005    columns: Vec<String>,
10006) -> crate::auth::ColumnAccessRequest {
10007    match table_name.split_once('.') {
10008        Some((schema, table)) => {
10009            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
10010                .with_schema(schema.to_string())
10011        }
10012        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
10013    }
10014}
10015
10016fn update_returning_columns_for_policy(
10017    runtime: &RedDBRuntime,
10018    query: &crate::storage::query::ast::UpdateQuery,
10019) -> Option<Vec<String>> {
10020    let items = query.returning.as_ref()?;
10021    let mut columns = Vec::new();
10022    let project_all = items
10023        .iter()
10024        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
10025    if project_all {
10026        collect_returning_star_columns(runtime, query, &mut columns);
10027    } else {
10028        for item in items {
10029            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
10030                continue;
10031            };
10032            push_returning_policy_column(&mut columns, column);
10033        }
10034    }
10035    (!columns.is_empty()).then_some(columns)
10036}
10037
10038fn collect_returning_star_columns(
10039    runtime: &RedDBRuntime,
10040    query: &crate::storage::query::ast::UpdateQuery,
10041    columns: &mut Vec<String>,
10042) {
10043    let store = runtime.db().store();
10044    let Some(manager) = store.get_collection(&query.table) else {
10045        return;
10046    };
10047    if let Some(schema) = manager.column_schema() {
10048        for column in schema.iter() {
10049            push_returning_policy_column(columns, column);
10050        }
10051    }
10052    for entity in manager.query_all(|_| true) {
10053        if !returning_entity_matches_update_target(&entity, query.target) {
10054            continue;
10055        }
10056        match &entity.data {
10057            crate::storage::EntityData::Row(row) => {
10058                for (column, _) in row.iter_fields() {
10059                    push_returning_policy_column(columns, column);
10060                }
10061            }
10062            crate::storage::EntityData::Node(node) => {
10063                push_returning_policy_column(columns, "label");
10064                push_returning_policy_column(columns, "node_type");
10065                for column in node.properties.keys() {
10066                    push_returning_policy_column(columns, column);
10067                }
10068            }
10069            crate::storage::EntityData::Edge(edge) => {
10070                push_returning_policy_column(columns, "label");
10071                push_returning_policy_column(columns, "from_rid");
10072                push_returning_policy_column(columns, "to_rid");
10073                push_returning_policy_column(columns, "weight");
10074                for column in edge.properties.keys() {
10075                    push_returning_policy_column(columns, column);
10076                }
10077            }
10078            _ => {}
10079        }
10080    }
10081}
10082
10083fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
10084    if returning_public_envelope_column(column) {
10085        return;
10086    }
10087    if !columns.iter().any(|seen| seen == column) {
10088        columns.push(column.to_string());
10089    }
10090}
10091
/// Reports whether `column` names one of the envelope fields that
/// `push_returning_policy_column` keeps out of the policy column list.
///
/// The comparison ignores ASCII case and does not allocate, which
/// matters because it runs once per column of every scanned entity.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 7] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
        "red_entity_id",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|envelope| column.eq_ignore_ascii_case(envelope))
}
10098
10099fn returning_entity_matches_update_target(
10100    entity: &crate::storage::UnifiedEntity,
10101    target: crate::storage::query::ast::UpdateTarget,
10102) -> bool {
10103    use crate::storage::query::ast::UpdateTarget;
10104    match target {
10105        UpdateTarget::Rows => {
10106            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
10107        }
10108        UpdateTarget::Documents => {
10109            matches!(
10110                returning_row_item_kind(entity),
10111                Some(ReturningRowKind::Document)
10112            )
10113        }
10114        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
10115        UpdateTarget::Nodes => matches!(
10116            (&entity.kind, &entity.data),
10117            (
10118                crate::storage::EntityKind::GraphNode(_),
10119                crate::storage::EntityData::Node(_)
10120            )
10121        ),
10122        UpdateTarget::Edges => matches!(
10123            (&entity.kind, &entity.data),
10124            (
10125                crate::storage::EntityKind::GraphEdge(_),
10126                crate::storage::EntityData::Edge(_)
10127            )
10128        ),
10129    }
10130}
10131
/// Shape of a row-backed entity as classified by
/// `returning_row_item_kind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReturningRowKind {
    /// Not key/value-only and holds no JSON value.
    Row,
    /// Not key/value-only and holds at least one JSON value.
    Document,
    /// Every field is named `key` or `value` (ASCII case-insensitive).
    Kv,
}
10138
10139fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
10140    let row = entity.data.as_row()?;
10141    let is_kv = row.iter_fields().all(|(column, _)| {
10142        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
10143    });
10144    if is_kv {
10145        return Some(ReturningRowKind::Kv);
10146    }
10147    let is_document = row
10148        .iter_fields()
10149        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
10150    if is_document {
10151        Some(ReturningRowKind::Document)
10152    } else {
10153        Some(ReturningRowKind::Row)
10154    }
10155}
10156
10157fn requested_table_columns_for_policy(
10158    table: &crate::storage::query::ast::TableQuery,
10159) -> Vec<String> {
10160    use crate::storage::query::sql_lowering::{
10161        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
10162        effective_table_projections,
10163    };
10164
10165    let table_name = table.table.as_str();
10166    let table_alias = table.alias.as_deref();
10167    let mut columns = std::collections::BTreeSet::new();
10168
10169    for projection in effective_table_projections(table) {
10170        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
10171    }
10172    if let Some(filter) = effective_table_filter(table) {
10173        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
10174    }
10175    for expr in effective_table_group_by_exprs(table) {
10176        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
10177    }
10178    if let Some(filter) = effective_table_having_filter(table) {
10179        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
10180    }
10181    for order in &table.order_by {
10182        if let Some(expr) = order.expr.as_ref() {
10183            collect_expr_columns(expr, table_name, table_alias, &mut columns);
10184        } else {
10185            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
10186        }
10187    }
10188
10189    columns.into_iter().collect()
10190}
10191
10192fn collect_projection_columns(
10193    projection: &crate::storage::query::ast::Projection,
10194    table_name: &str,
10195    table_alias: Option<&str>,
10196    columns: &mut std::collections::BTreeSet<String>,
10197) {
10198    use crate::storage::query::ast::Projection;
10199    match projection {
10200        Projection::All => {
10201            columns.insert("*".to_string());
10202        }
10203        Projection::Column(column) | Projection::Alias(column, _) => {
10204            if column != "*" {
10205                columns.insert(column.clone());
10206            }
10207        }
10208        Projection::Function(_, args) => {
10209            for arg in args {
10210                collect_projection_columns(arg, table_name, table_alias, columns);
10211            }
10212        }
10213        Projection::Expression(filter, _) => {
10214            collect_filter_columns(filter, table_name, table_alias, columns);
10215        }
10216        Projection::Field(field, _) => {
10217            collect_field_ref_column(field, table_name, table_alias, columns);
10218        }
10219    }
10220}
10221
10222fn collect_filter_columns(
10223    filter: &crate::storage::query::ast::Filter,
10224    table_name: &str,
10225    table_alias: Option<&str>,
10226    columns: &mut std::collections::BTreeSet<String>,
10227) {
10228    use crate::storage::query::ast::Filter;
10229    match filter {
10230        Filter::Compare { field, .. }
10231        | Filter::IsNull(field)
10232        | Filter::IsNotNull(field)
10233        | Filter::In { field, .. }
10234        | Filter::Between { field, .. }
10235        | Filter::Like { field, .. }
10236        | Filter::StartsWith { field, .. }
10237        | Filter::EndsWith { field, .. }
10238        | Filter::Contains { field, .. } => {
10239            collect_field_ref_column(field, table_name, table_alias, columns);
10240        }
10241        Filter::CompareFields { left, right, .. } => {
10242            collect_field_ref_column(left, table_name, table_alias, columns);
10243            collect_field_ref_column(right, table_name, table_alias, columns);
10244        }
10245        Filter::CompareExpr { lhs, rhs, .. } => {
10246            collect_expr_columns(lhs, table_name, table_alias, columns);
10247            collect_expr_columns(rhs, table_name, table_alias, columns);
10248        }
10249        Filter::And(left, right) | Filter::Or(left, right) => {
10250            collect_filter_columns(left, table_name, table_alias, columns);
10251            collect_filter_columns(right, table_name, table_alias, columns);
10252        }
10253        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
10254    }
10255}
10256
10257fn collect_expr_columns(
10258    expr: &crate::storage::query::ast::Expr,
10259    table_name: &str,
10260    table_alias: Option<&str>,
10261    columns: &mut std::collections::BTreeSet<String>,
10262) {
10263    use crate::storage::query::ast::Expr;
10264    match expr {
10265        Expr::Column { field, .. } => {
10266            collect_field_ref_column(field, table_name, table_alias, columns);
10267        }
10268        Expr::Literal { .. } | Expr::Parameter { .. } => {}
10269        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
10270            collect_expr_columns(operand, table_name, table_alias, columns);
10271        }
10272        Expr::BinaryOp { lhs, rhs, .. } => {
10273            collect_expr_columns(lhs, table_name, table_alias, columns);
10274            collect_expr_columns(rhs, table_name, table_alias, columns);
10275        }
10276        Expr::FunctionCall { args, .. } => {
10277            for arg in args {
10278                collect_expr_columns(arg, table_name, table_alias, columns);
10279            }
10280        }
10281        Expr::Case {
10282            branches, else_, ..
10283        } => {
10284            for (condition, value) in branches {
10285                collect_expr_columns(condition, table_name, table_alias, columns);
10286                collect_expr_columns(value, table_name, table_alias, columns);
10287            }
10288            if let Some(value) = else_ {
10289                collect_expr_columns(value, table_name, table_alias, columns);
10290            }
10291        }
10292        Expr::IsNull { operand, .. } => {
10293            collect_expr_columns(operand, table_name, table_alias, columns);
10294        }
10295        Expr::InList { target, values, .. } => {
10296            collect_expr_columns(target, table_name, table_alias, columns);
10297            for value in values {
10298                collect_expr_columns(value, table_name, table_alias, columns);
10299            }
10300        }
10301        Expr::Between {
10302            target, low, high, ..
10303        } => {
10304            collect_expr_columns(target, table_name, table_alias, columns);
10305            collect_expr_columns(low, table_name, table_alias, columns);
10306            collect_expr_columns(high, table_name, table_alias, columns);
10307        }
10308        Expr::Subquery { .. } => {}
10309    }
10310}
10311
10312fn collect_field_ref_column(
10313    field: &crate::storage::query::ast::FieldRef,
10314    table_name: &str,
10315    table_alias: Option<&str>,
10316    columns: &mut std::collections::BTreeSet<String>,
10317) {
10318    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
10319        if column != "*" {
10320            columns.insert(column);
10321        }
10322    }
10323}
10324
10325fn policy_column_name_from_field_ref(
10326    field: &crate::storage::query::ast::FieldRef,
10327    table_name: &str,
10328    table_alias: Option<&str>,
10329) -> Option<String> {
10330    match field {
10331        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
10332            if column == "*" {
10333                return Some("*".to_string());
10334            }
10335            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
10336                Some(column.clone())
10337            } else {
10338                Some(format!("{table}.{column}"))
10339            }
10340        }
10341        _ => None,
10342    }
10343}
10344
10345fn legacy_resource_to_iam(
10346    resource: &crate::auth::privileges::Resource,
10347    tenant: Option<&str>,
10348) -> crate::auth::policies::ResourceRef {
10349    use crate::auth::privileges::Resource;
10350
10351    let (kind, name) = match resource {
10352        Resource::Database => ("database".to_string(), "*".to_string()),
10353        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
10354        Resource::Table { schema, table } => (
10355            "table".to_string(),
10356            match schema {
10357                Some(s) => format!("{s}.{table}"),
10358                None => table.clone(),
10359            },
10360        ),
10361        Resource::Function { schema, name } => (
10362            "function".to_string(),
10363            match schema {
10364                Some(s) => format!("{s}.{name}"),
10365                None => name.clone(),
10366            },
10367        ),
10368    };
10369
10370    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
10371    if let Some(t) = tenant {
10372        out = out.with_tenant(t.to_string());
10373    }
10374    out
10375}
10376
/// One side of a join that is a plain table: the table name and the
/// name it is referred to by in the query.
#[derive(Debug)]
struct JoinTableSide {
    /// Table name as written in the query.
    table: String,
    /// Alias of the table; `table_side_context` falls back to the table
    /// name when the query gives no alias.
    alias: String,
}
10382
10383fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
10384    match expr {
10385        QueryExpr::Table(table) => Some(JoinTableSide {
10386            table: table.table.clone(),
10387            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
10388        }),
10389        _ => None,
10390    }
10391}
10392
10393fn collect_projection_columns_for_table(
10394    projection: &Projection,
10395    table: &str,
10396    alias: Option<&str>,
10397    out: &mut BTreeSet<String>,
10398) {
10399    match projection {
10400        Projection::Column(column) | Projection::Alias(column, _) => {
10401            match split_qualified_column(column) {
10402                Some((qualifier, column))
10403                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
10404                {
10405                    push_policy_column(column, out);
10406                }
10407                Some(_) => {}
10408                None => push_policy_column(column, out),
10409            }
10410        }
10411        Projection::Field(
10412            FieldRef::TableColumn {
10413                table: qualifier,
10414                column,
10415            },
10416            _,
10417        ) => {
10418            if qualifier.is_empty()
10419                || qualifier == table
10420                || alias.is_some_and(|alias| qualifier == alias)
10421            {
10422                push_policy_column(column, out);
10423            }
10424        }
10425        Projection::Field(
10426            FieldRef::NodeProperty {
10427                alias: qualifier,
10428                property,
10429            },
10430            _,
10431        )
10432        | Projection::Field(
10433            FieldRef::EdgeProperty {
10434                alias: qualifier,
10435                property,
10436            },
10437            _,
10438        ) => {
10439            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
10440                push_policy_column(property, out);
10441            }
10442        }
10443        Projection::Function(_, args) => {
10444            for arg in args {
10445                collect_projection_columns_for_table(arg, table, alias, out);
10446            }
10447        }
10448        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
10449    }
10450}
10451
10452fn collect_projection_columns_for_join_side(
10453    projection: &Projection,
10454    left: Option<&JoinTableSide>,
10455    right: Option<&JoinTableSide>,
10456    out: &mut HashMap<String, BTreeSet<String>>,
10457) -> RedDBResult<()> {
10458    match projection {
10459        Projection::Column(column) | Projection::Alias(column, _) => {
10460            if let Some((qualifier, column)) = split_qualified_column(column) {
10461                push_qualified_join_column(qualifier, column, left, right, out);
10462            } else {
10463                push_unqualified_join_column(column, left, right, out);
10464            }
10465        }
10466        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
10467            if table.is_empty() {
10468                push_unqualified_join_column(column, left, right, out);
10469            } else if let Some(side) = [left, right]
10470                .into_iter()
10471                .flatten()
10472                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
10473            {
10474                push_join_column(&side.table, column, out);
10475            }
10476        }
10477        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
10478        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
10479            push_qualified_join_column(alias, property, left, right, out);
10480        }
10481        Projection::Function(_, args) => {
10482            for arg in args {
10483                collect_projection_columns_for_join_side(arg, left, right, out)?;
10484            }
10485        }
10486        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
10487    }
10488    Ok(())
10489}
10490
/// Splits `qualifier.column` into its two parts.
///
/// Returns `None` when there is no dot, when either part is empty, or
/// when more than one dot is present.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    match column.split_once('.') {
        Some((qualifier, rest))
            if !qualifier.is_empty() && !rest.is_empty() && !rest.contains('.') =>
        {
            Some((qualifier, rest))
        }
        _ => None,
    }
}
10498
10499fn push_qualified_join_column(
10500    qualifier: &str,
10501    column: &str,
10502    left: Option<&JoinTableSide>,
10503    right: Option<&JoinTableSide>,
10504    out: &mut HashMap<String, BTreeSet<String>>,
10505) {
10506    if let Some(side) = [left, right]
10507        .into_iter()
10508        .flatten()
10509        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
10510    {
10511        push_join_column(&side.table, column, out);
10512    }
10513}
10514
10515fn push_unqualified_join_column(
10516    column: &str,
10517    left: Option<&JoinTableSide>,
10518    right: Option<&JoinTableSide>,
10519    out: &mut HashMap<String, BTreeSet<String>>,
10520) {
10521    for side in [left, right].into_iter().flatten() {
10522        push_join_column(&side.table, column, out);
10523    }
10524}
10525
10526fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
10527    if is_policy_column_name(column) {
10528        out.entry(table.to_string())
10529            .or_default()
10530            .insert(column.to_string());
10531    }
10532}
10533
10534fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
10535    if is_policy_column_name(column) {
10536        out.insert(column.to_string());
10537    }
10538}
10539
/// Reports whether `column` is a real column name for policy purposes.
///
/// Rejects the empty string, the `*` wildcard and any name starting with
/// `LIT:` or `TYPE:` (case-sensitive) — presumably markers for literal
/// and type pseudo-columns; confirm against the projection lowering.
fn is_policy_column_name(column: &str) -> bool {
    const REJECTED_PREFIXES: [&str; 2] = ["LIT:", "TYPE:"];

    if column.is_empty() || column == "*" {
        return false;
    }
    !REJECTED_PREFIXES
        .iter()
        .any(|prefix| column.starts_with(*prefix))
}
10546
10547fn runtime_iam_context(
10548    role: crate::auth::Role,
10549    tenant: Option<&str>,
10550) -> crate::auth::policies::EvalContext {
10551    crate::auth::policies::EvalContext {
10552        principal_tenant: tenant.map(|t| t.to_string()),
10553        current_tenant: tenant.map(|t| t.to_string()),
10554        peer_ip: None,
10555        mfa_present: false,
10556        now_ms: crate::auth::now_ms(),
10557        principal_is_admin_role: role == crate::auth::Role::Admin,
10558    }
10559}
10560
10561fn explicit_table_projection_columns(
10562    query: &crate::storage::query::ast::TableQuery,
10563) -> Vec<String> {
10564    use crate::storage::query::ast::{FieldRef, Projection};
10565
10566    let mut columns = Vec::new();
10567    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
10568        match projection {
10569            Projection::Column(column) | Projection::Alias(column, _) => {
10570                push_unique(&mut columns, column)
10571            }
10572            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
10573                push_unique(&mut columns, column)
10574            }
10575            // SELECT * and expression/function projections need the
10576            // executor-wide column-policy context mapped in
10577            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
10578            _ => {}
10579        }
10580    }
10581    columns
10582}
10583
10584fn explicit_graph_projection_properties(
10585    query: &crate::storage::query::ast::GraphQuery,
10586) -> Vec<String> {
10587    use crate::storage::query::ast::{FieldRef, Projection};
10588
10589    let mut columns = Vec::new();
10590    for projection in &query.return_ {
10591        match projection {
10592            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
10593            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
10594                push_unique(&mut columns, property.clone())
10595            }
10596            _ => {}
10597        }
10598    }
10599    columns
10600}
10601
/// Appends `column` to `columns` unless an equal entry is already present,
/// so the first occurrence keeps its position.
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
10607
10608fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
10609    use crate::storage::query::ast::PolicyPrincipalRef;
10610    match p {
10611        PolicyPrincipalRef::User(u) => match &u.tenant {
10612            Some(t) => format!("user:{t}/{}", u.username),
10613            None => format!("user:{}", u.username),
10614        },
10615        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
10616    }
10617}
10618
10619/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
10620/// shape used by every audit emit + the simulator response.
10621pub(crate) fn decision_to_strings(
10622    d: &crate::auth::policies::Decision,
10623) -> (String, Option<String>, Option<String>) {
10624    use crate::auth::policies::Decision;
10625    match d {
10626        Decision::Allow {
10627            matched_policy_id,
10628            matched_sid,
10629        } => (
10630            "allow".into(),
10631            Some(matched_policy_id.clone()),
10632            matched_sid.clone(),
10633        ),
10634        Decision::Deny {
10635            matched_policy_id,
10636            matched_sid,
10637        } => (
10638            "deny".into(),
10639            Some(matched_policy_id.clone()),
10640            matched_sid.clone(),
10641        ),
10642        Decision::DefaultDeny => ("default_deny".into(), None, None),
10643        Decision::AdminBypass => ("admin_bypass".into(), None, None),
10644    }
10645}
10646
10647fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
10648    let mut scopes = Vec::new();
10649    collect_relation_scopes(query, &mut scopes);
10650    scopes.sort();
10651    scopes.dedup();
10652    scopes
10653}
10654
10655fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
10656    match query {
10657        QueryExpr::Table(table) => {
10658            if !table.table.is_empty() {
10659                scopes.push(table.table.clone());
10660            }
10661            if let Some(alias) = &table.alias {
10662                scopes.push(alias.clone());
10663            }
10664        }
10665        QueryExpr::Join(join) => {
10666            collect_relation_scopes(&join.left, scopes);
10667            collect_relation_scopes(&join.right, scopes);
10668        }
10669        _ => {}
10670    }
10671}
10672
10673fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
10674    let inner_scopes = relation_scopes_for_query(query);
10675    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
10676}
10677
10678fn query_expr_references_outer_scope(
10679    query: &QueryExpr,
10680    outer_scopes: &[String],
10681    inner_scopes: &[String],
10682) -> bool {
10683    match query {
10684        QueryExpr::Table(table) => {
10685            table.select_items.iter().any(|item| match item {
10686                crate::storage::query::ast::SelectItem::Wildcard => false,
10687                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
10688                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
10689                }
10690            }) || table
10691                .where_expr
10692                .as_ref()
10693                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
10694                || table.filter.as_ref().is_some_and(|filter| {
10695                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
10696                })
10697                || table.having_expr.as_ref().is_some_and(|expr| {
10698                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
10699                })
10700                || table.having.as_ref().is_some_and(|filter| {
10701                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
10702                })
10703                || table
10704                    .group_by_exprs
10705                    .iter()
10706                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
10707                || table.order_by.iter().any(|clause| {
10708                    clause.expr.as_ref().is_some_and(|expr| {
10709                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
10710                    })
10711                })
10712        }
10713        QueryExpr::Join(join) => {
10714            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
10715                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
10716                || join.filter.as_ref().is_some_and(|filter| {
10717                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
10718                })
10719                || join.return_items.iter().any(|item| match item {
10720                    crate::storage::query::ast::SelectItem::Wildcard => false,
10721                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
10722                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
10723                    }
10724                })
10725        }
10726        _ => false,
10727    }
10728}
10729
10730fn filter_references_outer_scope(
10731    filter: &crate::storage::query::ast::Filter,
10732    outer_scopes: &[String],
10733    inner_scopes: &[String],
10734) -> bool {
10735    use crate::storage::query::ast::Filter;
10736    match filter {
10737        Filter::Compare { field, .. }
10738        | Filter::IsNull(field)
10739        | Filter::IsNotNull(field)
10740        | Filter::In { field, .. }
10741        | Filter::Between { field, .. }
10742        | Filter::Like { field, .. }
10743        | Filter::StartsWith { field, .. }
10744        | Filter::EndsWith { field, .. }
10745        | Filter::Contains { field, .. } => {
10746            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
10747        }
10748        Filter::CompareFields { left, right, .. } => {
10749            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
10750                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
10751        }
10752        Filter::CompareExpr { lhs, rhs, .. } => {
10753            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
10754                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
10755        }
10756        Filter::And(left, right) | Filter::Or(left, right) => {
10757            filter_references_outer_scope(left, outer_scopes, inner_scopes)
10758                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
10759        }
10760        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
10761    }
10762}
10763
10764fn expr_references_outer_scope(
10765    expr: &crate::storage::query::ast::Expr,
10766    outer_scopes: &[String],
10767    inner_scopes: &[String],
10768) -> bool {
10769    use crate::storage::query::ast::Expr;
10770    match expr {
10771        Expr::Column { field, .. } => {
10772            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
10773        }
10774        Expr::BinaryOp { lhs, rhs, .. } => {
10775            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
10776                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
10777        }
10778        Expr::UnaryOp { operand, .. }
10779        | Expr::Cast { inner: operand, .. }
10780        | Expr::IsNull { operand, .. } => {
10781            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
10782        }
10783        Expr::FunctionCall { args, .. } => args
10784            .iter()
10785            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
10786        Expr::Case {
10787            branches, else_, ..
10788        } => {
10789            branches.iter().any(|(cond, value)| {
10790                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
10791                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
10792            }) || else_
10793                .as_ref()
10794                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
10795        }
10796        Expr::InList { target, values, .. } => {
10797            expr_references_outer_scope(target, outer_scopes, inner_scopes)
10798                || values
10799                    .iter()
10800                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
10801        }
10802        Expr::Between {
10803            target, low, high, ..
10804        } => {
10805            expr_references_outer_scope(target, outer_scopes, inner_scopes)
10806                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
10807                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
10808        }
10809        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
10810        Expr::Literal { .. } | Expr::Parameter { .. } => false,
10811    }
10812}
10813
10814fn field_ref_references_outer_scope(
10815    field: &crate::storage::query::ast::FieldRef,
10816    outer_scopes: &[String],
10817    inner_scopes: &[String],
10818) -> bool {
10819    match field {
10820        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
10821            outer_scopes.iter().any(|scope| scope == table)
10822                && !inner_scopes.iter().any(|scope| scope == table)
10823        }
10824        _ => false,
10825    }
10826}
10827
10828fn first_column_values(
10829    result: crate::storage::query::unified::UnifiedResult,
10830) -> RedDBResult<Vec<Value>> {
10831    if result.columns.len() > 1 {
10832        return Err(RedDBError::Query(
10833            "expression subquery must return exactly one column".to_string(),
10834        ));
10835    }
10836    let fallback_column = result
10837        .records
10838        .first()
10839        .and_then(|record| record.column_names().into_iter().next())
10840        .map(|name| name.to_string());
10841    let column = result.columns.first().cloned().or(fallback_column);
10842    let Some(column) = column else {
10843        return Ok(Vec::new());
10844    };
10845    Ok(result
10846        .records
10847        .iter()
10848        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
10849        .collect())
10850}
10851
/// Parses a timestamp string into milliseconds since the Unix epoch.
///
/// Two forms are accepted:
/// * a bare non-negative integer, taken verbatim as milliseconds;
/// * a `YYYY-MM-DD` calendar date, optionally followed by whitespace and
///   further text (such as a time of day) that is ignored — the result is
///   midnight UTC of that date. Full RFC 3339 parsing is not implemented.
///
/// Returns `None` for anything else. That includes dates that do not exist
/// (`2030-02-30`, month `13`, day `0`) and dates before 1970-01-01, which
/// an unsigned millisecond count cannot represent. Such inputs previously
/// overflowed or were silently mapped to a different day.
fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
    /// Largest year whose day count since the epoch is guaranteed to fit
    /// in `i64` (a year never has more than 366 days).
    const MAX_YEAR: i64 = i64::MAX / 366;
    const MS_PER_DAY: u128 = 86_400_000;

    /// Number of days in month `m` (1-12) of proleptic Gregorian year `y`.
    fn days_in_month(y: i64, m: u32) -> u32 {
        match m {
            4 | 6 | 9 | 11 => 30,
            2 if (y % 4 == 0 && y % 100 != 0) || y % 400 == 0 => 29,
            2 => 28,
            _ => 31,
        }
    }

    // Bare integer ms.
    if let Ok(n) = s.parse::<u128>() {
        return Some(n);
    }

    // Date form: only the first whitespace-delimited token is considered,
    // and it must consist of exactly three `-`-separated numeric fields.
    let date = s.split_whitespace().next()?;
    let mut fields = date.split('-');
    let (y, m, d) = (fields.next()?, fields.next()?, fields.next()?);
    if fields.next().is_some() {
        return None;
    }
    let y = y.parse::<i64>().ok()?;
    let m = m.parse::<u32>().ok()?;
    let d = d.parse::<u32>().ok()?;

    // Reject anything that is not a real calendar date on or after the
    // epoch; this also keeps the day arithmetic below free of overflow.
    if !(1970..=MAX_YEAR).contains(&y) || !(1..=12).contains(&m) {
        return None;
    }
    if d == 0 || d > days_in_month(y, m) {
        return None;
    }

    let days = days_from_civil(y, m, d);
    if days < 0 {
        return None;
    }
    // `days` is non-negative here, so widening to u128 is lossless.
    Some((days as u128) * MS_PER_DAY)
}

/// Days from the Unix epoch (1970-01-01) to the given proleptic Gregorian
/// date, using Howard Hinnant's `days_from_civil` algorithm. Dates before
/// the epoch yield a negative count. Used by `parse_timestamp_to_ms`.
///
/// The inputs are not validated: callers are expected to pass a real
/// calendar date. The arithmetic is signed throughout, so a day of `0`
/// does not underflow. `y` must be small enough that the resulting day
/// count fits in `i64`.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    // Shift the year so that it starts on 1 March; the leap day then falls
    // at the very end of the shifted year.
    let y = if m <= 2 { y - 1 } else { y };
    let era = y.div_euclid(400);
    let yoe = y.rem_euclid(400); // [0, 399]
    let mp = i64::from(if m > 2 { m - 3 } else { m + 9 }); // March = 0
    let doy = (153 * mp + 2) / 5 + i64::from(d) - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    era * 146_097 + doe - 719_468
}
10886
10887fn walk_plan_node(
10888    node: &crate::storage::query::planner::CanonicalLogicalNode,
10889    depth: usize,
10890    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
10891) {
10892    use std::sync::Arc;
10893    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
10894    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
10895    rec.set_arc(
10896        Arc::from("source"),
10897        node.source.clone().map(Value::text).unwrap_or(Value::Null),
10898    );
10899    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
10900    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
10901    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
10902    out.push(rec);
10903    for child in &node.children {
10904        walk_plan_node(child, depth + 1, out);
10905    }
10906}