Skip to main content

reddb_server/runtime/
impl_core.rs

1use super::*;
2use crate::application::entity::metadata_to_json;
3use crate::auth::column_policy_gate::ColumnAccessRequest;
4use crate::auth::UserId;
5use crate::replication::cdc::ChangeRecord;
6use crate::replication::logical::{ApplyMode, LogicalChangeApplier};
7use crate::storage::query::ast::TableSource;
8
9thread_local! {
10    /// Current connection id for the executing statement. Set by the
11    /// per-connection wrapper (stdio/gRPC handlers) before dispatching
12    /// into `execute_query`; falls back to `0` for embedded callers.
13    static CURRENT_CONN_ID: std::cell::Cell<u64> = const { std::cell::Cell::new(0) };
14
15    /// Authenticated user + role for the executing statement (Phase 2.5.2
16    /// RLS enforcement). Set by the transport middleware after validating
17    /// credentials (password / cert / oauth); unset means "anonymous" /
18    /// "embedded" — RLS policies degrade to the role-agnostic subset.
19    ///
20    /// `None` skips RLS injection entirely; `Some((username, role))`
21    /// passes `role` to `matching_rls_policies(table, Some(role), action)`.
22    static CURRENT_AUTH_IDENTITY: std::cell::RefCell<Option<(String, crate::auth::Role)>> =
23        const { std::cell::RefCell::new(None) };
24
25    /// MVCC snapshot scoped to the currently-executing statement (Phase
26    /// 2.3.2d PG parity). `execute_query` captures it on entry and drops
27    /// it on exit; every scan consults it via
28    /// `entity_visible_under_current_snapshot` to hide tuples whose xmin
29    /// hasn't committed or whose xmax already has.
30    ///
31    /// `None` means "pre-MVCC semantics" — the read path returns every
32    /// tuple regardless of xmin/xmax. All embedded callers that bypass
33    /// `execute_query` see this default.
34    static CURRENT_SNAPSHOT: std::cell::RefCell<Option<SnapshotContext>> =
35        const { std::cell::RefCell::new(None) };
36
37    /// Cheap presence flag for `CURRENT_SNAPSHOT`. Scan hot paths
38    /// poll this instead of `borrow()`-ing the RefCell on every
39    /// row — the common case (autocommit / no MVCC session) reads
40    /// one atomic `Cell<bool>` and short-circuits, saving ~10ns × N
41    /// rows on aggregate_group / select_range scans.
42    static HAS_SNAPSHOT: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };
43
44    /// Session-scoped tenant id for the current connection (Phase 2.5.3
45    /// multi-tenancy). Populated by `SET TENANT 'id'` or by transport
46    /// middleware after resolving tenant from auth claims. Read by the
47    /// `CURRENT_TENANT()` scalar function — RLS policies typically
48    /// combine it as `USING (tenant_id = CURRENT_TENANT())` to scope
49    /// every query to one tenant.
50    ///
51    /// `None` means "no tenant bound" — `CURRENT_TENANT()` returns
52    /// NULL, and RLS policies that gate on it hide every row.
53    static CURRENT_TENANT_ID: std::cell::RefCell<Option<String>> =
54        const { std::cell::RefCell::new(None) };
55
56    /// Statement-local config resolver. SQL expressions materialize the
57    /// `red_config` snapshot lazily on the first `$config.*`/`CONFIG()`
58    /// access, keeping ordinary statements on the zero-scan path.
59    static CURRENT_CONFIG_RESOLVER: std::cell::RefCell<Option<ConfigResolver>> =
60        const { std::cell::RefCell::new(None) };
61
62    /// Statement-local secret resolver. SQL expressions materialize the
63    /// vault KV snapshot lazily on first `$secret.*` access, then use
64    /// lock-free map reads for the rest of the statement.
65    static CURRENT_SECRET_RESOLVER: std::cell::RefCell<Option<SecretResolver>> =
66        const { std::cell::RefCell::new(None) };
67}
68
69fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
70    match value {
71        Value::Text(s) => Ok(s.to_string()),
72        Value::Integer(n) => Ok(n.to_string()),
73        Value::UnsignedInteger(n) => Ok(n.to_string()),
74        Value::Float(n) => Ok(n.to_string()),
75        Value::Boolean(b) => Ok(b.to_string()),
76        Value::Null => Err(RedDBError::Query(
77            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
78                .to_string(),
79        )),
80        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
81            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
82                .to_string(),
83        )),
84        _ => Err(RedDBError::Query(format!(
85            "SET SECRET does not support value type {:?} yet",
86            value.data_type()
87        ))),
88    }
89}
90
91/// Convert the rows produced by a materialized-view body into
92/// `UnifiedEntity` table rows targeting the backing collection.
93/// Issue #595 slice 9c — feeds `UnifiedStore::refresh_collection`.
94///
95/// Graph fragments and vector hits are ignored: a materialized view
96/// is a relational result set (SELECT-shaped); slices 11+ may extend
97/// this once we have a richer view body shape. Each row materialises
98/// the union of its schema-bound columns + overflow.
99fn view_records_to_entities(
100    table: &str,
101    records: &[crate::storage::query::unified::UnifiedRecord],
102) -> Vec<crate::storage::UnifiedEntity> {
103    use std::collections::HashMap;
104    let table_arc: std::sync::Arc<str> = std::sync::Arc::from(table);
105    let mut out = Vec::with_capacity(records.len());
106    for record in records {
107        let mut named: HashMap<String, crate::storage::schema::Value> = HashMap::new();
108        for (name, value) in record.iter_fields() {
109            named.insert(name.to_string(), value.clone());
110        }
111        let entity = crate::storage::UnifiedEntity::new(
112            crate::storage::EntityId::new(0),
113            crate::storage::EntityKind::TableRow {
114                table: std::sync::Arc::clone(&table_arc),
115                row_id: 0,
116            },
117            crate::storage::EntityData::Row(crate::storage::RowData {
118                columns: Vec::new(),
119                named: Some(named),
120                schema: None,
121            }),
122        );
123        out.push(entity);
124    }
125    out
126}
127
128fn system_keyed_collection_contract(
129    name: &str,
130    model: crate::catalog::CollectionModel,
131) -> crate::physical::CollectionContract {
132    let now = crate::utils::now_unix_millis() as u128;
133    crate::physical::CollectionContract {
134        name: name.to_string(),
135        declared_model: model,
136        schema_mode: crate::catalog::SchemaMode::Dynamic,
137        origin: crate::physical::ContractOrigin::Implicit,
138        version: 1,
139        created_at_unix_ms: now,
140        updated_at_unix_ms: now,
141        default_ttl_ms: None,
142        vector_dimension: None,
143        vector_metric: None,
144        context_index_fields: Vec::new(),
145        declared_columns: Vec::new(),
146        table_def: None,
147        timestamps_enabled: false,
148        context_index_enabled: false,
149        metrics_raw_retention_ms: None,
150        metrics_rollup_policies: Vec::new(),
151        metrics_tenant_identity: None,
152        metrics_namespace: None,
153        append_only: false,
154        subscriptions: Vec::new(),
155        session_key: None,
156        session_gap_ms: None,
157        retention_duration_ms: None,
158    }
159}
160
161/// Snapshot + manager pair used for read-path visibility checks.
162///
163/// The manager is needed in addition to the snapshot because `aborted`
164/// state mutates after the snapshot is captured — a ROLLBACK by a
165/// committed-at-capture-time writer must still hide its tuples. Keeping
166/// the Arc around is O(pointer) and the RwLock reads on `is_aborted`
167/// are cheap (HashSet lookup under a parking_lot read guard).
168///
169/// `own_xids` (Phase 2.3.2e) lists the xids belonging to the current
170/// connection's transaction — the parent xid plus open and released
171/// savepoint sub-xids. The visibility rule promotes rows stamped with
172/// these xids to "always visible (unless aborted)" so the writer sees
173/// its own nested-savepoint writes even though their xids exceed
174/// `snapshot.xid`.
175#[derive(Clone)]
176pub struct SnapshotContext {
177    pub snapshot: crate::storage::transaction::snapshot::Snapshot,
178    pub manager: Arc<crate::storage::transaction::snapshot::SnapshotManager>,
179    pub own_xids: std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
180    pub requires_index_fallback: bool,
181}
182
183/// Install a connection id on the current thread for the duration of a
184/// statement. Transaction state (`RuntimeInner::tx_contexts`) is keyed
185/// by this id so different connections can hold independent BEGINs.
186///
187/// Pub so transports (PG wire, gRPC, HTTP per-request spawners) and
188/// tests can emulate per-connection isolation. Call it once when
189/// binding the connection's worker thread; pair with
190/// `clear_current_connection_id` on teardown.
191pub fn set_current_connection_id(id: u64) {
192    CURRENT_CONN_ID.with(|c| c.set(id));
193}
194
195/// Reset the thread's connection id back to `0` (autocommit).
196pub fn clear_current_connection_id() {
197    CURRENT_CONN_ID.with(|c| c.set(0));
198}
199
200/// Read the connection id set by `set_current_connection_id`. Returns
201/// `0` when no wrapper installed one — auto-commit path.
202pub fn current_connection_id() -> u64 {
203    CURRENT_CONN_ID.with(|c| c.get())
204}
205
206/// Install the authenticated identity for the current thread (Phase 2.5.2
207/// RLS enforcement). Transport layers call this right after resolving
208/// auth so the query dispatch can fold RLS policies into the filter.
209pub fn set_current_auth_identity(username: String, role: crate::auth::Role) {
210    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = Some((username, role)));
211}
212
213/// Clear the thread-local auth identity. Transports call this after the
214/// statement completes so pooled threads don't leak identities across
215/// requests.
216pub fn clear_current_auth_identity() {
217    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = None);
218}
219
220/// Read the current-thread auth identity. `None` when no transport
221/// installed one (embedded mode / anonymous access).
222pub(crate) fn current_auth_identity() -> Option<(String, crate::auth::Role)> {
223    CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone())
224}
225
226/// Install the session tenant id for the current thread (Phase 2.5.3
227/// multi-tenancy). Called by `SET TENANT 'id'` dispatch and by
228/// transport middleware that resolves tenant from auth claims (e.g.
229/// JWT `tenant` claim, HTTP header, subdomain).
230pub fn set_current_tenant(tenant_id: String) {
231    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = Some(tenant_id));
232}
233
234/// Clear the current-thread tenant — `CURRENT_TENANT()` will then
235/// return NULL and any RLS policy gated on it will hide every row.
236pub fn clear_current_tenant() {
237    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = None);
238}
239
240/// Read the current-thread tenant id, applying overrides in priority order:
241///   1. `WITHIN TENANT '<id>' …` per-statement override (highest)
242///   2. `SET LOCAL TENANT '<id>'` transaction-local override (consulted
243///      only when the current connection has an open transaction)
244///   3. `SET TENANT '<id>'` session-level thread-local
245///   4. `None` (deny-default for RLS).
246///
247/// The transaction-local layer is read through the runtime; an embedded
248/// helper crate that has no `RedDBRuntime` access still gets correct
249/// behaviour for layers 1, 3, and 4.
250pub fn current_tenant() -> Option<String> {
251    let inherited = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
252    if let Some(over) = current_scope_override() {
253        if over.tenant.is_active() {
254            return over.tenant.resolve(inherited);
255        }
256    }
257    if let Some(tx_local) = current_tx_local_tenant() {
258        return tx_local;
259    }
260    inherited
261}
262
thread_local! {
    /// Copy of the active connection's `tx_local_tenants` entry for the
    /// current `execute_query` call.
    ///
    /// * outer `None` — no transaction-local override is in effect;
    /// * `Some(Some(s))` — the override pins the tenant to `s`;
    /// * `Some(None)` — the override clears the tenant (NULL).
    ///
    /// It is refreshed at the top of every `execute_query` invocation
    /// and reset by an RAII guard when the call returns, so pooled
    /// connections cannot leak the override past the statement that
    /// owns it.
    static TX_LOCAL_TENANT: std::cell::RefCell<Option<Option<String>>> =
        const { std::cell::RefCell::new(None) };
}
275
276fn current_tx_local_tenant() -> Option<Option<String>> {
277    TX_LOCAL_TENANT.with(|cell| cell.borrow().clone())
278}
279
280/// Recognise `SET LOCAL TENANT '<id>'` / `SET LOCAL TENANT NULL` —
281/// returns `Ok(Some(Some(id)))` for an explicit value, `Ok(Some(None))`
282/// for an explicit NULL clear, `Ok(None)` when the input is not a
283/// `SET LOCAL TENANT` statement at all, and `Err` when the prefix
284/// matches but the value is malformed.
285fn parse_set_local_tenant(query: &str) -> RedDBResult<Option<Option<String>>> {
286    let mut tokens = query.split_ascii_whitespace();
287    let Some(w1) = tokens.next() else {
288        return Ok(None);
289    };
290    if !w1.eq_ignore_ascii_case("SET") {
291        return Ok(None);
292    }
293    let Some(w2) = tokens.next() else {
294        return Ok(None);
295    };
296    if !w2.eq_ignore_ascii_case("LOCAL") {
297        return Ok(None);
298    }
299    let Some(w3) = tokens.next() else {
300        return Ok(None);
301    };
302    if !w3.eq_ignore_ascii_case("TENANT") {
303        return Ok(None);
304    }
305    let rest: String = tokens.collect::<Vec<_>>().join(" ");
306    let rest = rest.trim().trim_end_matches(';').trim();
307    let value_str = rest.strip_prefix('=').map(|s| s.trim()).unwrap_or(rest);
308    if value_str.is_empty() {
309        return Err(RedDBError::Query(
310            "SET LOCAL TENANT expects a string literal or NULL".to_string(),
311        ));
312    }
313    if value_str.eq_ignore_ascii_case("NULL") {
314        return Ok(Some(None));
315    }
316    if value_str.starts_with('\'') && value_str.ends_with('\'') && value_str.len() >= 2 {
317        let inner = &value_str[1..value_str.len() - 1];
318        return Ok(Some(Some(inner.to_string())));
319    }
320    Err(RedDBError::Query(format!(
321        "SET LOCAL TENANT expects a string literal or NULL, got `{value_str}`"
322    )))
323}
324
325pub(crate) struct TxLocalTenantGuard;
326
327impl TxLocalTenantGuard {
328    pub fn install(value: Option<Option<String>>) -> Self {
329        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = value);
330        Self
331    }
332}
333
334impl Drop for TxLocalTenantGuard {
335    fn drop(&mut self) {
336        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = None);
337    }
338}
339
340thread_local! {
341    /// Stack of `WITHIN ... <stmt>` overrides active on the current
342    /// thread. Every entry corresponds to one in-flight `execute_query`
343    /// call that started with a `WITHIN` prefix; the entry is pushed
344    /// before dispatch and popped before the call returns. The stack
345    /// shape supports nested invocations (e.g. a view body that itself
346    /// re-enters execute_query).
347    static SCOPE_OVERRIDES: std::cell::RefCell<Vec<crate::runtime::within_clause::ScopeOverride>> =
348        const { std::cell::RefCell::new(Vec::new()) };
349}
350
351pub(crate) fn push_scope_override(over: crate::runtime::within_clause::ScopeOverride) {
352    SCOPE_OVERRIDES.with(|cell| cell.borrow_mut().push(over));
353}
354
355pub(crate) fn pop_scope_override() {
356    SCOPE_OVERRIDES.with(|cell| {
357        cell.borrow_mut().pop();
358    });
359}
360
361pub(crate) fn current_scope_override() -> Option<crate::runtime::within_clause::ScopeOverride> {
362    SCOPE_OVERRIDES.with(|cell| cell.borrow().last().cloned())
363}
364
365/// Cheap probe: is any `WITHIN …` scope override active on this
366/// thread? The fast-path needs to know without paying for the full
367/// `.last().cloned()` allocation — just peek at stack length.
368pub(crate) fn has_scope_override_active() -> bool {
369    SCOPE_OVERRIDES.with(|cell| !cell.borrow().is_empty())
370}
371
372/// RAII guard pairing `push_scope_override` with the matching pop, so
373/// the stack stays balanced even when the inner `execute_query` returns
374/// early via `?`.
375pub(crate) struct ScopeOverrideGuard;
376
377impl ScopeOverrideGuard {
378    pub fn install(over: crate::runtime::within_clause::ScopeOverride) -> Self {
379        push_scope_override(over);
380        Self
381    }
382}
383
384impl Drop for ScopeOverrideGuard {
385    fn drop(&mut self) {
386        pop_scope_override();
387    }
388}
389
390/// Read the current-thread auth identity, honouring per-statement
391/// `WITHIN ... USER '<u>' AS ROLE '<r>'` overrides. The override only
392/// supplies projected strings — it never grants additional privilege —
393/// so callers that need to make authorisation decisions must read from
394/// the underlying `current_auth_identity()` directly.
395pub(crate) fn current_user_projected() -> Option<String> {
396    let inherited = current_auth_identity().map(|(u, _)| u);
397    if let Some(over) = current_scope_override() {
398        if over.user.is_active() {
399            return over.user.resolve(inherited);
400        }
401    }
402    inherited
403}
404
405pub(crate) fn current_role_projected() -> Option<String> {
406    let inherited = current_auth_identity().map(|(_, r)| format!("{r:?}").to_lowercase());
407    if let Some(over) = current_scope_override() {
408        if over.role.is_active() {
409            return over.role.resolve(inherited);
410        }
411    }
412    inherited
413}
414
415pub(crate) fn current_secret_value(path: &str) -> Option<String> {
416    let key = path.to_ascii_lowercase();
417    CURRENT_SECRET_RESOLVER.with(|cell| {
418        let mut resolver = cell.borrow_mut();
419        let resolver = resolver.as_mut()?;
420        if resolver.values.is_none() {
421            resolver.values = resolver
422                .store
423                .as_ref()
424                .map(|store| store.vault_kv_snapshot());
425        }
426        let values = resolver.values.as_ref()?;
427        values.get(&key).cloned().or_else(|| {
428            key.strip_prefix("red.vault/").and_then(|rest| {
429                values
430                    .get(rest)
431                    .cloned()
432                    .or_else(|| values.get(&format!("red.secret.{rest}")).cloned())
433            })
434        })
435    })
436}
437
438struct SecretResolver {
439    store: Option<Arc<crate::auth::store::AuthStore>>,
440    values: Option<HashMap<String, String>>,
441}
442
443pub(super) struct SecretStoreGuard {
444    previous: Option<SecretResolver>,
445}
446
447impl SecretStoreGuard {
448    pub(super) fn install(store: Option<Arc<crate::auth::store::AuthStore>>) -> Self {
449        let previous = CURRENT_SECRET_RESOLVER.with(|cell| {
450            cell.replace(Some(SecretResolver {
451                store,
452                values: None,
453            }))
454        });
455        Self { previous }
456    }
457}
458
459impl Drop for SecretStoreGuard {
460    fn drop(&mut self) {
461        let previous = self.previous.take();
462        CURRENT_SECRET_RESOLVER.with(|cell| {
463            cell.replace(previous);
464        });
465    }
466}
467
468pub(crate) fn current_config_value(path: &str) -> Option<Value> {
469    let key = path.to_ascii_lowercase();
470    CURRENT_CONFIG_RESOLVER.with(|cell| {
471        let mut resolver = cell.borrow_mut();
472        let resolver = resolver.as_mut()?;
473        if resolver.values.is_none() {
474            resolver.values = Some(latest_config_snapshot(&resolver.db));
475        }
476        let values = resolver.values.as_ref()?;
477        values.get(&key).cloned().or_else(|| {
478            key.strip_prefix("red.config/")
479                .and_then(|rest| values.get(&format!("red.config.{rest}")).cloned())
480        })
481    })
482}
483
484fn update_current_config_value(path: &str, value: Value) {
485    let key = path.to_ascii_lowercase();
486    CURRENT_CONFIG_RESOLVER.with(|cell| {
487        if let Some(resolver) = cell.borrow_mut().as_mut() {
488            if let Some(values) = resolver.values.as_mut() {
489                values.insert(key, value);
490            }
491        }
492    });
493}
494
495fn update_current_secret_value(path: &str, value: Option<String>) {
496    let key = path.to_ascii_lowercase();
497    CURRENT_SECRET_RESOLVER.with(|cell| {
498        if let Some(resolver) = cell.borrow_mut().as_mut() {
499            let Some(values) = resolver.values.as_mut() else {
500                return;
501            };
502            match value {
503                Some(value) => {
504                    values.insert(key, value);
505                }
506                None => {
507                    values.remove(&key);
508                }
509            }
510        }
511    });
512}
513
514fn latest_config_snapshot(db: &RedDB) -> HashMap<String, Value> {
515    let mut latest: HashMap<String, (u64, Value)> = HashMap::new();
516
517    if let Some(manager) = db.store().get_collection("red_config") {
518        manager.for_each_entity(|entity| {
519            let Some(row) = entity.data.as_row() else {
520                return true;
521            };
522            let Some(Value::Text(key)) = row.get_field("key") else {
523                return true;
524            };
525            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
526            let id = entity.id.raw();
527            let key = key.to_ascii_lowercase();
528            insert_latest_config_value(&mut latest, key.clone(), id, value.clone());
529            if let Some(rest) = key.strip_prefix("red.config.") {
530                insert_latest_config_value(&mut latest, format!("red.config/{rest}"), id, value);
531            }
532            true
533        });
534    }
535
536    if let Some(manager) = db.store().get_collection("red.config") {
537        manager.for_each_entity(|entity| {
538            let Some(row) = entity.data.as_row() else {
539                return true;
540            };
541            if matches!(row.get_field("tombstone"), Some(Value::Boolean(true))) {
542                return true;
543            }
544            let Some(Value::Text(key)) = row.get_field("key") else {
545                return true;
546            };
547            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
548            insert_latest_config_value(
549                &mut latest,
550                format!("red.config/{}", key.to_ascii_lowercase()),
551                entity.id.raw(),
552                value,
553            );
554            true
555        });
556    }
557
558    latest
559        .into_iter()
560        .map(|(key, (_, value))| (key, value))
561        .collect()
562}
563
564fn insert_latest_config_value(
565    latest: &mut HashMap<String, (u64, Value)>,
566    key: String,
567    id: u64,
568    value: Value,
569) {
570    match latest.get(&key) {
571        Some((prev_id, _)) if *prev_id > id => {}
572        _ => {
573            latest.insert(key, (id, value));
574        }
575    }
576}
577
578struct ConfigResolver {
579    db: Arc<RedDB>,
580    values: Option<HashMap<String, Value>>,
581}
582
583pub(super) struct ConfigSnapshotGuard {
584    previous: Option<ConfigResolver>,
585}
586
587impl ConfigSnapshotGuard {
588    pub(super) fn install(db: Arc<RedDB>) -> Self {
589        let previous = CURRENT_CONFIG_RESOLVER
590            .with(|cell| cell.replace(Some(ConfigResolver { db, values: None })));
591        Self { previous }
592    }
593}
594
595impl Drop for ConfigSnapshotGuard {
596    fn drop(&mut self) {
597        let previous = self.previous.take();
598        CURRENT_CONFIG_RESOLVER.with(|cell| {
599            cell.replace(previous);
600        });
601    }
602}
603
604/// Install the MVCC snapshot used by the current thread for the duration
605/// of one statement. Paired with `clear_current_snapshot()` — callers
606/// should prefer the `CurrentSnapshotGuard` RAII wrapper so early returns
607/// still clean up.
608pub fn set_current_snapshot(ctx: SnapshotContext) {
609    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = Some(ctx));
610    HAS_SNAPSHOT.with(|c| c.set(true));
611}
612
613pub fn clear_current_snapshot() {
614    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = None);
615    HAS_SNAPSHOT.with(|c| c.set(false));
616}
617
618/// Drop-guard that restores the previous snapshot on scope exit. Safe to
619/// nest — each statement saves the caller's snapshot and puts it back
620/// instead of blindly clearing, so a top-level `execute_query` called
621/// from inside another statement dispatch (e.g. vector source subqueries)
622/// doesn't strip visibility from the outer scan.
623pub(crate) struct CurrentSnapshotGuard {
624    previous: Option<SnapshotContext>,
625}
626
627impl CurrentSnapshotGuard {
628    pub(crate) fn install(ctx: SnapshotContext) -> Self {
629        let previous = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
630        set_current_snapshot(ctx);
631        Self { previous }
632    }
633}
634
635impl Drop for CurrentSnapshotGuard {
636    fn drop(&mut self) {
637        let prev = self.previous.take();
638        let has = prev.is_some();
639        CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = prev);
640        HAS_SNAPSHOT.with(|c| c.set(has));
641    }
642}
643
644/// Is this entity visible under the current thread's MVCC snapshot?
645///
646/// Returns `true` (no filtering) when no snapshot is installed — that
647/// path is used by embedded callers and by operations that intentionally
648/// bypass MVCC (VACUUM, snapshot export, admin introspection).
649///
650/// When a snapshot is installed the result is
651///   `snapshot.sees(xmin, xmax) && !mgr.is_aborted(xmin) && !xmax_half_abort`
652/// where `xmax_half_abort` re-grants visibility for tuples whose
653/// deleting transaction rolled back.
654#[inline]
655pub fn entity_visible_under_current_snapshot(
656    entity: &crate::storage::unified::entity::UnifiedEntity,
657) -> bool {
658    // Fast path — one `Cell<bool>` read, no RefCell borrow. Autocommit
659    // reads (no active MVCC transaction) still hide superseded physical
660    // versions while avoiding a full snapshot-context lookup.
661    // This runs on every row of every scan; the slow path only fires
662    // inside an explicit transaction.
663    if !HAS_SNAPSHOT.with(|c| c.get()) {
664        return entity.xmax == 0;
665    }
666    CURRENT_SNAPSHOT.with(|cell| {
667        let guard = cell.borrow();
668        let Some(ctx) = guard.as_ref() else {
669            return true;
670        };
671        visibility_check(ctx, entity.xmin, entity.xmax)
672    })
673}
674
675/// Direct visibility check from raw `(xmin, xmax)` — bypasses the
676/// entity borrow for callers that already decomposed the tuple (e.g.
677/// pre-materialized scan caches). Same semantics as
678/// `entity_visible_under_current_snapshot`.
679#[inline]
680pub(crate) fn xids_visible_under_current_snapshot(xmin: u64, xmax: u64) -> bool {
681    if !HAS_SNAPSHOT.with(|c| c.get()) {
682        return true;
683    }
684    CURRENT_SNAPSHOT.with(|cell| {
685        let guard = cell.borrow();
686        let Some(ctx) = guard.as_ref() else {
687            return true;
688        };
689        visibility_check(ctx, xmin, xmax)
690    })
691}
692
693/// Clone the current thread's snapshot context. Parallel scan paths
694/// (`query_all_zoned` with `std::thread::scope`) call this on the main
695/// thread *before* spawning workers so the captured `SnapshotContext`
696/// can be moved into every worker closure. Worker threads do not
697/// inherit thread-locals, so calling `entity_visible_under_current_snapshot`
698/// from inside a spawned closure would silently skip the filter.
699pub fn capture_current_snapshot() -> Option<SnapshotContext> {
700    CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone())
701}
702
703/// Whether the active read snapshot may need historical tuple versions
704/// that the current secondary indexes cannot prove. Index paths can still
705/// recheck visible candidates, but only a heap scan can discover versions
706/// whose indexed value was changed or deleted after this snapshot.
707pub(crate) fn current_snapshot_requires_index_fallback() -> bool {
708    if !HAS_SNAPSHOT.with(|c| c.get()) {
709        return false;
710    }
711    CURRENT_SNAPSHOT.with(|cell| {
712        cell.borrow()
713            .as_ref()
714            .is_some_and(|ctx| ctx.requires_index_fallback)
715    })
716}
717
718/// Frozen MVCC + identity context for callers that need to reinstall
719/// the same view across thread-local boundaries — long-lived cursors,
720/// background batchers, anything that detaches from the dispatch path
721/// and re-enters later.
722///
723/// The bundle bakes in the three thread-locals every read path
724/// consults: `SnapshotContext` (MVCC visibility), the auth identity
725/// (RLS policy gate), and the tenant id (RLS scope). A FETCH that
726/// reinstalls the bundle sees exactly the same rows as the DECLARE
727/// would have, regardless of writes that landed in between.
728///
729/// Cheap to clone — `SnapshotContext` is a clone of three
730/// `Arc`-backed fields, identity is a `(String, Role)`, tenant is a
731/// `String`. None of these contend with the read path.
732#[derive(Clone, Default)]
733pub struct SnapshotBundle {
734    pub snapshot: Option<SnapshotContext>,
735    pub auth: Option<(String, crate::auth::Role)>,
736    pub tenant: Option<String>,
737}
738
739/// Capture the three read-path thread-locals into a `SnapshotBundle`.
740/// Pairs with `with_snapshot_bundle` for re-entry.
741pub fn snapshot_bundle() -> SnapshotBundle {
742    SnapshotBundle {
743        snapshot: capture_current_snapshot(),
744        auth: current_auth_identity(),
745        tenant: CURRENT_TENANT_ID.with(|cell| cell.borrow().clone()),
746    }
747}
748
749/// Reinstall a captured `SnapshotBundle` for the duration of `f`.
750/// Restores the caller's previous thread-locals on exit (panic-safe via
751/// the explicit guard struct so a panic in `f` cannot leak the
752/// installed identity into the worker's next request).
753pub fn with_snapshot_bundle<R>(bundle: &SnapshotBundle, f: impl FnOnce() -> R) -> R {
754    struct Guard {
755        prev_snapshot: Option<SnapshotContext>,
756        prev_auth: Option<(String, crate::auth::Role)>,
757        prev_tenant: Option<String>,
758    }
759    impl Drop for Guard {
760        fn drop(&mut self) {
761            let snap = self.prev_snapshot.take();
762            let has = snap.is_some();
763            CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = snap);
764            HAS_SNAPSHOT.with(|c| c.set(has));
765            CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = self.prev_auth.take());
766            CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = self.prev_tenant.take());
767        }
768    }
769
770    let _guard = {
771        let prev_snapshot = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
772        let prev_auth = CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone());
773        let prev_tenant = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
774
775        match bundle.snapshot.clone() {
776            Some(ctx) => set_current_snapshot(ctx),
777            None => clear_current_snapshot(),
778        }
779        CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = bundle.auth.clone());
780        CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = bundle.tenant.clone());
781
782        Guard {
783            prev_snapshot,
784            prev_auth,
785            prev_tenant,
786        }
787    };
788    f()
789}
790
791/// Apply the same visibility rules used by the thread-local helpers
792/// against a caller-provided context. Intended for parallel workers
793/// that captured the snapshot with `capture_current_snapshot()`.
794#[inline]
795pub fn entity_visible_with_context(
796    ctx: Option<&SnapshotContext>,
797    entity: &crate::storage::unified::entity::UnifiedEntity,
798) -> bool {
799    match ctx {
800        Some(ctx) => visibility_check(ctx, entity.xmin, entity.xmax),
801        None => true,
802    }
803}
804
805fn table_row_index_fields(
806    entity: &crate::storage::unified::entity::UnifiedEntity,
807) -> Vec<(String, crate::storage::schema::Value)> {
808    let crate::storage::EntityData::Row(row) = &entity.data else {
809        return Vec::new();
810    };
811    if let Some(named) = &row.named {
812        return named
813            .iter()
814            .map(|(name, value)| (name.clone(), value.clone()))
815            .collect();
816    }
817    if let Some(schema) = &row.schema {
818        return schema
819            .iter()
820            .zip(row.columns.iter())
821            .map(|(name, value)| (name.clone(), value.clone()))
822            .collect();
823    }
824    Vec::new()
825}
826
827#[inline]
828fn visibility_check(ctx: &SnapshotContext, xmin: u64, xmax: u64) -> bool {
829    // Writer aborted → tuple never existed from any future reader's view.
830    // Checked *before* the own-xids fast path so an aborted own-sub-xid
831    // (rolled-back savepoint) stays hidden from the parent.
832    if xmin != 0 && ctx.manager.is_aborted(xmin) {
833        return false;
834    }
835    // Deleter aborted → treat xmax as unset; fall back to xmin-only check.
836    let effective_xmax = if xmax != 0 && ctx.manager.is_aborted(xmax) {
837        0
838    } else {
839        xmax
840    };
841    // Phase 2.3.2e: own-tx writes are always visible to the connection
842    // that stamped them, even when xmin/xmax exceed `snapshot.xid` (as
843    // happens for sub-xids allocated by SAVEPOINT after BEGIN).
844    let own_xmin = xmin != 0 && ctx.own_xids.contains(&xmin);
845    let own_xmax = effective_xmax != 0 && ctx.own_xids.contains(&effective_xmax);
846    if own_xmax {
847        // This connection deleted the row via this xid — hide it from self.
848        return false;
849    }
850    if own_xmin {
851        return true;
852    }
853    ctx.snapshot.sees(xmin, effective_xmax)
854}
855
856fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
857    runtime
858        .inner
859        .pool
860        .lock()
861        .unwrap_or_else(|poisoned| poisoned.into_inner())
862}
863
864fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
865    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
866        return;
867    }
868    scopes.insert(name.to_string());
869}
870
871fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
872    match query.source.as_ref() {
873        Some(crate::storage::query::ast::TableSource::Name(name)) => {
874            cache_scope_insert(scopes, name)
875        }
876        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
877            collect_query_expr_result_cache_scopes(scopes, subquery);
878        }
879        None => cache_scope_insert(scopes, &query.table),
880    }
881}
882
883fn collect_vector_source_scopes(
884    scopes: &mut HashSet<String>,
885    source: &crate::storage::query::ast::VectorSource,
886) {
887    match source {
888        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
889            cache_scope_insert(scopes, collection);
890        }
891        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
892            collect_query_expr_result_cache_scopes(scopes, subquery);
893        }
894        crate::storage::query::ast::VectorSource::Literal(_)
895        | crate::storage::query::ast::VectorSource::Text(_) => {}
896    }
897}
898
899fn collect_path_selector_scopes(
900    scopes: &mut HashSet<String>,
901    selector: &crate::storage::query::ast::NodeSelector,
902) {
903    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
904        cache_scope_insert(scopes, table);
905    }
906}
907
/// Walk `expr` and record, through `cache_scope_insert`, the name of
/// every table / collection / queue the statement names.
///
/// Recurses into subquery sources, both sides of a join, the
/// structured half of a hybrid query and the body of `CREATE VIEW`.
/// The match is exhaustive on purpose: adding a `QueryExpr` variant
/// forces a decision here about which scopes (if any) it touches.
fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
    match expr {
        // --- Read queries: resolve sources, recursing where needed ---
        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
        QueryExpr::Join(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.left);
            collect_query_expr_result_cache_scopes(scopes, &query.right);
        }
        QueryExpr::Path(query) => {
            collect_path_selector_scopes(scopes, &query.from);
            collect_path_selector_scopes(scopes, &query.to);
        }
        QueryExpr::Vector(query) => {
            cache_scope_insert(scopes, &query.collection);
            collect_vector_source_scopes(scopes, &query.query_vector);
        }
        QueryExpr::Hybrid(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.structured);
            cache_scope_insert(scopes, &query.vector.collection);
            collect_vector_source_scopes(scopes, &query.vector.query_vector);
        }
        // --- DML / DDL: the single named target is the scope ---
        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        // --- Queues ---
        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
        QueryExpr::QueueCommand(query) => match query {
            QueueCommand::Push { queue, .. }
            | QueueCommand::Pop { queue, .. }
            | QueueCommand::Peek { queue, .. }
            | QueueCommand::Len { queue }
            | QueueCommand::Purge { queue }
            | QueueCommand::GroupCreate { queue, .. }
            | QueueCommand::GroupRead { queue, .. }
            | QueueCommand::Pending { queue, .. }
            | QueueCommand::Claim { queue, .. }
            | QueueCommand::Ack { queue, .. }
            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
            // A move names two queues; both are recorded.
            QueueCommand::Move {
                source,
                destination,
                ..
            } => {
                cache_scope_insert(scopes, source);
                cache_scope_insert(scopes, destination);
            }
        },
        // Backfill names a source collection and a target queue.
        QueryExpr::EventsBackfill(query) => {
            cache_scope_insert(scopes, &query.collection);
            cache_scope_insert(scopes, &query.target_queue);
        }
        // --- Trees ---
        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::TreeCommand(query) => match query {
            TreeCommand::Insert { collection, .. }
            | TreeCommand::Move { collection, .. }
            | TreeCommand::Delete { collection, .. }
            | TreeCommand::Validate { collection, .. }
            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
        },
        // --- Search: the collection is mandatory for some commands and
        // optional for others ---
        QueryExpr::SearchCommand(query) => match query {
            SearchCommand::Similar { collection, .. }
            | SearchCommand::Hybrid { collection, .. }
            | SearchCommand::SpatialRadius { collection, .. }
            | SearchCommand::SpatialBbox { collection, .. }
            | SearchCommand::SpatialNearest { collection, .. } => {
                cache_scope_insert(scopes, collection);
            }
            SearchCommand::Text { collection, .. }
            | SearchCommand::Multimodal { collection, .. }
            | SearchCommand::Index { collection, .. }
            | SearchCommand::Context { collection, .. } => {
                if let Some(collection) = collection.as_deref() {
                    cache_scope_insert(scopes, collection);
                }
            }
        },
        QueryExpr::Ask(query) => {
            if let Some(collection) = query.collection.as_deref() {
                cache_scope_insert(scopes, collection);
            }
        }
        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
        // Maintenance commands may run without a target; only a named
        // target produces a scope.
        QueryExpr::MaintenanceCommand(cmd) => match cmd {
            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
                if let Some(t) = target {
                    cache_scope_insert(scopes, t);
                }
            }
        },
        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
        // --- Views, policies, foreign tables ---
        QueryExpr::CreateView(cmd) => {
            cache_scope_insert(scopes, &cmd.name);
            // Invalidating the view should also invalidate its dependencies.
            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
        }
        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        // Server definitions contribute no scope.
        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        // --- Statements that contribute no scope at all ---
        QueryExpr::Graph(_)
        | QueryExpr::GraphCommand(_)
        | QueryExpr::ProbabilisticCommand(_)
        | QueryExpr::SetConfig { .. }
        | QueryExpr::ShowConfig { .. }
        | QueryExpr::SetSecret { .. }
        | QueryExpr::DeleteSecret { .. }
        | QueryExpr::ShowSecrets { .. }
        | QueryExpr::SetTenant(_)
        | QueryExpr::ShowTenant
        | QueryExpr::TransactionControl(_)
        | QueryExpr::CreateSchema(_)
        | QueryExpr::DropSchema(_)
        | QueryExpr::CreateSequence(_)
        | QueryExpr::DropSequence(_)
        | QueryExpr::Grant(_)
        | QueryExpr::Revoke(_)
        | QueryExpr::AlterUser(_)
        | QueryExpr::CreateIamPolicy { .. }
        | QueryExpr::DropIamPolicy { .. }
        | QueryExpr::AttachPolicy { .. }
        | QueryExpr::DetachPolicy { .. }
        | QueryExpr::ShowPolicies { .. }
        | QueryExpr::ShowEffectivePermissions { .. }
        | QueryExpr::SimulatePolicy { .. }
        | QueryExpr::CreateMigration(_)
        | QueryExpr::ApplyMigration(_)
        | QueryExpr::RollbackMigration(_)
        | QueryExpr::ExplainMigration(_)
        | QueryExpr::EventsBackfillStatus { .. } => {}
        // --- KV / config stores: every command names its collection ---
        QueryExpr::KvCommand(cmd) => {
            use crate::storage::query::ast::KvCommand;
            match cmd {
                KvCommand::Put { collection, .. }
                | KvCommand::InvalidateTags { collection, .. }
                | KvCommand::Get { collection, .. }
                | KvCommand::Unseal { collection, .. }
                | KvCommand::Rotate { collection, .. }
                | KvCommand::History { collection, .. }
                | KvCommand::List { collection, .. }
                | KvCommand::Purge { collection, .. }
                | KvCommand::Watch { collection, .. }
                | KvCommand::Delete { collection, .. }
                | KvCommand::Incr { collection, .. }
                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
            }
        }
        QueryExpr::ConfigCommand(cmd) => {
            use crate::storage::query::ast::ConfigCommand;
            match cmd {
                ConfigCommand::Put { collection, .. }
                | ConfigCommand::Get { collection, .. }
                | ConfigCommand::Resolve { collection, .. }
                | ConfigCommand::Rotate { collection, .. }
                | ConfigCommand::Delete { collection, .. }
                | ConfigCommand::History { collection, .. }
                | ConfigCommand::List { collection, .. }
                | ConfigCommand::Watch { collection, .. }
                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
                    cache_scope_insert(scopes, collection)
                }
            }
        }
    }
}
1093
1094/// Combine matching RLS policies for a table + action into a single
1095/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1096///
1097/// Returns `None` when RLS is disabled or no policy admits the caller's
1098/// role — callers use that to short-circuit the mutation (for DELETE /
1099/// UPDATE we simply skip the operation, which PG expresses as "no rows
1100/// match the policy + predicate combination").
1101pub(crate) fn rls_policy_filter(
1102    runtime: &RedDBRuntime,
1103    table: &str,
1104    action: crate::storage::query::ast::PolicyAction,
1105) -> Option<crate::storage::query::ast::Filter> {
1106    rls_policy_filter_for_kind(
1107        runtime,
1108        table,
1109        action,
1110        crate::storage::query::ast::PolicyTargetKind::Table,
1111    )
1112}
1113
1114/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1115/// Graph / vector / queue / timeseries scans pass the concrete kind;
1116/// policies targeting other kinds are ignored. Legacy Table-scoped
1117/// policies still apply cross-kind — callers register auto-tenancy
1118/// policies as Table today.
1119pub(crate) fn rls_policy_filter_for_kind(
1120    runtime: &RedDBRuntime,
1121    table: &str,
1122    action: crate::storage::query::ast::PolicyAction,
1123    kind: crate::storage::query::ast::PolicyTargetKind,
1124) -> Option<crate::storage::query::ast::Filter> {
1125    use crate::storage::query::ast::Filter;
1126
1127    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1128        return None;
1129    }
1130    let role = current_auth_identity().map(|(_, role)| role);
1131    let role_str = role.map(|r| r.as_str().to_string());
1132    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1133    if policies.is_empty() {
1134        return None;
1135    }
1136    policies
1137        .into_iter()
1138        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1139}
1140
1141/// Returns true when the table has RLS enforcement enabled. Convenience
1142/// shortcut so DML paths can gate the AND-combine work without reaching
1143/// into `runtime.inner.rls_enabled_tables` directly.
1144pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1145    runtime.inner.rls_enabled_tables.read().contains(table)
1146}
1147
1148/// Per-entity gate used by the graph materialiser for `GraphNode`
1149/// entities. RLS is checked against the source collection with
1150/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1151/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1152/// (for back-compat with auto-tenancy declarations). Cached per
1153/// collection so big graphs only resolve the policy chain once.
1154fn node_passes_rls(
1155    runtime: &RedDBRuntime,
1156    collection: &str,
1157    role: Option<&str>,
1158    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1159    entity: &crate::storage::unified::entity::UnifiedEntity,
1160) -> bool {
1161    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1162
1163    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1164        return true;
1165    }
1166    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1167        let policies = runtime.matching_rls_policies_for_kind(
1168            collection,
1169            role,
1170            PolicyAction::Select,
1171            PolicyTargetKind::Nodes,
1172        );
1173        if policies.is_empty() {
1174            None
1175        } else {
1176            policies
1177                .into_iter()
1178                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1179        }
1180    });
1181    let Some(filter) = filter else {
1182        return false;
1183    };
1184    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1185        Some(&runtime.inner.db),
1186        entity,
1187        filter,
1188        collection,
1189        collection,
1190    )
1191}
1192
1193/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1194/// `kind = Edges`.
1195fn edge_passes_rls(
1196    runtime: &RedDBRuntime,
1197    collection: &str,
1198    role: Option<&str>,
1199    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1200    entity: &crate::storage::unified::entity::UnifiedEntity,
1201) -> bool {
1202    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1203
1204    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1205        return true;
1206    }
1207    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1208        let policies = runtime.matching_rls_policies_for_kind(
1209            collection,
1210            role,
1211            PolicyAction::Select,
1212            PolicyTargetKind::Edges,
1213        );
1214        if policies.is_empty() {
1215            None
1216        } else {
1217            policies
1218                .into_iter()
1219                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1220        }
1221    });
1222    let Some(filter) = filter else {
1223        return false;
1224    };
1225    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1226        Some(&runtime.inner.db),
1227        entity,
1228        filter,
1229        collection,
1230        collection,
1231    )
1232}
1233
1234/// RLS policy injection (Phase 2.5.2 PG parity).
1235///
1236/// Fetch every matching policy for the current thread-local role and
1237/// fold them into the query's filter. Semantics mirror PostgreSQL:
1238///
1239/// * Multiple policies on the same table combine with **OR** — a row is
1240///   visible if *any* policy admits it.
1241/// * The combined policy predicate is **AND**-ed into the caller's
1242///   existing `WHERE` clause so explicit predicates continue to trim
1243///   the policy-allowed set.
1244/// * No matching policies + RLS enabled = zero rows (PG's
1245///   restrictive-default). Callers get `None` and return an empty
1246///   `UnifiedResult` without ever dispatching the scan.
1247///
1248/// This runs only when `RuntimeInner::rls_enabled_tables` already
1249/// contains the table name — callers gate the hot path upfront to
1250/// avoid the lock acquisition on tables without RLS.
1251///
1252/// Returns `None` when no policy admits the current role; returns
1253/// `Some(mutated_table)` with policy filters folded in otherwise.
1254fn inject_rls_filters(
1255    runtime: &RedDBRuntime,
1256    frame: &dyn super::statement_frame::ReadFrame,
1257    mut table: crate::storage::query::ast::TableQuery,
1258) -> Option<crate::storage::query::ast::TableQuery> {
1259    use crate::storage::query::ast::{Filter, PolicyAction};
1260
1261    // `None` role falls through to policies with no `TO role` clause.
1262    let role = frame.identity().map(|(_, role)| role);
1263    let role_str = role.map(|r| r.as_str().to_string());
1264    let policies =
1265        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1266
1267    if policies.is_empty() {
1268        // RLS enabled + no policy match = deny everything. Signal the
1269        // caller to short-circuit with an empty result set.
1270        return None;
1271    }
1272
1273    // Combine policy predicates with OR (PG's permissive default).
1274    let combined = policies
1275        .into_iter()
1276        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1277        .expect("policies non-empty");
1278
1279    // AND into the caller's existing predicate. The predicate may live
1280    // in `where_expr` rather than `filter`: `resolve_table_expr_subqueries`
1281    // nulls `filter` whenever `where_expr` is present (the case for a
1282    // view body rewritten into `SELECT … WHERE …`). Folding only into
1283    // `filter` here would silently drop that `where_expr` predicate at
1284    // eval time because `effective_table_filter` prefers `filter` —
1285    // e.g. `WITHIN TENANT … SELECT * FROM <view>` would apply the
1286    // tenant policy but lose the view's own WHERE (#635).
1287    use crate::storage::query::sql_lowering::{expr_to_filter, filter_to_expr};
1288    let had_where_expr = table.where_expr.is_some();
1289    let existing = table
1290        .filter
1291        .take()
1292        .or_else(|| table.where_expr.as_ref().map(expr_to_filter));
1293    let new_filter = match existing {
1294        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1295        None => combined,
1296    };
1297    // Keep `where_expr` in lock-step with the merged `filter` so
1298    // whichever the executor consults sees the full predicate.
1299    if had_where_expr {
1300        table.where_expr = Some(filter_to_expr(&new_filter));
1301    }
1302    table.filter = Some(new_filter);
1303    Some(table)
1304}
1305
1306/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1307/// predicate into the join's outer filter. Walking the merged record
1308/// at the join layer (rather than mutating the per-side scan filter)
1309/// keeps the planner's strategy choice and per-side index selection
1310/// undisturbed — the policy predicate uses the qualified `t.col` form
1311/// that resolves cleanly against the merged record's keys.
1312///
1313/// Returns `None` when any leaf has RLS enabled and no policy admits
1314/// the caller — the join short-circuits to an empty result.
1315fn inject_rls_into_join(
1316    runtime: &RedDBRuntime,
1317    frame: &dyn super::statement_frame::ReadFrame,
1318    mut join: crate::storage::query::ast::JoinQuery,
1319) -> Option<crate::storage::query::ast::JoinQuery> {
1320    use crate::storage::query::ast::Filter;
1321
1322    let mut policy_filters: Vec<Filter> = Vec::new();
1323    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1324        return None;
1325    }
1326    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1327        return None;
1328    }
1329
1330    if policy_filters.is_empty() {
1331        return Some(join);
1332    }
1333
1334    let combined = policy_filters
1335        .into_iter()
1336        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1337        .expect("policy_filters non-empty");
1338
1339    join.filter = Some(match join.filter.take() {
1340        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1341        None => combined,
1342    });
1343
1344    Some(join)
1345}
1346
1347/// For each `Table` leaf reachable through nested joins, append the
1348/// RLS-policy filter (combined with OR across that side's matching
1349/// policies) into `out`. Returns `false` when a side has RLS enabled
1350/// but no policy admits the caller — the join must short-circuit.
1351fn collect_join_side_policy(
1352    runtime: &RedDBRuntime,
1353    frame: &dyn super::statement_frame::ReadFrame,
1354    expr: &crate::storage::query::ast::QueryExpr,
1355    out: &mut Vec<crate::storage::query::ast::Filter>,
1356) -> bool {
1357    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1358    match expr {
1359        QueryExpr::Table(t) => {
1360            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1361                return true;
1362            }
1363            let role = frame.identity().map(|(_, role)| role);
1364            let role_str = role.map(|r| r.as_str().to_string());
1365            let policies =
1366                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1367            if policies.is_empty() {
1368                return false;
1369            }
1370            let combined = policies
1371                .into_iter()
1372                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1373                .expect("policies non-empty");
1374            out.push(combined);
1375            true
1376        }
1377        QueryExpr::Join(inner) => {
1378            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1379                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1380        }
1381        _ => true,
1382    }
1383}
1384
1385/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
1386///
1387/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
1388/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
1389/// materialises all rows. Projections are best-effort — when the query
1390/// lists explicit columns we keep only those; a `SELECT *` keeps every
1391/// wrapper-emitted field verbatim.
1392///
1393/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
1394/// the runtime will pass the compiled filter down instead of post-filtering.
1395fn apply_foreign_table_filters(
1396    records: Vec<crate::storage::query::unified::UnifiedRecord>,
1397    query: &crate::storage::query::ast::TableQuery,
1398) -> crate::storage::query::unified::UnifiedResult {
1399    use crate::storage::query::sql_lowering::{
1400        effective_table_filter, effective_table_projections,
1401    };
1402    use crate::storage::query::unified::UnifiedResult;
1403
1404    let filter = effective_table_filter(query);
1405    let projections = effective_table_projections(query);
1406
1407    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
1408    // match native-collection queries (same operators, same NULL handling).
1409    let mut filtered: Vec<_> = records
1410        .into_iter()
1411        .filter(|record| match &filter {
1412            Some(f) => {
1413                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
1414            }
1415            None => true,
1416        })
1417        .collect();
1418
1419    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
1420    if let Some(offset) = query.offset {
1421        let offset = offset as usize;
1422        if offset >= filtered.len() {
1423            filtered.clear();
1424        } else {
1425            filtered.drain(0..offset);
1426        }
1427    }
1428    if let Some(limit) = query.limit {
1429        filtered.truncate(limit as usize);
1430    }
1431
1432    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
1433    // the wrapper's column set; an explicit list trims to those names.
1434    let columns: Vec<String> = if projections.is_empty() {
1435        filtered
1436            .first()
1437            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
1438            .unwrap_or_default()
1439    } else {
1440        projections
1441            .iter()
1442            .map(super::join_filter::projection_name)
1443            .collect()
1444    };
1445
1446    let mut result = UnifiedResult::empty();
1447    result.columns = columns;
1448    result.records = filtered;
1449    result
1450}
1451
1452/// Collect every concrete table reference inside a `QueryExpr`.
1453///
1454/// Used by view bookkeeping (dependency tracking for materialised
1455/// invalidation) and any other rewriter that needs to know the base
1456/// tables a query pulls from. Does not descend into projections/filters;
1457/// only the `FROM` side.
1458pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1459    let mut scopes: HashSet<String> = HashSet::new();
1460    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1461    scopes.into_iter().collect()
1462}
1463
1464fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1465    let mut scopes = HashSet::new();
1466    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1467    scopes
1468}
1469
// Result-cache tuning knobs and identifiers.
// NOTE(review): the descriptions below are inferred from the names and
// values; only RESULT_CACHE_MAX_ENTRIES has a consumer visible nearby
// (`trim_result_cache`) — confirm the rest against their call sites.

/// Presumably the config key that selects the result-cache backend.
const RESULT_CACHE_BACKEND_KEY: &str = "runtime.result_cache.backend";
/// Presumably the backend name used when the key above is unset.
const RESULT_CACHE_DEFAULT_BACKEND: &str = "legacy";
/// Presumably the namespace for result-cache entries in the blob cache.
const RESULT_CACHE_BLOB_NAMESPACE: &str = "runtime.result_cache";
/// Presumably the entry time-to-live, in seconds.
const RESULT_CACHE_TTL_SECS: u64 = 30;
/// Entry-count ceiling enforced by `trim_result_cache`.
const RESULT_CACHE_MAX_ENTRIES: usize = 1000;
/// Eight-byte tag; presumably the header of a serialized cache payload.
const RESULT_CACHE_PAYLOAD_MAGIC: &[u8; 8] = b"RDRC0001";
1476
/// Which implementation backs the runtime result cache.
///
/// NOTE(review): variant meanings are inferred from their names —
/// confirm against the code that selects and dispatches on them.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RuntimeResultCacheBackend {
    /// Presumably the original cache (`"legacy"` is the default
    /// backend string).
    Legacy,
    /// Presumably the blob-cache-backed implementation.
    BlobCache,
    /// Presumably runs both backends side by side for comparison.
    Shadow,
}
1483
1484fn trim_result_cache(
1485    map: &mut HashMap<String, RuntimeResultCacheEntry>,
1486    order: &mut std::collections::VecDeque<String>,
1487) {
1488    while map.len() > RESULT_CACHE_MAX_ENTRIES {
1489        if let Some(oldest) = order.pop_front() {
1490            map.remove(&oldest);
1491        } else {
1492            break;
1493        }
1494    }
1495}
1496
1497fn result_cache_fingerprint(result: &RuntimeQueryResult) -> String {
1498    format!(
1499        "{:?}|{}|{}|{}|{}|{:?}",
1500        result.result,
1501        result.query,
1502        result.statement,
1503        result.engine,
1504        result.affected_rows,
1505        result.statement_type
1506    )
1507}
1508
1509fn mode_to_byte(mode: crate::storage::query::modes::QueryMode) -> u8 {
1510    match mode {
1511        crate::storage::query::modes::QueryMode::Sql => 0,
1512        crate::storage::query::modes::QueryMode::Gremlin => 1,
1513        crate::storage::query::modes::QueryMode::Cypher => 2,
1514        crate::storage::query::modes::QueryMode::Sparql => 3,
1515        crate::storage::query::modes::QueryMode::Path => 4,
1516        crate::storage::query::modes::QueryMode::Natural => 5,
1517        crate::storage::query::modes::QueryMode::Unknown => 255,
1518    }
1519}
1520
1521fn mode_from_byte(byte: u8) -> Option<crate::storage::query::modes::QueryMode> {
1522    match byte {
1523        0 => Some(crate::storage::query::modes::QueryMode::Sql),
1524        1 => Some(crate::storage::query::modes::QueryMode::Gremlin),
1525        2 => Some(crate::storage::query::modes::QueryMode::Cypher),
1526        3 => Some(crate::storage::query::modes::QueryMode::Sparql),
1527        4 => Some(crate::storage::query::modes::QueryMode::Path),
1528        5 => Some(crate::storage::query::modes::QueryMode::Natural),
1529        255 => Some(crate::storage::query::modes::QueryMode::Unknown),
1530        _ => None,
1531    }
1532}
1533
/// Map a runtime string onto the matching `'static` label, if any.
///
/// The payload codec uses this both ways: the encoder refuses results
/// whose statement / engine / statement-type label is not in the list,
/// and the decoder uses it to recover a `&'static str` from decoded
/// text. Matching is exact and case-sensitive.
fn result_cache_static_str(value: &str) -> Option<&'static str> {
    const KNOWN_LABELS: &[&'static str] = &[
        "select",
        "materialized-graph",
        "runtime-red-schema",
        "runtime-fdw",
        "runtime-table-rls",
        "runtime-table",
        "runtime-join-rls",
        "runtime-join",
        "runtime-vector",
        "runtime-hybrid",
        "runtime-secret",
        "runtime-config",
        "runtime-tenant",
        "runtime-explain",
        "runtime-tree",
        "runtime-kv",
        "runtime-queue",
    ];

    KNOWN_LABELS.iter().copied().find(|label| *label == value)
}
1556
/// Append `value` to `out` as a little-endian `u32`.
///
/// Returns `None`, leaving `out` untouched, when `value` does not fit
/// in 32 bits.
fn write_u32(out: &mut Vec<u8>, value: usize) -> Option<()> {
    match u32::try_from(value) {
        Ok(narrow) => {
            out.extend_from_slice(&narrow.to_le_bytes());
            Some(())
        }
        Err(_) => None,
    }
}
1562
1563fn write_string(out: &mut Vec<u8>, value: &str) -> Option<()> {
1564    write_u32(out, value.len())?;
1565    out.extend_from_slice(value.as_bytes());
1566    Some(())
1567}
1568
1569fn write_bytes(out: &mut Vec<u8>, value: &[u8]) -> Option<()> {
1570    write_u32(out, value.len())?;
1571    out.extend_from_slice(value);
1572    Some(())
1573}
1574
/// Pop one byte off the front of `input`, advancing the cursor.
/// Returns `None` (cursor unchanged) when `input` is empty.
fn read_u8(input: &mut &[u8]) -> Option<u8> {
    let bytes = *input;
    let first = *bytes.first()?;
    *input = &bytes[1..];
    Some(first)
}
1580
/// Read a little-endian `u32` from the front of `input` and return it
/// widened to `usize`, advancing the cursor by four bytes.
/// Returns `None` (cursor unchanged) when fewer than four bytes remain.
fn read_u32(input: &mut &[u8]) -> Option<usize> {
    let bytes = *input;
    let raw: [u8; 4] = bytes.get(..4)?.try_into().ok()?;
    *input = &bytes[4..];
    Some(u32::from_le_bytes(raw) as usize)
}
1589
/// Read a little-endian `u64` from the front of `input`, advancing the
/// cursor by eight bytes. Returns `None` (cursor unchanged) when fewer
/// than eight bytes remain.
fn read_u64(input: &mut &[u8]) -> Option<u64> {
    let bytes = *input;
    let raw: [u8; 8] = bytes.get(..8)?.try_into().ok()?;
    *input = &bytes[8..];
    Some(u64::from_le_bytes(raw))
}
1598
1599fn read_string(input: &mut &[u8]) -> Option<String> {
1600    let len = read_u32(input)?;
1601    if input.len() < len {
1602        return None;
1603    }
1604    let value = String::from_utf8(input[..len].to_vec()).ok()?;
1605    *input = &input[len..];
1606    Some(value)
1607}
1608
1609fn read_bytes<'a>(input: &mut &'a [u8]) -> Option<&'a [u8]> {
1610    let len = read_u32(input)?;
1611    if input.len() < len {
1612        return None;
1613    }
1614    let value = &input[..len];
1615    *input = &input[len..];
1616    Some(value)
1617}
1618
1619fn encode_result_cache_payload(entry: &RuntimeResultCacheEntry) -> Option<Vec<u8>> {
1620    let result = &entry.result;
1621    if result.result.pre_serialized_json.is_some()
1622        || result_cache_static_str(result.statement).is_none()
1623        || result_cache_static_str(result.engine).is_none()
1624        || result_cache_static_str(result.statement_type).is_none()
1625        || result.result.records.iter().any(|record| {
1626            !record.nodes.is_empty()
1627                || !record.edges.is_empty()
1628                || !record.paths.is_empty()
1629                || !record.vector_results.is_empty()
1630        })
1631    {
1632        return None;
1633    }
1634
1635    let mut out = Vec::new();
1636    out.extend_from_slice(RESULT_CACHE_PAYLOAD_MAGIC);
1637    write_string(&mut out, &result.query)?;
1638    out.push(mode_to_byte(result.mode));
1639    write_string(&mut out, result.statement)?;
1640    write_string(&mut out, result.engine)?;
1641    out.extend_from_slice(&result.affected_rows.to_le_bytes());
1642    write_string(&mut out, result.statement_type)?;
1643
1644    write_u32(&mut out, result.result.columns.len())?;
1645    for column in &result.result.columns {
1646        write_string(&mut out, column)?;
1647    }
1648    out.extend_from_slice(&result.result.stats.nodes_scanned.to_le_bytes());
1649    out.extend_from_slice(&result.result.stats.edges_scanned.to_le_bytes());
1650    out.extend_from_slice(&result.result.stats.rows_scanned.to_le_bytes());
1651    out.extend_from_slice(&result.result.stats.exec_time_us.to_le_bytes());
1652
1653    write_u32(&mut out, result.result.records.len())?;
1654    for record in &result.result.records {
1655        let fields = record.iter_fields().collect::<Vec<_>>();
1656        write_u32(&mut out, fields.len())?;
1657        for (name, value) in fields {
1658            write_string(&mut out, name)?;
1659            let mut encoded = Vec::new();
1660            crate::storage::schema::value_codec::encode(value, &mut encoded);
1661            write_bytes(&mut out, &encoded)?;
1662        }
1663    }
1664
1665    write_u32(&mut out, entry.scopes.len())?;
1666    for scope in &entry.scopes {
1667        write_string(&mut out, scope)?;
1668    }
1669    Some(out)
1670}
1671
1672fn decode_result_cache_payload(mut input: &[u8]) -> Option<(RuntimeQueryResult, HashSet<String>)> {
1673    if input.len() < RESULT_CACHE_PAYLOAD_MAGIC.len()
1674        || &input[..RESULT_CACHE_PAYLOAD_MAGIC.len()] != RESULT_CACHE_PAYLOAD_MAGIC
1675    {
1676        return None;
1677    }
1678    input = &input[RESULT_CACHE_PAYLOAD_MAGIC.len()..];
1679
1680    let query = read_string(&mut input)?;
1681    let mode = mode_from_byte(read_u8(&mut input)?)?;
1682    let statement = result_cache_static_str(&read_string(&mut input)?)?;
1683    let engine = result_cache_static_str(&read_string(&mut input)?)?;
1684    let affected_rows = read_u64(&mut input)?;
1685    let statement_type = result_cache_static_str(&read_string(&mut input)?)?;
1686
1687    let mut columns = Vec::new();
1688    for _ in 0..read_u32(&mut input)? {
1689        columns.push(read_string(&mut input)?);
1690    }
1691    let stats = crate::storage::query::unified::QueryStats {
1692        nodes_scanned: read_u64(&mut input)?,
1693        edges_scanned: read_u64(&mut input)?,
1694        rows_scanned: read_u64(&mut input)?,
1695        exec_time_us: read_u64(&mut input)?,
1696    };
1697
1698    let mut records = Vec::new();
1699    for _ in 0..read_u32(&mut input)? {
1700        let mut record = crate::storage::query::unified::UnifiedRecord::new();
1701        for _ in 0..read_u32(&mut input)? {
1702            let name = read_string(&mut input)?;
1703            let bytes = read_bytes(&mut input)?;
1704            let (value, used) = crate::storage::schema::value_codec::decode(bytes).ok()?;
1705            if used != bytes.len() {
1706                return None;
1707            }
1708            record.set_owned(name, value);
1709        }
1710        records.push(record);
1711    }
1712
1713    let mut scopes = HashSet::new();
1714    for _ in 0..read_u32(&mut input)? {
1715        scopes.insert(read_string(&mut input)?);
1716    }
1717    if !input.is_empty() {
1718        return None;
1719    }
1720
1721    Some((
1722        RuntimeQueryResult {
1723            query,
1724            mode,
1725            statement,
1726            engine,
1727            result: crate::storage::query::unified::UnifiedResult {
1728                columns,
1729                records,
1730                stats,
1731                pre_serialized_json: None,
1732            },
1733            affected_rows,
1734            statement_type,
1735        },
1736        scopes,
1737    ))
1738}
1739
/// If `sql` starts with the keyword `EXPLAIN` (ASCII case-insensitive,
/// leading whitespace ignored) followed by a statement, return that
/// inner statement with its leading whitespace trimmed; otherwise
/// return `None`.
///
/// Two follow-up keywords are deliberately left to the normal parser
/// and therefore also yield `None`:
///
/// - `EXPLAIN ALTER FOR CREATE TABLE ...` is a separate schema-diff
///   command handled inside the normal SQL parser.
/// - `EXPLAIN ASK` is an executable read path: it runs retrieval and
///   provider selection, then short-circuits before the LLM call.
///
/// A bare `EXPLAIN` with nothing after it yields `None` as well.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    // No whitespace at all means there is no inner statement to return.
    let (head, rest) = sql.trim_start().split_once(char::is_whitespace)?;
    if !head.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    let rest = rest.trim_start();
    // Peek the next token; `None` here means nothing followed `EXPLAIN`.
    let next_keyword = rest.split_whitespace().next()?;
    let deferred_to_parser = next_keyword.eq_ignore_ascii_case("ALTER")
        || next_keyword.eq_ignore_ascii_case("ASK");
    if deferred_to_parser {
        return None;
    }
    Some(rest)
}
1778
1779/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1780/// CTE-aware parse in `execute_query` without paying for a full
1781/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1782/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1783pub(super) fn has_with_prefix(sql: &str) -> bool {
1784    let trimmed = sql.trim_start();
1785    let head_end = trimmed
1786        .find(|c: char| c.is_whitespace() || c == '(')
1787        .unwrap_or(trimmed.len());
1788    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1789}
1790
1791/// If the query is a plain SELECT whose top-level `TableQuery`
1792/// carries an `AS OF` clause, return a typed spec that the runtime
1793/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1794/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1795/// back to the connection's regular MVCC snapshot. A cheap textual
1796/// prefilter skips the parse entirely when the source doesn't
1797/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
1798fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1799    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1800}
1801
1802/// Same as `peek_top_level_as_of` but also returns the table name
1803/// targeted by the AS OF clause (when the FROM clause names a
1804/// concrete table). `None` for the table slot means scalar SELECT
1805/// or a subquery source — callers treat those as "no enforcement".
1806pub(super) fn peek_top_level_as_of_with_table(
1807    sql: &str,
1808) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
1809    if !sql
1810        .as_bytes()
1811        .windows(5)
1812        .any(|w| w.eq_ignore_ascii_case(b"as of"))
1813    {
1814        return None;
1815    }
1816    let parsed = crate::storage::query::parser::parse(sql).ok()?;
1817    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
1818        return None;
1819    };
1820    let clause = table.as_of?;
1821    let table_name = if table.table.is_empty() || table.table == "any" {
1822        None
1823    } else {
1824        Some(table.table.clone())
1825    };
1826    let spec = match clause {
1827        crate::storage::query::ast::AsOfClause::Commit(h) => {
1828            crate::application::vcs::AsOfSpec::Commit(h)
1829        }
1830        crate::storage::query::ast::AsOfClause::Branch(b) => {
1831            crate::application::vcs::AsOfSpec::Branch(b)
1832        }
1833        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
1834        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
1835            crate::application::vcs::AsOfSpec::TimestampMs(ts)
1836        }
1837        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
1838            crate::application::vcs::AsOfSpec::Snapshot(x)
1839        }
1840    };
1841    Some((spec, table_name))
1842}
1843
1844pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
1845    // Lowercase the bytes up to the first null/newline into a small
1846    // stack buffer for cheap contains() checks. Most SQL fits in the
1847    // buffer; longer queries fall back to owned lowercase.
1848    const VOLATILE_TOKENS: &[&str] = &[
1849        "pg_advisory_lock",
1850        "pg_try_advisory_lock",
1851        "pg_advisory_unlock",
1852        "random()",
1853        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
1854        // omitted for now — they ARE volatile but today's tests rely
1855        // on caching them. Revisit once a tighter volatility story
1856        // lands.
1857    ];
1858    let lowered = sql.to_ascii_lowercase();
1859    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
1860}
1861
1862pub(super) fn query_is_ask_statement(sql: &str) -> bool {
1863    let trimmed = sql.trim_start();
1864    let head_end = trimmed
1865        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
1866        .unwrap_or(trimmed.len());
1867    trimmed[..head_end].eq_ignore_ascii_case("ASK")
1868}
1869
1870/// Pick the `(global_mode, collection_mode)` pair for an expression,
1871/// or `None` for variants that opt out of intent-locking entirely
1872/// (admin statements like `SHOW CONFIG`, transaction control, tenant
1873/// toggles).
1874///
1875/// Phase-1 contract:
1876/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
1877/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
1878/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
1879pub(super) fn intent_lock_modes_for(
1880    expr: &QueryExpr,
1881) -> Option<(
1882    crate::storage::transaction::lock::LockMode,
1883    crate::storage::transaction::lock::LockMode,
1884)> {
1885    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
1886
1887    match expr {
1888        // Reads — IS / IS.
1889        QueryExpr::Table(_)
1890        | QueryExpr::Join(_)
1891        | QueryExpr::Vector(_)
1892        | QueryExpr::Hybrid(_)
1893        | QueryExpr::Graph(_)
1894        | QueryExpr::Path(_)
1895        | QueryExpr::Ask(_)
1896        | QueryExpr::SearchCommand(_)
1897        | QueryExpr::GraphCommand(_)
1898        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
1899
1900        // Writes — IX / IX. Non-tabular mutations (vector insert,
1901        // graph node insert, queue push, timeseries point insert)
1902        // don't carry their own dispatch arm here; they ride through
1903        // the Insert variant or a command variant covered by the
1904        // read-side arm above. P1.T4 expands only the TableQuery-ish
1905        // writes; non-tabular kinds inherit when their DML variants
1906        // land in later phases.
1907        QueryExpr::Insert(_)
1908        | QueryExpr::Update(_)
1909        | QueryExpr::Delete(_)
1910        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
1911            Some((IntentExclusive, IntentExclusive))
1912        }
1913        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
1914
1915        // DDL — IX / X. A DDL against collection `c` blocks all
1916        // other writers + readers on `c` but leaves other collections
1917        // running (because Global stays IX, not X).
1918        QueryExpr::CreateTable(_)
1919        | QueryExpr::CreateCollection(_)
1920        | QueryExpr::CreateVector(_)
1921        | QueryExpr::DropTable(_)
1922        | QueryExpr::DropGraph(_)
1923        | QueryExpr::DropVector(_)
1924        | QueryExpr::DropDocument(_)
1925        | QueryExpr::DropKv(_)
1926        | QueryExpr::DropCollection(_)
1927        | QueryExpr::Truncate(_)
1928        | QueryExpr::AlterTable(_)
1929        | QueryExpr::CreateIndex(_)
1930        | QueryExpr::DropIndex(_)
1931        | QueryExpr::CreateTimeSeries(_)
1932        | QueryExpr::DropTimeSeries(_)
1933        | QueryExpr::CreateQueue(_)
1934        | QueryExpr::AlterQueue(_)
1935        | QueryExpr::DropQueue(_)
1936        | QueryExpr::CreateTree(_)
1937        | QueryExpr::DropTree(_)
1938        | QueryExpr::CreatePolicy(_)
1939        | QueryExpr::DropPolicy(_)
1940        | QueryExpr::CreateView(_)
1941        | QueryExpr::DropView(_)
1942        | QueryExpr::RefreshMaterializedView(_)
1943        | QueryExpr::CreateSchema(_)
1944        | QueryExpr::DropSchema(_)
1945        | QueryExpr::CreateSequence(_)
1946        | QueryExpr::DropSequence(_)
1947        | QueryExpr::CreateServer(_)
1948        | QueryExpr::DropServer(_)
1949        | QueryExpr::CreateForeignTable(_)
1950        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
1951
1952        // Admin / control — skip intent locks. `SET TENANT`,
1953        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
1954        // `VACUUM`, etc. don't touch collection data the same way
1955        // and the existing transaction layer already serialises the
1956        // pieces that matter.
1957        _ => None,
1958    }
1959}
1960
1961/// Best-effort collection inventory for an expression. Used to pick
1962/// `Collection(...)` resources for the intent-lock guard. Overshoots
1963/// are fine (take an extra IS, benign); undershoots leak writes past
1964/// DDL X locks, so err on the side of listing more names.
1965pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
1966    let mut out = Vec::new();
1967    walk_collections(expr, &mut out);
1968    out.sort();
1969    out.dedup();
1970    out
1971}
1972
1973fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
1974    match expr {
1975        QueryExpr::Table(t) => out.push(t.table.clone()),
1976        QueryExpr::Join(j) => {
1977            walk_collections(&j.left, out);
1978            walk_collections(&j.right, out);
1979        }
1980        QueryExpr::Insert(i) => out.push(i.table.clone()),
1981        QueryExpr::Update(u) => out.push(u.table.clone()),
1982        QueryExpr::Delete(d) => out.push(d.table.clone()),
1983        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
1984
1985        // DDL — include the target collection so DDL takes
1986        // `(Collection, X)` and blocks concurrent readers / writers
1987        // on the same collection. Other collections stay live
1988        // because Global is still IX.
1989        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
1990        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
1991        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
1992        QueryExpr::DropTable(q) => out.push(q.name.clone()),
1993        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
1994        QueryExpr::DropVector(q) => out.push(q.name.clone()),
1995        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
1996        QueryExpr::DropKv(q) => out.push(q.name.clone()),
1997        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
1998        QueryExpr::Truncate(q) => out.push(q.name.clone()),
1999        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
2000        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
2001        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
2002        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
2003        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
2004        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
2005        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
2006        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
2007        QueryExpr::QueueCommand(QueueCommand::Move {
2008            source,
2009            destination,
2010            ..
2011        }) => {
2012            out.push(source.clone());
2013            out.push(destination.clone());
2014        }
2015        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
2016        QueryExpr::CreateView(q) => out.push(q.name.clone()),
2017        QueryExpr::DropView(q) => out.push(q.name.clone()),
2018        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
2019
2020        // Vector / Hybrid / Graph / Path / commands reference
2021        // collections through fields whose shape varies; without a
2022        // uniform accessor we fall back to the global lock only —
2023        // benign because every runtime path still holds the global
2024        // mode.
2025        _ => {}
2026    }
2027}
2028
2029impl RedDBRuntime {
2030    pub fn in_memory() -> RedDBResult<Self> {
2031        Self::with_options(RedDBOptions::in_memory())
2032    }
2033
2034    /// Handle to the intent-lock manager for tests + introspection.
2035    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
2036    /// rather than touching the manager directly.
2037    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
2038        self.inner.lock_manager.clone()
2039    }
2040
2041    #[inline(never)]
2042    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
2043        Self::with_pool(options, ConnectionPoolConfig::default())
2044    }
2045
2046    pub fn with_pool(
2047        options: RedDBOptions,
2048        pool_config: ConnectionPoolConfig,
2049    ) -> RedDBResult<Self> {
2050        // PLAN.md Phase 9.1 — capture wall-clock before storage
2051        // open so the cold-start phase markers can be backfilled
2052        // once Lifecycle is constructed below. Storage open
2053        // encapsulates auto-restore + WAL replay; we treat the
2054        // whole window as one combined "restore" + "wal_replay"
2055        // phase split at the same boundary because the storage
2056        // layer doesn't yet emit a finer signal.
2057        let boot_open_start_ms = std::time::SystemTime::now()
2058            .duration_since(std::time::UNIX_EPOCH)
2059            .map(|d| d.as_millis() as u64)
2060            .unwrap_or(0);
2061        let db = Arc::new(
2062            RedDB::open_with_options(&options)
2063                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2064        );
2065        let result_blob_cache = crate::storage::cache::BlobCache::open_with_l2(
2066            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2067                options
2068                    .resolved_path("data.rdb")
2069                    .with_extension("result-cache.l2"),
2070            ),
2071        )
2072        .map_err(|err| {
2073            RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}"))
2074        })?;
2075        let storage_ready_ms = std::time::SystemTime::now()
2076            .duration_since(std::time::UNIX_EPOCH)
2077            .map(|d| d.as_millis() as u64)
2078            .unwrap_or(0);
2079
2080        let runtime = Self {
2081            inner: Arc::new(RuntimeInner {
2082                db,
2083                layout: PhysicalLayout::from_options(&options),
2084                indices: IndexCatalog::register_default_vector_graph(
2085                    options.has_capability(crate::api::Capability::Table),
2086                    options.has_capability(crate::api::Capability::Graph),
2087                ),
2088                pool_config,
2089                pool: Mutex::new(PoolState::default()),
2090                started_at_unix_ms: SystemTime::now()
2091                    .duration_since(UNIX_EPOCH)
2092                    .unwrap_or_default()
2093                    .as_millis(),
2094                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2095                index_store: super::index_store::IndexStore::new(),
2096                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2097                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2098                query_cache: parking_lot::RwLock::new(
2099                    crate::storage::query::planner::cache::PlanCache::new(1000),
2100                ),
2101                result_cache: parking_lot::RwLock::new((
2102                    HashMap::new(),
2103                    std::collections::VecDeque::new(),
2104                )),
2105                result_blob_cache,
2106                result_blob_entries: parking_lot::RwLock::new((
2107                    HashMap::new(),
2108                    std::collections::VecDeque::new(),
2109                )),
2110                ask_answer_cache_entries: parking_lot::RwLock::new((
2111                    HashSet::new(),
2112                    std::collections::VecDeque::new(),
2113                )),
2114                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2115                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2116                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2117                rmw_locks: RmwLockTable::new(),
2118                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2119                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2120                ec_worker: crate::ec::worker::EcWorker::new(),
2121                auth_store: parking_lot::RwLock::new(None),
2122                oauth_validator: parking_lot::RwLock::new(None),
2123                views: parking_lot::RwLock::new(HashMap::new()),
2124                materialized_views: parking_lot::RwLock::new(
2125                    crate::storage::cache::result::MaterializedViewCache::new(),
2126                ),
2127                retention_sweeper: parking_lot::RwLock::new(
2128                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
2129                ),
2130                snapshot_manager: Arc::new(
2131                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2132                ),
2133                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2134                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2135                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2136                lock_manager: Arc::new({
2137                    // Sourced from the matrix: Tier B key
2138                    // `concurrency.locking.deadlock_timeout_ms`
2139                    // (default 5000). Env var wins at boot so
2140                    // operators can tune without touching red_config.
2141                    let env = crate::runtime::config_overlay::collect_env_overrides();
2142                    let timeout_ms = env
2143                        .get("concurrency.locking.deadlock_timeout_ms")
2144                        .and_then(|raw| raw.parse::<u64>().ok())
2145                        .unwrap_or_else(|| {
2146                            match crate::runtime::config_matrix::default_for(
2147                                "concurrency.locking.deadlock_timeout_ms",
2148                            ) {
2149                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2150                                _ => 5000,
2151                            }
2152                        });
2153                    let cfg = crate::storage::transaction::lock::LockConfig {
2154                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2155                        ..Default::default()
2156                    };
2157                    crate::storage::transaction::lock::LockManager::new(cfg)
2158                }),
2159                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2160                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2161                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2162                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2163                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2164                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2165                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2166                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2167                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2168                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2169                    &options,
2170                )),
2171                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2172                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2173                audit_log: {
2174                    // Default audit-log path for the in-memory case
2175                    // sits in the system temp dir; persistent runs
2176                    // place it next to data.rdb.
2177                    //
2178                    // gh-471 iter 2: route through the resolved
2179                    // `LogDestination`. Performance/Max tiers emit a
2180                    // `File(...)` under `<dbname>.rdb.red/logs/`;
2181                    // lower tiers / ephemeral runs report `Stderr`
2182                    // and we keep the legacy file-next-to-data sink.
2183                    let data_path = options
2184                        .data_path
2185                        .clone()
2186                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2187                    let (audit_dest, _) = crate::api::tier_wiring::current_log_destinations();
2188                    Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
2189                        &audit_dest,
2190                        &data_path,
2191                    ))
2192                },
2193                lease_lifecycle: std::sync::OnceLock::new(),
2194                replica_apply_metrics: crate::replication::logical::ReplicaApplyMetrics::default(),
2195                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2196                schema_vocabulary: parking_lot::RwLock::new(
2197                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2198                ),
2199                slow_query_logger: {
2200                    // Issue #205 — slow-query sink lives in the same
2201                    // directory the audit log uses, so backup/restore
2202                    // ships them together. Threshold + sample-pct
2203                    // default conservatively (1 s, 100% sampling) so
2204                    // emitted lines are rare and complete. Operators
2205                    // tune via env / config matrix in a follow-up.
2206                    //
2207                    // gh-471 iter 2: same routing as the audit log —
2208                    // `LogDestination::File(...)` for Performance/Max
2209                    // lands under `<dbname>.rdb.red/logs/slow.log`;
2210                    // lower tiers fall back to `red-slow.log` in the
2211                    // data directory.
2212                    let fallback_dir = options
2213                        .data_path
2214                        .as_ref()
2215                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2216                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2217                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2218                        .ok()
2219                        .and_then(|s| s.parse::<u64>().ok())
2220                        .unwrap_or(1000);
2221                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2222                        .ok()
2223                        .and_then(|s| s.parse::<u8>().ok())
2224                        .unwrap_or(100);
2225                    let (_, slow_dest) = crate::api::tier_wiring::current_log_destinations();
2226                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
2227                        &slow_dest,
2228                        &fallback_dir,
2229                        threshold_ms,
2230                        sample_pct,
2231                    )
2232                },
2233                kv_stats: crate::runtime::KvStatsCounters::default(),
2234                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
2235                metrics_tenant_activity_stats:
2236                    crate::runtime::MetricsTenantActivityCounters::default(),
2237                queue_telemetry: Arc::new(
2238                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
2239                ),
2240                kv_tag_index: crate::runtime::KvTagIndex::default(),
2241                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
2242                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
2243            }),
2244        };
2245
2246        // Issue #205 — install the process-wide OperatorEvent sink so
2247        // emit sites buried in storage / replication / signal handlers
2248        // can record without threading an `&AuditLogger` through every
2249        // call stack. First registration wins; subsequent in-memory
2250        // runtimes (test harnesses) fall through to tracing+eprintln.
2251        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2252            &runtime.inner.audit_log,
2253        ));
2254
2255        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2256        // from the wall-clock captured before storage open. The
2257        // entire `RedDB::open_with_options` call covers both
2258        // auto-restore (when configured) and WAL replay. We
2259        // record both phases against the same boundary today;
2260        // a follow-up will split them once the storage layer
2261        // surfaces a finer-grained event.
2262        runtime
2263            .inner
2264            .lifecycle
2265            .set_restore_started_at_ms(boot_open_start_ms);
2266        runtime
2267            .inner
2268            .lifecycle
2269            .set_restore_ready_at_ms(storage_ready_ms);
2270        runtime
2271            .inner
2272            .lifecycle
2273            .set_wal_replay_started_at_ms(boot_open_start_ms);
2274        runtime
2275            .inner
2276            .lifecycle
2277            .set_wal_replay_ready_at_ms(storage_ready_ms);
2278
2279        let restored_cdc_lsn = runtime
2280            .inner
2281            .db
2282            .replication
2283            .as_ref()
2284            .map(|repl| {
2285                repl.logical_wal_spool
2286                    .as_ref()
2287                    .map(|spool| spool.current_lsn())
2288                    .unwrap_or(0)
2289            })
2290            .unwrap_or(0)
2291            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2292        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2293        runtime.rehydrate_snapshot_xid_floor();
2294        runtime.bootstrap_system_keyed_collections()?;
2295        runtime.rehydrate_declared_column_schemas();
2296        runtime.load_probabilistic_state()?;
2297
2298        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2299        // tables declared via `TENANT BY (col)` survive restart. Each
2300        // entry re-registers the auto-policy and flips RLS on again.
2301        runtime.rehydrate_tenant_tables();
2302        // Issue #593 slice 9a — replay persisted materialized-view
2303        // descriptors so `CREATE MATERIALIZED VIEW v AS …` survives a
2304        // restart. Runs after the system-keyed collections bootstrap
2305        // and before the API opens.
2306        runtime.rehydrate_materialized_view_descriptors();
2307        if let Some(repl) = &runtime.inner.db.replication {
2308            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2309        }
2310
2311        // Save system info to red_config on boot
2312        {
2313            let sys = SystemInfo::collect();
2314            runtime.inner.db.store().set_config_tree(
2315                "red.system",
2316                &crate::serde_json::json!({
2317                    "pid": sys.pid,
2318                    "cpu_cores": sys.cpu_cores,
2319                    "total_memory_bytes": sys.total_memory_bytes,
2320                    "available_memory_bytes": sys.available_memory_bytes,
2321                    "os": sys.os,
2322                    "arch": sys.arch,
2323                    "hostname": sys.hostname,
2324                    "started_at": SystemTime::now()
2325                        .duration_since(UNIX_EPOCH)
2326                        .unwrap_or_default()
2327                        .as_millis() as u64
2328                }),
2329            );
2330
2331            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2332            let store = runtime.inner.db.store();
2333            if store
2334                .get_collection("red_config")
2335                .map(|m| m.query_all(|_| true).len())
2336                .unwrap_or(0)
2337                <= 10
2338            {
2339                store.set_config_tree("red.ai", &crate::json!({
2340                    "default": crate::json!({
2341                        "provider": "openai",
2342                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2343                    }),
2344                    "max_embedding_inputs": 256,
2345                    "max_prompt_batch": 256,
2346                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2347                }));
2348                store.set_config_tree(
2349                    "red.server",
2350                    &crate::json!({
2351                        "max_scan_limit": 1000,
2352                        "max_body_size": 1048576,
2353                        "read_timeout_ms": 5000,
2354                        "write_timeout_ms": 5000
2355                    }),
2356                );
2357                store.set_config_tree(
2358                    "red.storage",
2359                    &crate::json!({
2360                        "page_size": 4096,
2361                        "page_cache_capacity": 100000,
2362                        "auto_checkpoint_pages": 1000,
2363                        "snapshot_retention": 16,
2364                        "verify_checksums": true,
2365                        "segment": crate::json!({
2366                            "max_entities": 100000,
2367                            "max_bytes": 268435456_u64,
2368                            "compression_level": 6
2369                        }),
2370                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2371                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2372                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2373                    }),
2374                );
2375                store.set_config_tree(
2376                    "red.search",
2377                    &crate::json!({
2378                        "rag": crate::json!({
2379                            "max_chunks_per_source": 10,
2380                            "max_total_chunks": 25,
2381                            "similarity_threshold": 0.8,
2382                            "graph_depth": 2,
2383                            "min_relevance": 0.3
2384                        }),
2385                        "fusion": crate::json!({
2386                            "vector_weight": 0.5,
2387                            "graph_weight": 0.3,
2388                            "table_weight": 0.2,
2389                            "dedup_threshold": 0.85
2390                        })
2391                    }),
2392                );
2393                store.set_config_tree(
2394                    "red.auth",
2395                    &crate::json!({
2396                        "enabled": false,
2397                        "session_ttl_secs": 3600,
2398                        "require_auth": false
2399                    }),
2400                );
2401                store.set_config_tree(
2402                    "red.query",
2403                    &crate::json!({
2404                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2405                        "max_recursion_depth": 1000
2406                    }),
2407                );
2408                store.set_config_tree(
2409                    "red.indexes",
2410                    &crate::json!({
2411                        "auto_select": true,
2412                        "bloom_filter": crate::json!({
2413                            "enabled": true,
2414                            "false_positive_rate": 0.01,
2415                            "prune_on_scan": true
2416                        }),
2417                        "hash": crate::json!({ "enabled": true }),
2418                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2419                        "spatial": crate::json!({ "enabled": true })
2420                    }),
2421                );
2422                store.set_config_tree(
2423                    "red.memtable",
2424                    &crate::json!({
2425                        "enabled": true,
2426                        "max_bytes": 67108864_u64,
2427                        "flush_threshold": 0.75
2428                    }),
2429                );
2430                store.set_config_tree(
2431                    "red.probabilistic",
2432                    &crate::json!({
2433                        "hll_registers": 16384,
2434                        "sketch_default_width": 1000,
2435                        "sketch_default_depth": 5,
2436                        "filter_default_capacity": 100000
2437                    }),
2438                );
2439                store.set_config_tree(
2440                    "red.timeseries",
2441                    &crate::json!({
2442                        "default_chunk_size": 1024,
2443                        "compression": crate::json!({
2444                            "timestamps": "delta_of_delta",
2445                            "values": "gorilla_xor"
2446                        }),
2447                        "default_retention_days": 0
2448                    }),
2449                );
2450                store.set_config_tree(
2451                    "red.queue",
2452                    &crate::json!({
2453                        "default_max_size": 0,
2454                        "default_max_attempts": 3,
2455                        "visibility_timeout_ms": 30000,
2456                        "consumer_idle_timeout_ms": 60000
2457                    }),
2458                );
2459                store.set_config_tree(
2460                    "red.backup",
2461                    &crate::json!({
2462                        "enabled": false,
2463                        "interval_secs": 3600,
2464                        "retention_count": 24,
2465                        "upload": false,
2466                        "backend": "local"
2467                    }),
2468                );
2469                store.set_config_tree(
2470                    "red.wal",
2471                    &crate::json!({
2472                        "archive": crate::json!({
2473                            "enabled": false,
2474                            "retention_hours": 168,
2475                            "prefix": "wal/"
2476                        })
2477                    }),
2478                );
2479                store.set_config_tree(
2480                    "red.cdc",
2481                    &crate::json!({
2482                        "enabled": true,
2483                        "buffer_size": 100000
2484                    }),
2485                );
2486                store.set_config_tree(
2487                    "red.config.secret",
2488                    &crate::json!({
2489                        "auto_encrypt": true,
2490                        "auto_decrypt": true
2491                    }),
2492                );
2493            }
2494
2495            // Perf-parity config matrix: heal the Tier A (critical)
2496            // keys unconditionally on every boot. Idempotent — only
2497            // writes the default when the key is missing. Keeps
2498            // `SHOW CONFIG` showing every guarantee the operator has
2499            // (durability.mode, concurrency.locking.enabled, …) even
2500            // on long-running datadirs that predate the matrix.
2501            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2502
2503            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2504            // `storage.btree.lehman_yao` value from the matrix (env
2505            // > file > red_config > default) and publish it to the
2506            // storage layer's atomic so the B-tree read / split
2507            // paths can branch without re-reading the config on
2508            // every hot-path call.
2509            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2510            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2511            if lehman_yao {
2512                tracing::info!(
2513                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2514                );
2515            }
2516
2517            // Config file overlay — mounted `/etc/reddb/config.json`
2518            // (override path via REDDB_CONFIG_FILE). Writes keys with
2519            // write-if-absent semantics so a later user `SET CONFIG`
2520            // always wins. Missing file = silent no-op.
2521            let overlay_path = crate::runtime::config_overlay::config_file_path();
2522            let _ =
2523                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2524        }
2525
2526        // VCS ("Git for Data") — create the `red_*` metadata
2527        // collections on first boot. Idempotent: `get_or_create_collection`
2528        // is a no-op if the collection already exists.
2529        {
2530            let store = runtime.inner.db.store();
2531            for name in crate::application::vcs_collections::ALL {
2532                let _ = store.get_or_create_collection(*name);
2533            }
2534            // Seed VCS config namespace with sensible defaults on first
2535            // boot, matching the pattern used by red.ai / red.storage.
2536            store.set_config_tree(
2537                crate::application::vcs_collections::CONFIG_NAMESPACE,
2538                &crate::json!({
2539                    "default_branch": "main",
2540                    "author": crate::json!({
2541                        "name": "reddb",
2542                        "email": "reddb@localhost"
2543                    }),
2544                    "protected_branches": crate::json!(["main"]),
2545                    "closure": crate::json!({
2546                        "enabled": true,
2547                        "lazy": true
2548                    }),
2549                    "merge": crate::json!({
2550                        "default_strategy": "auto",
2551                        "fast_forward": true
2552                    })
2553                }),
2554            );
2555        }
2556
2557        // Migrations — create the `red_migrations` / `red_migration_deps`
2558        // system collections on first boot. Idempotent.
2559        {
2560            let store = runtime.inner.db.store();
2561            for name in crate::application::migration_collections::ALL {
2562                let _ = store.get_or_create_collection(*name);
2563            }
2564        }
2565
2566        // Start background maintenance thread (context index refresh +
2567        // session purge). Held by a WEAK reference to `RuntimeInner`
2568        // so dropping the last `RedDBRuntime` handle actually releases
2569        // the underlying Arc<Pager> (and its file lock). Polling at
2570        // 200ms means shutdown latency is bounded; the real 60-second
2571        // work cadence is tracked independently via a `last_work`
2572        // timestamp.
2573        //
2574        // The previous version captured `rt = runtime.clone()` by
2575        // strong reference and ran an unterminated `loop`, which held
2576        // Arc<RuntimeInner> forever — reopening a persistent database
2577        // in the same process failed with "Database is locked" because
2578        // the pager could never drop. See the regression test
2579        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2580        {
2581            let weak = Arc::downgrade(&runtime.inner);
2582            std::thread::Builder::new()
2583                .name("reddb-maintenance".into())
2584                .spawn(move || {
2585                    let tick = std::time::Duration::from_millis(200);
2586                    let work_interval = std::time::Duration::from_secs(60);
2587                    let mut last_work = std::time::Instant::now();
2588                    loop {
2589                        std::thread::sleep(tick);
2590                        let Some(inner) = weak.upgrade() else {
2591                            // All strong references dropped — the
2592                            // runtime is gone, exit cleanly.
2593                            break;
2594                        };
2595                        if last_work.elapsed() >= work_interval {
2596                            let _stats = inner.db.store().context_index().stats();
2597                            last_work = std::time::Instant::now();
2598                        }
2599                    }
2600                })
2601                .ok();
2602        }
2603
2604        // Start backup scheduler if enabled via red_config
2605        {
2606            let store = runtime.inner.db.store();
2607            let mut backup_enabled = false;
2608            let mut backup_interval = 3600u64;
2609
2610            if let Some(manager) = store.get_collection("red_config") {
2611                manager.for_each_entity(|entity| {
2612                    if let Some(row) = entity.data.as_row() {
2613                        let key = row.get_field("key").and_then(|v| match v {
2614                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2615                            _ => None,
2616                        });
2617                        let val = row.get_field("value");
2618                        if key == Some("red.config.backup.enabled") {
2619                            backup_enabled = match val {
2620                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2621                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2622                                _ => false,
2623                            };
2624                        } else if key == Some("red.config.backup.interval_secs") {
2625                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2626                                backup_interval = *n as u64;
2627                            }
2628                        }
2629                    }
2630                    true
2631                });
2632            }
2633
2634            if backup_enabled {
2635                runtime.inner.backup_scheduler.set_interval(backup_interval);
2636                let rt = runtime.clone();
2637                runtime
2638                    .inner
2639                    .backup_scheduler
2640                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
2641            }
2642        }
2643
2644        // Load EC registry from red_config and start worker
2645        {
2646            runtime
2647                .inner
2648                .ec_registry
2649                .load_from_config_store(runtime.inner.db.store().as_ref());
2650            if !runtime.inner.ec_registry.async_configs().is_empty() {
2651                runtime.inner.ec_worker.start(
2652                    Arc::clone(&runtime.inner.ec_registry),
2653                    Arc::clone(&runtime.inner.db.store()),
2654                );
2655            }
2656        }
2657
2658        if let crate::replication::ReplicationRole::Replica { primary_addr } =
2659            runtime.inner.db.options().replication.role.clone()
2660        {
2661            let rt = runtime.clone();
2662            std::thread::Builder::new()
2663                .name("reddb-replica".into())
2664                .spawn(move || rt.run_replica_loop(primary_addr))
2665                .ok();
2666        }
2667
2668        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
2669        // boot stage above has completed (WAL replay, restore-from-
2670        // remote, replica-loop spawn). Health probes flip from 503 to
2671        // 200 here; shutdown begins from this state.
2672        runtime.inner.lifecycle.mark_ready();
2673
2674        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
2675        // Low-priority background ticker that drains the cache's
2676        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
2677        // so the thread exits cleanly when the runtime drops (≤50ms
2678        // latency between drop and exit). Materialized views without
2679        // a `REFRESH EVERY` clause stay on the manual-refresh path
2680        // and are skipped by `claim_due_at`, so the loop is a no-op
2681        // when no scheduled views exist.
2682        {
2683            let weak_inner = Arc::downgrade(&runtime.inner);
2684            std::thread::Builder::new()
2685                .name("reddb-mv-scheduler".into())
2686                .spawn(move || loop {
2687                    std::thread::sleep(std::time::Duration::from_millis(50));
2688                    let Some(inner) = weak_inner.upgrade() else {
2689                        break;
2690                    };
2691                    let rt = RedDBRuntime { inner };
2692                    rt.refresh_due_materialized_views();
2693                })
2694                .ok();
2695        }
2696
2697        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
2698        // Low-priority ticker that physically reclaims rows whose
2699        // timestamp has fallen beyond the retention window. Holds a
2700        // `Weak<RuntimeInner>` so the thread exits within one tick of
2701        // the runtime drop (graceful shutdown leaves storage consistent
2702        // because each tick goes through the standard DELETE path —
2703        // there is no half-finished mutation state to clean up). The
2704        // tick interval is intentionally longer than the MV scheduler
2705        // (500ms) because retention is order-of-seconds at minimum.
2706        {
2707            let weak_inner = Arc::downgrade(&runtime.inner);
2708            std::thread::Builder::new()
2709                .name("reddb-retention-sweeper".into())
2710                .spawn(move || loop {
2711                    std::thread::sleep(std::time::Duration::from_millis(500));
2712                    let Some(inner) = weak_inner.upgrade() else {
2713                        break;
2714                    };
2715                    let rt = RedDBRuntime { inner };
2716                    rt.sweep_retention_tick(
2717                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
2718                    );
2719                })
2720                .ok();
2721        }
2722
2723        Ok(runtime)
2724    }
2725
2726    fn rehydrate_snapshot_xid_floor(&self) {
2727        let store = self.inner.db.store();
2728        for collection in store.list_collections() {
2729            let Some(manager) = store.get_collection(&collection) else {
2730                continue;
2731            };
2732            for entity in manager.query_all(|_| true) {
2733                self.inner
2734                    .snapshot_manager
2735                    .observe_committed_xid(entity.xmin);
2736                self.inner
2737                    .snapshot_manager
2738                    .observe_committed_xid(entity.xmax);
2739            }
2740        }
2741    }
2742
2743    /// Provision an empty Table-shaped collection that backs a
2744    /// `CREATE MATERIALIZED VIEW v` (issue #594 slice 9b of #575).
2745    /// `SELECT FROM v` reads this collection directly; the rewriter is
2746    /// configured to skip materialized views so the body is no longer
2747    /// substituted. REFRESH still writes to the cache slot — wiring it
2748    /// into this backing collection is the job of slice 9c.
2749    ///
2750    /// Idempotent: re-running for the same name leaves the existing
2751    /// collection in place (mirrors `CREATE TABLE IF NOT EXISTS`
2752    /// semantics). This keeps `CREATE OR REPLACE MATERIALIZED VIEW v`
2753    /// cheap — the body change does not invalidate already-buffered
2754    /// rows. Until 9c lands the backing is always empty anyway.
2755    pub(crate) fn ensure_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
2756        let store = self.inner.db.store();
2757        let mut changed = false;
2758        if store.get_collection(name).is_none() {
2759            store.get_or_create_collection(name);
2760            changed = true;
2761        }
2762        if self.inner.db.collection_contract(name).is_none() {
2763            self.inner
2764                .db
2765                .save_collection_contract(system_keyed_collection_contract(
2766                    name,
2767                    crate::catalog::CollectionModel::Table,
2768                ))
2769                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2770            changed = true;
2771        }
2772        if changed {
2773            self.inner
2774                .db
2775                .persist_metadata()
2776                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2777        }
2778        Ok(())
2779    }
2780
2781    /// Inverse of [`ensure_materialized_view_backing`] — drops the
2782    /// backing collection on `DROP MATERIALIZED VIEW v`. No-op when
2783    /// the collection was never created (e.g. a `DROP MATERIALIZED
2784    /// VIEW IF EXISTS v` against an unknown name).
2785    pub(crate) fn drop_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
2786        let store = self.inner.db.store();
2787        if store.get_collection(name).is_none() {
2788            return Ok(());
2789        }
2790        store
2791            .drop_collection(name)
2792            .map_err(|err| RedDBError::Internal(err.to_string()))?;
2793        // The contract may have been dropped already (DROP TABLE path)
2794        // — ignore "not found" errors by checking presence first.
2795        if self.inner.db.collection_contract(name).is_some() {
2796            self.inner
2797                .db
2798                .remove_collection_contract(name)
2799                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2800        }
2801        self.invalidate_result_cache();
2802        self.inner
2803            .db
2804            .persist_metadata()
2805            .map_err(|err| RedDBError::Internal(err.to_string()))?;
2806        Ok(())
2807    }
2808
2809    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
2810        let mut changed = false;
2811        for (name, model) in [
2812            ("red.config", crate::catalog::CollectionModel::Config),
2813            ("red.vault", crate::catalog::CollectionModel::Vault),
2814            // Issue #593 — materialized-view catalog. One row per
2815            // `CREATE MATERIALIZED VIEW`; rehydrated at boot before
2816            // the API opens.
2817            (
2818                crate::runtime::continuous_materialized_view::CATALOG_COLLECTION,
2819                crate::catalog::CollectionModel::Config,
2820            ),
2821        ] {
2822            if self.inner.db.store().get_collection(name).is_none() {
2823                self.inner.db.store().get_or_create_collection(name);
2824                changed = true;
2825            }
2826            if self.inner.db.collection_contract(name).is_none() {
2827                self.inner
2828                    .db
2829                    .save_collection_contract(system_keyed_collection_contract(name, model))
2830                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
2831                changed = true;
2832            }
2833        }
2834        if changed {
2835            self.inner
2836                .db
2837                .persist_metadata()
2838                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2839        }
2840        Ok(())
2841    }
2842
2843    pub fn db(&self) -> Arc<RedDB> {
2844        Arc::clone(&self.inner.db)
2845    }
2846
2847    /// Direct access to the runtime's secondary-index store.
2848    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
2849    /// wire bulk) that need to push new rows through the per-index
2850    /// maintenance hook after `store.bulk_insert` returns.
2851    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
2852        &self.inner.index_store
2853    }
2854
2855    /// Apply a DDL event to the schema-vocabulary reverse index
2856    /// (issue #120). Called by DDL execution paths after the catalog
2857    /// mutation has succeeded so the index never holds entries for
2858    /// half-applied DDL.
2859    pub(crate) fn schema_vocabulary_apply(
2860        &self,
2861        event: crate::runtime::schema_vocabulary::DdlEvent,
2862    ) {
2863        self.inner.schema_vocabulary.write().on_ddl(event);
2864    }
2865
2866    /// Lookup `token` in the schema-vocabulary reverse index. Returns
2867    /// an owned `Vec<VocabHit>` because the underlying read lock
2868    /// cannot be borrowed across the call boundary; the slice from
2869    /// `SchemaVocabulary::lookup` is cloned per hit.
2870    pub fn schema_vocabulary_lookup(
2871        &self,
2872        token: &str,
2873    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
2874        self.inner.schema_vocabulary.read().lookup(token).to_vec()
2875    }
2876
2877    /// Inject an AuthStore into the runtime. Called by server boot
2878    /// after the vault has been bootstrapped, so that `Value::Secret`
2879    /// auto-encrypt/decrypt can reach the vault AES key.
2880    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
2881        *self.inner.auth_store.write() = Some(store);
2882    }
2883
2884    /// Snapshot the current AuthStore (if any). Used by the wire listener
2885    /// to validate bearer tokens issued via HTTP `/auth/login`.
2886    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
2887        self.inner.auth_store.read().clone()
2888    }
2889
2890    /// Read a vault KV secret from the configured AuthStore, if present.
2891    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
2892        self.inner
2893            .auth_store
2894            .read()
2895            .as_ref()
2896            .and_then(|store| store.vault_kv_get(key))
2897    }
2898
2899    /// Write a vault KV secret and fail if the encrypted vault write is
2900    /// unavailable or cannot be made durable.
2901    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
2902        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
2903            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
2904        })?;
2905        store
2906            .vault_kv_try_set(key, value)
2907            .map_err(|err| RedDBError::Query(err.to_string()))
2908    }
2909
2910    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
2911    /// wire transports try OAuth JWT validation before falling back to
2912    /// the local AuthStore lookup. Pass `None` to disable.
2913    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
2914        *self.inner.oauth_validator.write() = validator;
2915    }
2916
2917    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
2918    /// Hot path: called per HTTP request when an Authorization header
2919    /// is present, so we hand back a cheap Arc clone.
2920    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
2921        self.inner.oauth_validator.read().clone()
2922    }
2923
2924    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
2925    /// store is wired and a key has been generated. Used by the
2926    /// `Value::Secret` encrypt/decrypt pipeline.
2927    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
2928        let guard = self.inner.auth_store.read();
2929        guard.as_ref().and_then(|s| s.vault_secret_key())
2930    }
2931
2932    /// Resolve a boolean flag from `red_config`. Defaults to `default`
2933    /// when the key is missing or not coercible. If the same key has
2934    /// been written multiple times (SET CONFIG appends new rows), the
2935    /// most recent entity wins. Env-var overrides
2936    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
2937    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
2938        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2939            if let Some(crate::storage::schema::Value::Boolean(b)) =
2940                crate::runtime::config_overlay::coerce_env_value(key, raw)
2941            {
2942                return b;
2943            }
2944        }
2945        let store = self.inner.db.store();
2946        let Some(manager) = store.get_collection("red_config") else {
2947            return default;
2948        };
2949        let mut result = default;
2950        let mut latest_id: u64 = 0;
2951        manager.for_each_entity(|entity| {
2952            if let Some(row) = entity.data.as_row() {
2953                let entry_key = row.get_field("key").and_then(|v| match v {
2954                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2955                    _ => None,
2956                });
2957                if entry_key == Some(key) {
2958                    let id = entity.id.raw();
2959                    if id >= latest_id {
2960                        latest_id = id;
2961                        result = match row.get_field("value") {
2962                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
2963                            Some(crate::storage::schema::Value::Text(s)) => {
2964                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
2965                            }
2966                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
2967                            _ => default,
2968                        };
2969                    }
2970                }
2971            }
2972            true
2973        });
2974        result
2975    }
2976
2977    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
2978        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2979            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
2980                crate::runtime::config_overlay::coerce_env_value(key, raw)
2981            {
2982                return n;
2983            }
2984        }
2985        let store = self.inner.db.store();
2986        let Some(manager) = store.get_collection("red_config") else {
2987            return default;
2988        };
2989        let mut result = default;
2990        let mut latest_id: u64 = 0;
2991        manager.for_each_entity(|entity| {
2992            if let Some(row) = entity.data.as_row() {
2993                let entry_key = row.get_field("key").and_then(|v| match v {
2994                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2995                    _ => None,
2996                });
2997                if entry_key == Some(key) {
2998                    let id = entity.id.raw();
2999                    if id >= latest_id {
3000                        latest_id = id;
3001                        result = match row.get_field("value") {
3002                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
3003                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
3004                            Some(crate::storage::schema::Value::Text(s)) => {
3005                                s.parse::<u64>().unwrap_or(default)
3006                            }
3007                            _ => default,
3008                        };
3009                    }
3010                }
3011            }
3012            true
3013        });
3014        result
3015    }
3016
3017    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
3018        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3019            if let Ok(n) = raw.parse::<f64>() {
3020                return n;
3021            }
3022        }
3023        let store = self.inner.db.store();
3024        let Some(manager) = store.get_collection("red_config") else {
3025            return default;
3026        };
3027        let mut result = default;
3028        let mut latest_id: u64 = 0;
3029        manager.for_each_entity(|entity| {
3030            if let Some(row) = entity.data.as_row() {
3031                let entry_key = row.get_field("key").and_then(|v| match v {
3032                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3033                    _ => None,
3034                });
3035                if entry_key == Some(key) {
3036                    let id = entity.id.raw();
3037                    if id >= latest_id {
3038                        latest_id = id;
3039                        result = match row.get_field("value") {
3040                            Some(crate::storage::schema::Value::Float(n)) => *n,
3041                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
3042                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
3043                            Some(crate::storage::schema::Value::Text(s)) => {
3044                                s.parse::<f64>().unwrap_or(default)
3045                            }
3046                            _ => default,
3047                        };
3048                    }
3049                }
3050            }
3051            true
3052        });
3053        result
3054    }
3055
3056    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
3057        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3058            return raw.clone();
3059        }
3060        let store = self.inner.db.store();
3061        let Some(manager) = store.get_collection("red_config") else {
3062            return default.to_string();
3063        };
3064        let mut result = default.to_string();
3065        let mut latest_id: u64 = 0;
3066        manager.for_each_entity(|entity| {
3067            if let Some(row) = entity.data.as_row() {
3068                let entry_key = row.get_field("key").and_then(|v| match v {
3069                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3070                    _ => None,
3071                });
3072                if entry_key == Some(key) {
3073                    let id = entity.id.raw();
3074                    if id >= latest_id {
3075                        latest_id = id;
3076                        if let Some(crate::storage::schema::Value::Text(value)) =
3077                            row.get_field("value")
3078                        {
3079                            result = value.to_string();
3080                        }
3081                    }
3082                }
3083            }
3084            true
3085        });
3086        result
3087    }
3088
3089    fn latest_metadata_for(
3090        &self,
3091        collection: &str,
3092        entity_id: u64,
3093    ) -> Option<crate::serde_json::Value> {
3094        self.inner
3095            .db
3096            .store()
3097            .get_metadata(collection, EntityId::new(entity_id))
3098            .map(|metadata| metadata_to_json(&metadata))
3099    }
3100
3101    fn persist_replica_lsn(&self, lsn: u64) {
3102        self.inner.db.store().set_config_tree(
3103            "red.replication",
3104            &crate::json!({
3105                "last_applied_lsn": lsn
3106            }),
3107        );
3108    }
3109
3110    fn persist_replication_health(
3111        &self,
3112        state: &str,
3113        last_error: &str,
3114        primary_lsn: Option<u64>,
3115        oldest_available_lsn: Option<u64>,
3116    ) {
3117        self.inner.db.store().set_config_tree(
3118            "red.replication",
3119            &crate::json!({
3120                "state": state,
3121                "last_error": last_error,
3122                "last_seen_primary_lsn": primary_lsn.unwrap_or(0),
3123                "last_seen_oldest_lsn": oldest_available_lsn.unwrap_or(0),
3124                "updated_at_unix_ms": SystemTime::now()
3125                    .duration_since(UNIX_EPOCH)
3126                    .unwrap_or_default()
3127                    .as_millis() as u64
3128            }),
3129        );
3130    }
3131
3132    /// Whether `SECRET('...')` literals should be encrypted with the
3133    /// vault AES key on INSERT. Default `true`.
3134    pub(crate) fn secret_auto_encrypt(&self) -> bool {
3135        self.config_bool("red.config.secret.auto_encrypt", true)
3136    }
3137
3138    /// Whether `Value::Secret` columns should be decrypted back to
3139    /// plaintext on SELECT when the vault is unsealed. Default `true`.
3140    /// Turning this off keeps secrets masked as `***` even while the
3141    /// vault is open — useful for audit trails or read-only exports.
3142    pub(crate) fn secret_auto_decrypt(&self) -> bool {
3143        self.config_bool("red.config.secret.auto_decrypt", true)
3144    }
3145
3146    /// Walk every record in `result` and swap `Value::Secret(bytes)`
3147    /// for the decrypted plaintext when the runtime has the vault
3148    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
3149    /// key is missing, the vault is sealed, or auto_decrypt is off,
3150    /// secrets are left as `Value::Secret` which every formatter
3151    /// (Display, JSON) already masks as `***`.
3152    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
3153        if !self.secret_auto_decrypt() {
3154            return;
3155        }
3156        let Some(key) = self.secret_aes_key() else {
3157            return;
3158        };
3159        for record in result.result.records.iter_mut() {
3160            for value in record.values_mut() {
3161                if let Value::Secret(ref bytes) = value {
3162                    if let Some(plain) =
3163                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
3164                    {
3165                        if let Ok(text) = String::from_utf8(plain) {
3166                            *value = Value::text(text);
3167                        }
3168                    }
3169                }
3170            }
3171        }
3172    }
3173
3174    /// Emit a CDC change event and replicate to WAL buffer.
3175    /// Create a `MutationEngine` bound to this runtime.
3176    ///
3177    /// The engine is cheap to construct (no allocation) and should be
3178    /// dropped after `apply` returns. Use this from application-layer
3179    /// `create_row` / `create_rows_batch` instead of calling
3180    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
3181    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
3182        crate::runtime::mutation::MutationEngine::new(self)
3183    }
3184
3185    /// Public-mutation gate snapshot (PLAN.md W1).
3186    ///
3187    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
3188    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
3189    /// maintenance, serverless lifecycle) call `check_write` before
3190    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
3191    /// instance running as a replica or with `options.read_only =
3192    /// true`. The replica internal logical-WAL apply path reaches into
3193    /// the store directly and never calls this method, so legitimate
3194    /// replica catch-up still works.
3195    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
3196        self.inner.write_gate.check(kind)
3197    }
3198
3199    /// Read-only handle to the gate, useful for transports that want
3200    /// to surface the policy in health/status output without taking on
3201    /// a dependency on the concrete enum.
3202    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
3203        &self.inner.write_gate
3204    }
3205
3206    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
3207    /// admin/shutdown, and signal handlers consult this single
3208    /// state machine.
3209    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
3210        &self.inner.lifecycle
3211    }
3212
3213    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
3214    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
3215        &self.inner.resource_limits
3216    }
3217
3218    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
3219    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
3220        &self.inner.audit_log
3221    }
3222
3223    /// Shared `Arc` to the audit logger — used by collaborators (the
3224    /// lease lifecycle, future request-context plumbing) that need to
3225    /// keep the logger alive past the runtime's stack frame.
3226    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
3227        Arc::clone(&self.inner.audit_log)
3228    }
3229
3230    /// Slice 10 of issue #527 — shared queue telemetry counters
3231    /// (delivered/acked/nacked). Cloned by `queue_delivery.rs` on
3232    /// each transition.
3233    pub(crate) fn queue_telemetry(
3234        &self,
3235    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
3236        &self.inner.queue_telemetry
3237    }
3238
3239    /// Snapshots of the queue telemetry counters in label-deterministic
3240    /// order for `/metrics` rendering and the integration test.
3241    pub fn queue_telemetry_snapshot(
3242        &self,
3243    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3244        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3245            delivered: self.inner.queue_telemetry.delivered_snapshot(),
3246            acked: self.inner.queue_telemetry.acked_snapshot(),
3247            nacked: self.inner.queue_telemetry.nacked_snapshot(),
3248        }
3249    }
3250
3251    /// Slice 10 of issue #527 — render-time scan of pending entries
3252    /// per (queue, group) for the `queue_pending_gauge` exposition.
3253    /// Walks `red_queue_meta` live so the gauge cannot drift from
3254    /// the source of truth.
3255    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
3256        let store = self.inner.db.store();
3257        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
3258            .into_iter()
3259            .collect()
3260    }
3261
3262    /// Shared `Arc` to the write gate. Same rationale as
3263    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
3264    /// thread) need a clone-cheap handle they can move into a
3265    /// background thread.
3266    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
3267        Arc::clone(&self.inner.write_gate)
3268    }
3269
3270    /// Serverless writer-lease state machine. `None` when the operator
3271    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
3272    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3273        self.inner.lease_lifecycle.get()
3274    }
3275
3276    /// Install the lease lifecycle. Idempotent; subsequent calls
3277    /// return the previously stored value untouched.
3278    pub fn set_lease_lifecycle(
3279        &self,
3280        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
3281    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3282        self.inner.lease_lifecycle.set(lifecycle)
3283    }
3284
3285    /// Reject the call when the requested batch size exceeds
3286    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
3287    /// shaped so the HTTP layer can map it to 413 Payload Too
3288    /// Large (PLAN.md Phase 4.1).
3289    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
3290        if self.inner.resource_limits.batch_size_exceeded(requested) {
3291            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
3292            return Err(RedDBError::QuotaExceeded(format!(
3293                "max_batch_size:{requested}:{max}"
3294            )));
3295        }
3296        Ok(())
3297    }
3298
3299    /// Reject the call when the local DB file exceeds
3300    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
3301    /// the cost is a single `stat()` syscall, negligible against the
3302    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
3303    /// for HTTP 507 Insufficient Storage.
3304    pub fn check_db_size(&self) -> RedDBResult<()> {
3305        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
3306            return Ok(());
3307        };
3308        if limit == 0 {
3309            return Ok(());
3310        }
3311        let Some(path) = self.inner.db.path() else {
3312            return Ok(());
3313        };
3314        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
3315        if current > limit {
3316            return Err(RedDBError::QuotaExceeded(format!(
3317                "max_db_size_bytes:{current}:{limit}"
3318            )));
3319        }
3320        Ok(())
3321    }
3322
    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
    ///
    /// Steps, in order, all idempotent across re-entrant calls:
    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
    ///      observe `Stopped` after first finishes).
    ///   2. Flush WAL + run final checkpoint locally via
    ///      `db.flush_local_only()`. If that succeeds and
    ///      `assert_remote_write_allowed` permits it, upload to the
    ///      remote backend via `db.upload_to_remote_backend()`.
    ///   3. If `backup_on_shutdown == true` and a remote backend is
    ///      configured, run a synchronous `trigger_backup()` so the
    ///      remote head reflects the final state.
    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
    ///      return the cached report without re-running anything.
    ///
    /// On any error, the runtime is still marked `Stopped` so the
    /// process can exit; the caller logs the error context but does
    /// not retry the same shutdown — the operator can inspect the
    /// report fields to see which step failed.
    ///
    /// Step failures are logged and reflected in the report flags
    /// (`flushed_wal`, `final_checkpoint`, `backup_uploaded`); every
    /// return path visible in this function yields `Ok`.
    pub fn graceful_shutdown(
        &self,
        backup_on_shutdown: bool,
    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
        if !self.inner.lifecycle.begin_shutdown() {
            // Someone else already shut down (or is in flight). Return
            // the cached report so the HTTP caller and SIGTERM handler
            // get the same idempotent answer. Falls back to a default
            // report when none has been stored yet.
            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
        }

        // Wall-clock start in ms since the Unix epoch; 0 if the clock
        // reads before the epoch.
        let started_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_millis() as u64)
            .unwrap_or(0);
        let mut report = crate::runtime::lifecycle::ShutdownReport {
            started_at_ms: started_ms,
            ..Default::default()
        };

        // Flush WAL + run any pending checkpoint. Local fsync is
        // unconditional — even a lease-lost replica needs its WAL on
        // disk before exit so a future restore has the latest tail.
        // The remote upload is gated separately so a lost-lease writer
        // doesn't clobber the new holder's state on its way out.
        let flush_res = self.inner.db.flush_local_only();
        // Both flags are derived from the same local flush result.
        report.flushed_wal = flush_res.is_ok();
        report.final_checkpoint = flush_res.is_ok();
        if let Err(err) = &flush_res {
            tracing::error!(
                target: "reddb::lifecycle",
                error = %err,
                "graceful_shutdown: local flush failed"
            );
        } else if let Err(lease_err) =
            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
        {
            // Local flush succeeded but the remote-write gate refused:
            // skip the upload and only warn.
            tracing::warn!(
                target: "reddb::serverless::lease",
                error = %lease_err,
                "graceful_shutdown: remote upload skipped — lease not held"
            );
        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
            tracing::error!(
                target: "reddb::lifecycle",
                error = %err,
                "graceful_shutdown: remote upload failed"
            );
        }

        // Optional final backup. Skipped silently when no remote
        // backend is configured — `trigger_backup()` returns Err
        // anyway in that case, but logging it as a shutdown failure
        // would be misleading on a standalone (no-backend) runtime.
        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
            // The trigger_backup gate now reads `WriteKind::Backup`,
            // which a replica/read_only instance refuses. That's
            // intentional — replicas don't drive backups; only the
            // primary does. We still want shutdown to flush its WAL
            // even if the backup branch is gated off.
            match self.trigger_backup() {
                Ok(result) => {
                    report.backup_uploaded = result.uploaded;
                }
                Err(err) => {
                    tracing::warn!(
                        target: "reddb::lifecycle",
                        error = %err,
                        "graceful_shutdown: final backup skipped"
                    );
                }
            }
        }

        // Completion stamp; falls back to the start time on a clock
        // error so the duration below is 0 rather than garbage.
        let completed_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_millis() as u64)
            .unwrap_or(started_ms);
        report.completed_at_ms = completed_ms;
        // saturating_sub guards against the wall clock stepping backwards.
        report.duration_ms = completed_ms.saturating_sub(started_ms);

        // Hand a copy of the report to the lifecycle and return the
        // original to the caller.
        self.inner.lifecycle.finish_shutdown(report.clone());
        Ok(report)
    }
3424
3425    /// Emit a CDC record without invalidating the result cache.
3426    ///
3427    /// Used by `MutationEngine::append_batch` which calls
3428    /// `invalidate_result_cache` once for the whole batch before this
3429    /// loop, avoiding N write-lock acquisitions.
3430    pub(crate) fn cdc_emit_no_cache_invalidate(
3431        &self,
3432        operation: crate::replication::cdc::ChangeOperation,
3433        collection: &str,
3434        entity_id: u64,
3435        entity_kind: &str,
3436    ) -> u64 {
3437        let lsn = self
3438            .inner
3439            .cdc
3440            .emit(operation, collection, entity_id, entity_kind);
3441
3442        // Append to logical WAL replication buffer (if primary mode)
3443        if let Some(ref primary) = self.inner.db.replication {
3444            let store = self.inner.db.store();
3445            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3446                None
3447            } else {
3448                store.get(collection, EntityId::new(entity_id))
3449            };
3450            let record = ChangeRecord {
3451                lsn,
3452                timestamp: SystemTime::now()
3453                    .duration_since(UNIX_EPOCH)
3454                    .unwrap_or_default()
3455                    .as_millis() as u64,
3456                operation,
3457                collection: collection.to_string(),
3458                entity_id,
3459                entity_kind: entity_kind.to_string(),
3460                entity_bytes: entity
3461                    .as_ref()
3462                    .map(|e| UnifiedStore::serialize_entity(e, store.format_version())),
3463                metadata: self.latest_metadata_for(collection, entity_id),
3464                refresh_records: None,
3465            };
3466            let encoded = record.encode();
3467            primary.wal_buffer.append(record.lsn, encoded.clone());
3468            if let Some(spool) = &primary.logical_wal_spool {
3469                let _ = spool.append(record.lsn, &encoded);
3470            }
3471        }
3472        lsn
3473    }
3474
3475    pub(crate) fn cdc_emit_insert_batch_no_cache_invalidate(
3476        &self,
3477        collection: &str,
3478        ids: &[EntityId],
3479        entity_kind: &str,
3480    ) -> Vec<u64> {
3481        if ids.is_empty() {
3482            return Vec::new();
3483        }
3484
3485        // Without logical replication, CDC only needs the in-memory event
3486        // ring. Reserve all LSNs and push the batch under one mutex instead
3487        // of taking the ring lock once per inserted row.
3488        if self.inner.db.replication.is_none() {
3489            return self.inner.cdc.emit_batch_same_collection(
3490                crate::replication::cdc::ChangeOperation::Insert,
3491                collection,
3492                entity_kind,
3493                ids.iter().map(|id| id.raw()),
3494            );
3495        }
3496
3497        // Replication needs one logical-WAL record per entity with the
3498        // serialized entity bytes, so keep the existing per-row path.
3499        ids.iter()
3500            .map(|id| {
3501                self.cdc_emit_no_cache_invalidate(
3502                    crate::replication::cdc::ChangeOperation::Insert,
3503                    collection,
3504                    id.raw(),
3505                    entity_kind,
3506                )
3507            })
3508            .collect()
3509    }
3510
3511    pub fn cdc_emit(
3512        &self,
3513        operation: crate::replication::cdc::ChangeOperation,
3514        collection: &str,
3515        entity_id: u64,
3516        entity_kind: &str,
3517    ) -> u64 {
3518        let lsn = self
3519            .inner
3520            .cdc
3521            .emit(operation, collection, entity_id, entity_kind);
3522        // Perf: prior to this we called `invalidate_result_cache()`
3523        // which wipes EVERY cached query, across every table, under
3524        // a write lock — turning each INSERT into a serialisation
3525        // point for all readers. Swap to the per-table variant so
3526        // unrelated query caches survive.
3527        self.invalidate_result_cache_for_table(collection);
3528
3529        // Append to logical WAL replication buffer (if primary mode)
3530        if let Some(ref primary) = self.inner.db.replication {
3531            let store = self.inner.db.store();
3532            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3533                None
3534            } else {
3535                store.get(collection, EntityId::new(entity_id))
3536            };
3537            let record = ChangeRecord {
3538                lsn,
3539                timestamp: SystemTime::now()
3540                    .duration_since(UNIX_EPOCH)
3541                    .unwrap_or_default()
3542                    .as_millis() as u64,
3543                operation,
3544                collection: collection.to_string(),
3545                entity_id,
3546                entity_kind: entity_kind.to_string(),
3547                entity_bytes: entity
3548                    .as_ref()
3549                    .map(|entity| UnifiedStore::serialize_entity(entity, store.format_version())),
3550                metadata: self.latest_metadata_for(collection, entity_id),
3551                refresh_records: None,
3552            };
3553            let encoded = record.encode();
3554            primary.wal_buffer.append(record.lsn, encoded.clone());
3555            if let Some(spool) = &primary.logical_wal_spool {
3556                let _ = spool.append(record.lsn, &encoded);
3557            }
3558        }
3559        lsn
3560    }
3561
3562    pub(crate) fn cdc_emit_kv(
3563        &self,
3564        operation: crate::replication::cdc::ChangeOperation,
3565        collection: &str,
3566        key: &str,
3567        entity_id: u64,
3568        before: Option<crate::json::Value>,
3569        after: Option<crate::json::Value>,
3570    ) -> u64 {
3571        let lsn = self
3572            .inner
3573            .cdc
3574            .emit_kv(operation, collection, key, entity_id, before, after);
3575        self.inner.kv_stats.incr_watch_events_emitted();
3576        self.invalidate_result_cache_for_table(collection);
3577        lsn
3578    }
3579
3580    pub(crate) fn record_kv_watch_event(
3581        &self,
3582        operation: crate::replication::cdc::ChangeOperation,
3583        collection: &str,
3584        key: &str,
3585        entity_id: u64,
3586        before: Option<crate::json::Value>,
3587        after: Option<crate::json::Value>,
3588    ) {
3589        if self.current_xid().is_some() {
3590            let conn_id = current_connection_id();
3591            let event = crate::replication::cdc::KvWatchEvent {
3592                collection: collection.to_string(),
3593                key: key.to_string(),
3594                op: operation,
3595                before,
3596                after,
3597                lsn: 0,
3598                committed_at: 0,
3599                dropped_event_count: 0,
3600            };
3601            self.inner
3602                .pending_kv_watch_events
3603                .write()
3604                .entry(conn_id)
3605                .or_default()
3606                .push(event);
3607            return;
3608        }
3609
3610        self.cdc_emit_kv(operation, collection, key, entity_id, before, after);
3611    }
3612
3613    pub(crate) fn cdc_emit_prebuilt(
3614        &self,
3615        operation: crate::replication::cdc::ChangeOperation,
3616        collection: &str,
3617        entity: &UnifiedEntity,
3618        entity_kind: &str,
3619        metadata: Option<&crate::storage::Metadata>,
3620        invalidate_cache: bool,
3621    ) -> u64 {
3622        self.cdc_emit_prebuilt_with_columns(
3623            operation,
3624            collection,
3625            entity,
3626            entity_kind,
3627            metadata,
3628            invalidate_cache,
3629            None,
3630        )
3631    }
3632
3633    /// `cdc_emit_prebuilt` plus the list of column names whose values
3634    /// changed on this update. Callers that have already computed a
3635    /// `RowDamageVector` pass it here so downstream CDC consumers can
3636    /// filter events by touched column without re-diffing.
3637    /// `changed_columns` is only meaningful for `Update` operations —
3638    /// insert and delete events ignore it.
3639    pub(crate) fn cdc_emit_prebuilt_with_columns(
3640        &self,
3641        operation: crate::replication::cdc::ChangeOperation,
3642        collection: &str,
3643        entity: &UnifiedEntity,
3644        entity_kind: &str,
3645        metadata: Option<&crate::storage::Metadata>,
3646        invalidate_cache: bool,
3647        changed_columns: Option<Vec<String>>,
3648    ) -> u64 {
3649        if invalidate_cache {
3650            self.invalidate_result_cache();
3651        }
3652
3653        let public_id = entity.logical_id().raw();
3654        let lsn = self.inner.cdc.emit_with_columns(
3655            operation,
3656            collection,
3657            public_id,
3658            entity_kind,
3659            changed_columns,
3660        );
3661
3662        if let Some(ref primary) = self.inner.db.replication {
3663            let store = self.inner.db.store();
3664            let record = ChangeRecord {
3665                lsn,
3666                timestamp: SystemTime::now()
3667                    .duration_since(UNIX_EPOCH)
3668                    .unwrap_or_default()
3669                    .as_millis() as u64,
3670                operation,
3671                collection: collection.to_string(),
3672                entity_id: entity.id.raw(),
3673                entity_kind: entity_kind.to_string(),
3674                entity_bytes: Some(UnifiedStore::serialize_entity(
3675                    entity,
3676                    store.format_version(),
3677                )),
3678                metadata: metadata
3679                    .map(metadata_to_json)
3680                    .or_else(|| self.latest_metadata_for(collection, entity.id.raw())),
3681                refresh_records: None,
3682            };
3683            let encoded = record.encode();
3684            primary.wal_buffer.append(record.lsn, encoded.clone());
3685            if let Some(spool) = &primary.logical_wal_spool {
3686                let _ = spool.append(record.lsn, &encoded);
3687            }
3688        }
3689
3690        lsn
3691    }
3692
3693    pub(crate) fn cdc_emit_prebuilt_batch<'a, I>(
3694        &self,
3695        operation: crate::replication::cdc::ChangeOperation,
3696        entity_kind: &str,
3697        items: I,
3698        invalidate_cache: bool,
3699    ) where
3700        I: IntoIterator<
3701            Item = (
3702                &'a str,
3703                &'a UnifiedEntity,
3704                Option<&'a crate::storage::Metadata>,
3705            ),
3706        >,
3707    {
3708        let items: Vec<(&str, &UnifiedEntity, Option<&crate::storage::Metadata>)> =
3709            items.into_iter().collect();
3710        if items.is_empty() {
3711            return;
3712        }
3713
3714        if invalidate_cache {
3715            self.invalidate_result_cache();
3716        }
3717
3718        for (collection, entity, metadata) in items {
3719            self.cdc_emit_prebuilt(operation, collection, entity, entity_kind, metadata, false);
3720        }
3721    }
3722
3723    fn run_replica_loop(&self, primary_addr: String) {
3724        let endpoint = if primary_addr.starts_with("http") {
3725            primary_addr
3726        } else {
3727            format!("http://{primary_addr}")
3728        };
3729        let poll_ms = self.inner.db.options().replication.poll_interval_ms;
3730        let max_count = self.inner.db.options().replication.max_batch_size;
3731        let mut since_lsn = self.config_u64("red.replication.last_applied_lsn", 0);
3732
3733        let runtime = match tokio::runtime::Builder::new_current_thread()
3734            .enable_all()
3735            .build()
3736        {
3737            Ok(runtime) => runtime,
3738            Err(_) => return,
3739        };
3740
3741        runtime.block_on(async move {
3742            use crate::grpc::proto::red_db_client::RedDbClient;
3743            use crate::grpc::proto::JsonPayloadRequest;
3744
3745            let mut client = loop {
3746                match RedDbClient::connect(endpoint.clone()).await {
3747                    Ok(client) => {
3748                        self.persist_replication_health("connecting", "", None, None);
3749                        break client;
3750                    }
3751                    Err(_) => {
3752                        self.persist_replication_health(
3753                            "connecting",
3754                            "waiting for primary connection",
3755                            None,
3756                            None,
3757                        );
3758                        std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)))
3759                    }
3760                }
3761            };
3762
3763            // PLAN.md Phase 11.5 — stateful applier guards LSN
3764            // monotonicity across pulls. Seed with the persisted
3765            // `last_applied_lsn` so reboots don't lose the chain
3766            // pointer.
3767            let applier = crate::replication::logical::LogicalChangeApplier::new(since_lsn);
3768
3769            loop {
3770                let payload = crate::json!({
3771                    "since_lsn": since_lsn,
3772                    "max_count": max_count
3773                });
3774                let request = tonic::Request::new(JsonPayloadRequest {
3775                    payload_json: crate::json::to_string(&payload)
3776                        .unwrap_or_else(|_| "{}".to_string()),
3777                });
3778
3779                if let Ok(response) = client.pull_wal_records(request).await {
3780                    if let Ok(value) =
3781                        crate::json::from_str::<crate::json::Value>(&response.into_inner().payload)
3782                    {
3783                        let current_lsn =
3784                            value.get("current_lsn").and_then(crate::json::Value::as_u64);
3785                        let oldest_available_lsn = value
3786                            .get("oldest_available_lsn")
3787                            .and_then(crate::json::Value::as_u64);
3788                        if since_lsn > 0
3789                            && oldest_available_lsn
3790                                .map(|oldest| oldest > since_lsn.saturating_add(1))
3791                                .unwrap_or(false)
3792                        {
3793                            self.persist_replication_health(
3794                                "stalled_gap",
3795                                "replica is behind the oldest logical WAL available on primary; re-bootstrap required",
3796                                current_lsn,
3797                                oldest_available_lsn,
3798                            );
3799                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
3800                            continue;
3801                        }
3802                        if let Some(records) =
3803                            value.get("records").and_then(crate::json::Value::as_array)
3804                        {
3805                            for record in records {
3806                                let Some(data_hex) =
3807                                    record.get("data").and_then(crate::json::Value::as_str)
3808                                else {
3809                                    continue;
3810                                };
3811                                let Ok(data) = hex::decode(data_hex) else {
3812                                    self.inner.replica_apply_metrics.record(
3813                                        crate::replication::logical::ApplyErrorKind::Decode,
3814                                    );
3815                                    self.persist_replication_health(
3816                                        "apply_error",
3817                                        "failed to decode WAL record hex payload",
3818                                        current_lsn,
3819                                        oldest_available_lsn,
3820                                    );
3821                                    continue;
3822                                };
3823                                let Ok(change) = ChangeRecord::decode(&data) else {
3824                                    self.inner.replica_apply_metrics.record(
3825                                        crate::replication::logical::ApplyErrorKind::Decode,
3826                                    );
3827                                    self.persist_replication_health(
3828                                        "apply_error",
3829                                        "failed to decode logical WAL record",
3830                                        current_lsn,
3831                                        oldest_available_lsn,
3832                                    );
3833                                    continue;
3834                                };
3835                                match applier.apply(
3836                                    self.inner.db.as_ref(),
3837                                    &change,
3838                                    ApplyMode::Replica,
3839                                ) {
3840                                    Ok(crate::replication::logical::ApplyOutcome::Applied) => {
3841                                        self.invalidate_result_cache_for_table(&change.collection);
3842                                        since_lsn = since_lsn.max(change.lsn);
3843                                        self.persist_replica_lsn(since_lsn);
3844                                    }
3845                                    Ok(_) => {
3846                                        // Idempotent / Skipped: no advance, no error.
3847                                    }
3848                                    Err(err) => {
3849                                        self.inner.replica_apply_metrics.record(err.kind());
3850                                        // Issue #205 — emit operator-grade event
3851                                        // for the two replication-fatal kinds. `Gap`
3852                                        // / `Apply` / `Decode` already persist via
3853                                        // `persist_replication_health`; the
3854                                        // OperatorEvent variants only cover the
3855                                        // two "stream is broken" / "follower
3856                                        // diverged" conditions an operator must act
3857                                        // on out-of-band.
3858                                        match &err {
3859                                            crate::replication::logical::LogicalApplyError::Divergence { lsn, expected: _, got: _ } => {
3860                                                crate::telemetry::operator_event::OperatorEvent::Divergence {
3861                                                    peer: "primary".to_string(),
3862                                                    leader_lsn: *lsn,
3863                                                    follower_lsn: since_lsn,
3864                                                }
3865                                                .emit_global();
3866                                            }
3867                                            crate::replication::logical::LogicalApplyError::Gap { last, next } => {
3868                                                crate::telemetry::operator_event::OperatorEvent::ReplicationBroken {
3869                                                    peer: "primary".to_string(),
3870                                                    reason: format!("stalled gap last={last} next={next}"),
3871                                                }
3872                                                .emit_global();
3873                                            }
3874                                            _ => {}
3875                                        }
3876                                        let kind = match &err {
3877                                            crate::replication::logical::LogicalApplyError::Gap { .. } => "stalled_gap",
3878                                            crate::replication::logical::LogicalApplyError::Divergence { .. } => "divergence",
3879                                            _ => "apply_error",
3880                                        };
3881                                        self.persist_replication_health(
3882                                            kind,
3883                                            &format!("replica apply rejected: {err}"),
3884                                            current_lsn,
3885                                            oldest_available_lsn,
3886                                        );
3887                                        // Stop applying this batch. The
3888                                        // outer loop will retry on next
3889                                        // pull, which on a real Gap will
3890                                        // not magically heal — operator
3891                                        // must rebootstrap. For
3892                                        // Divergence, we explicitly do
3893                                        // not advance; this keeps the
3894                                        // replica visibly unhealthy
3895                                        // instead of silently swallowing
3896                                        // corruption.
3897                                        break;
3898                                    }
3899                                }
3900                            }
3901                        }
3902                        self.persist_replication_health(
3903                            "healthy",
3904                            "",
3905                            current_lsn,
3906                            oldest_available_lsn,
3907                        );
3908                    } else {
3909                        self.persist_replication_health(
3910                            "apply_error",
3911                            "failed to parse pull_wal_records response",
3912                            None,
3913                            None,
3914                        );
3915                    }
3916                } else {
3917                    self.persist_replication_health(
3918                        "connecting",
3919                        "primary pull_wal_records request failed",
3920                        None,
3921                        None,
3922                    );
3923                }
3924
3925                std::thread::sleep(std::time::Duration::from_millis(poll_ms));
3926            }
3927        });
3928    }
3929
3930    /// Poll CDC events since a given LSN.
3931    pub fn cdc_poll(
3932        &self,
3933        since_lsn: u64,
3934        max_count: usize,
3935    ) -> Vec<crate::replication::cdc::ChangeEvent> {
3936        self.inner.cdc.poll(since_lsn, max_count)
3937    }
3938
3939    /// PLAN.md Phase 11.4 — current CDC LSN. Public mutation
3940    /// surfaces (HTTP query, gRPC entity ops) call this immediately
3941    /// after a successful write to feed `enforce_commit_policy`.
3942    pub fn cdc_current_lsn(&self) -> u64 {
3943        self.inner.cdc.current_lsn()
3944    }
3945
3946    pub fn kv_watch_events_since(
3947        &self,
3948        collection: &str,
3949        key: &str,
3950        since_lsn: u64,
3951        max_count: usize,
3952    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3953        self.inner
3954            .cdc
3955            .poll(since_lsn, max_count)
3956            .into_iter()
3957            .filter_map(|event| event.kv)
3958            .filter(|event| event.collection == collection && event.key == key)
3959            .collect()
3960    }
3961
3962    pub fn kv_watch_events_since_prefix(
3963        &self,
3964        collection: &str,
3965        prefix: &str,
3966        since_lsn: u64,
3967        max_count: usize,
3968    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3969        self.inner
3970            .cdc
3971            .poll(since_lsn, max_count)
3972            .into_iter()
3973            .filter_map(|event| event.kv)
3974            .filter(|event| event.collection == collection && event.key.starts_with(prefix))
3975            .collect()
3976    }
3977
3978    pub(crate) fn kv_watch_subscribe<'a>(
3979        &'a self,
3980        collection: impl Into<String>,
3981        key: impl Into<String>,
3982        from_lsn: Option<u64>,
3983    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
3984        crate::runtime::kv_watch::KvWatchStream::subscribe(
3985            &self.inner.cdc,
3986            &self.inner.kv_stats,
3987            collection,
3988            key,
3989            from_lsn,
3990            self.kv_watch_idle_timeout_ms(),
3991        )
3992    }
3993
3994    pub(crate) fn kv_watch_subscribe_prefix<'a>(
3995        &'a self,
3996        collection: impl Into<String>,
3997        prefix: impl Into<String>,
3998        from_lsn: Option<u64>,
3999    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
4000        crate::runtime::kv_watch::KvWatchStream::subscribe_prefix(
4001            &self.inner.cdc,
4002            &self.inner.kv_stats,
4003            collection,
4004            prefix,
4005            from_lsn,
4006            self.kv_watch_idle_timeout_ms(),
4007        )
4008    }
4009
4010    pub(crate) fn kv_watch_idle_timeout_ms(&self) -> u64 {
4011        self.config_u64("red.config.kv.watch.idle_timeout_ms", 60_000)
4012    }
4013
4014    /// Get backup scheduler status.
4015    pub fn backup_status(&self) -> crate::replication::scheduler::BackupStatus {
4016        self.inner.backup_scheduler.status()
4017    }
4018
4019    /// Borrow the runtime's result Blob Cache.
4020    ///
4021    /// Wired for the `/admin/blob_cache/sweep` and
4022    /// `/admin/blob_cache/flush_namespace` HTTP handlers (issue #148
4023    /// follow-up): both delegate to
4024    /// `crate::storage::cache::sweeper::BlobCacheSweeper`, which takes a
4025    /// `&BlobCache`. Also used by `trigger_backup` when
4026    /// `red.config.backup.include_blob_cache=true` to locate the L2
4027    /// directory for archival.
4028    pub fn result_blob_cache(&self) -> &crate::storage::cache::BlobCache {
4029        &self.inner.result_blob_cache
4030    }
4031
4032    /// PLAN.md Phase 11.4 — owned snapshot of every registered
4033    /// replica's state on this primary. Returns empty vec on
4034    /// non-primary instances or when no replicas are registered yet.
4035    pub fn primary_replica_snapshots(&self) -> Vec<crate::replication::primary::ReplicaState> {
4036        self.inner
4037            .db
4038            .replication
4039            .as_ref()
4040            .map(|repl| repl.replica_snapshots())
4041            .unwrap_or_default()
4042    }
4043
4044    /// PLAN.md Phase 11.4 — active commit policy. Reads
4045    /// `RED_PRIMARY_COMMIT_POLICY` once at runtime construction;
4046    /// future env reloads will need a reload endpoint. Default is
4047    /// `Local` — current behavior, no replica blocking.
4048    pub fn commit_policy(&self) -> crate::replication::CommitPolicy {
4049        crate::replication::CommitPolicy::from_env()
4050    }
4051
4052    /// PLAN.md Phase 11.5 — accessor for replica-side apply error
4053    /// counters (gap / divergence / apply / decode). Returned
4054    /// snapshot is consistent across the four counters; the labels
4055    /// match `reddb_replica_apply_errors_total{kind}`.
4056    pub fn replica_apply_error_counts(
4057        &self,
4058    ) -> [(crate::replication::logical::ApplyErrorKind, u64); 4] {
4059        self.inner.replica_apply_metrics.snapshot()
4060    }
4061
4062    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
4063    /// returned; `is_configured()` lets callers short-circuit.
4064    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
4065        &self.inner.quota_bucket
4066    }
4067
4068    /// PLAN.md Phase 11.4 — observability snapshot of every
4069    /// replica's durable LSN as known to the commit waiter. Empty
4070    /// vec on non-primary instances or when no replica has acked.
4071    pub fn commit_waiter_snapshot(&self) -> Vec<(String, u64)> {
4072        self.inner
4073            .db
4074            .replication
4075            .as_ref()
4076            .map(|repl| repl.commit_waiter.snapshot())
4077            .unwrap_or_default()
4078    }
4079
4080    /// PLAN.md Phase 11.4 — `(reached, timed_out, not_required, last_micros)`
4081    /// counters for /metrics. Always-zero on non-primary instances.
4082    pub fn commit_waiter_metrics_snapshot(&self) -> (u64, u64, u64, u64) {
4083        self.inner
4084            .db
4085            .replication
4086            .as_ref()
4087            .map(|repl| repl.commit_waiter.metrics_snapshot())
4088            .unwrap_or((0, 0, 0, 0))
4089    }
4090
4091    /// PLAN.md Phase 11.4 — block until at least `count` replicas
4092    /// have durably applied through `target_lsn`, or `timeout`
4093    /// elapses. Returns the `AwaitOutcome` so the caller can decide
4094    /// whether to surface a timeout error to the client or continue
4095    /// (the policy mapping lives in the commit dispatcher).
4096    ///
4097    /// Foundation only — the write commit path doesn't yet call
4098    /// this. Wiring it is a per-surface task gated on the operator
4099    /// flipping `RED_PRIMARY_COMMIT_POLICY` away from `local`.
4100    pub fn await_replica_acks(
4101        &self,
4102        target_lsn: u64,
4103        count: u32,
4104        timeout: std::time::Duration,
4105    ) -> crate::replication::AwaitOutcome {
4106        match &self.inner.db.replication {
4107            Some(repl) => repl.commit_waiter.await_acks(target_lsn, count, timeout),
4108            None => {
4109                // No replication configured: policy must be `Local`.
4110                // Treat as immediate `NotRequired` so callers don't
4111                // block on a degenerate setup.
4112                crate::replication::AwaitOutcome::NotRequired
4113            }
4114        }
4115    }
4116
4117    /// PLAN.md Phase 11.4 — enforce the configured commit policy
4118    /// against `post_lsn` (the LSN of the just-completed write).
4119    /// Returns `Ok(AwaitOutcome)` on every successful enforcement
4120    /// (including `Reached` and `TimedOut` when fail-on-timeout is
4121    /// off). Returns `Err(ReadOnly)` only when:
4122    ///   * policy is `AckN(n)` with `n > 0`
4123    ///   * the wait timed out
4124    ///   * `RED_COMMIT_FAIL_ON_TIMEOUT=true` is set
4125    ///
4126    /// The HTTP / gRPC / wire surfaces map the error to 504 / wire
4127    /// backoff. Default behaviour (env unset) logs warn and returns
4128    /// success — matches PLAN.md "default v1 stays local" semantics
4129    /// while still letting the operator opt into hard-blocking.
4130    pub fn enforce_commit_policy(
4131        &self,
4132        post_lsn: u64,
4133    ) -> RedDBResult<crate::replication::AwaitOutcome> {
4134        let n = match self.commit_policy() {
4135            crate::replication::CommitPolicy::AckN(n) if n > 0 => n,
4136            _ => return Ok(crate::replication::AwaitOutcome::NotRequired),
4137        };
4138        let timeout_ms = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
4139            .ok()
4140            .and_then(|v| v.parse::<u64>().ok())
4141            .unwrap_or(5_000);
4142        let outcome =
4143            self.await_replica_acks(post_lsn, n, std::time::Duration::from_millis(timeout_ms));
4144        if let crate::replication::AwaitOutcome::TimedOut { observed, required } = &outcome {
4145            tracing::warn!(
4146                target: "reddb::commit",
4147                post_lsn,
4148                observed = *observed,
4149                required = *required,
4150                timeout_ms,
4151                "ack_n: timed out waiting for replicas"
4152            );
4153            let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
4154                .ok()
4155                .map(|v| {
4156                    let t = v.trim();
4157                    t.eq_ignore_ascii_case("true") || t == "1" || t.eq_ignore_ascii_case("yes")
4158                })
4159                .unwrap_or(false);
4160            if fail {
4161                return Err(RedDBError::ReadOnly(format!(
4162                    "commit policy timed out at lsn {post_lsn}: observed={observed} required={required} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
4163                )));
4164            }
4165        }
4166        Ok(outcome)
4167    }
4168
4169    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
4170    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
4171    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
4172    /// when the operator set the env but it doesn't parse, and
4173    /// `("disabled", None)` when no key is configured. The pager
4174    /// hookup is deferred — this accessor surfaces the operator's
4175    /// intent for /admin/status without yet using the key in writes.
4176    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
4177        match crate::crypto::page_encryption::key_from_env() {
4178            Ok(Some(_)) => ("enabled", None),
4179            Ok(None) => ("disabled", None),
4180            Err(err) => ("error", Some(err)),
4181        }
4182    }
4183
4184    /// PLAN.md Phase 11.5 — current replica apply health label
4185    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
4186    /// `stalled_gap`). Read from the persisted `red.replication.state`
4187    /// config key updated by the replica loop. Returns `None` on
4188    /// non-replica instances or when no apply has run yet.
4189    pub fn replica_apply_health(&self) -> Option<String> {
4190        let state = self.config_string("red.replication.state", "");
4191        if state.is_empty() {
4192            None
4193        } else {
4194            Some(state)
4195        }
4196    }
4197
4198    /// Current local LSN paired with the LSN of the most recently
4199    /// archived WAL segment. The difference is the replication /
4200    /// archive lag operators alert on (PLAN.md Phase 5.1). Returns
4201    /// `(0, 0)` when neither replication nor archiving is configured.
4202    pub fn wal_archive_progress(&self) -> (u64, u64) {
4203        let current_lsn = self
4204            .inner
4205            .db
4206            .replication
4207            .as_ref()
4208            .map(|repl| {
4209                repl.logical_wal_spool
4210                    .as_ref()
4211                    .map(|spool| spool.current_lsn())
4212                    .unwrap_or_else(|| repl.wal_buffer.current_lsn())
4213            })
4214            .unwrap_or_else(|| self.inner.cdc.current_lsn());
4215        let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
4216        (current_lsn, last_archived_lsn)
4217    }
4218
4219    /// Trigger an immediate backup.
4220    pub fn trigger_backup(&self) -> RedDBResult<crate::replication::scheduler::BackupResult> {
4221        self.check_write(crate::runtime::write_gate::WriteKind::Backup)?;
4222        // Defense in depth — check_write above already rejects when
4223        // the lease is NotHeld, but log + audit the lease angle here
4224        // explicitly so dashboards distinguish "lease lost" from a
4225        // generic read-only refusal.
4226        self.assert_remote_write_allowed("admin/backup")?;
4227        let started = std::time::Instant::now();
4228        let snapshot = self.create_snapshot()?;
4229        let mut uploaded = false;
4230
4231        if let (Some(backend), Some(path)) = (&self.inner.db.remote_backend, self.inner.db.path()) {
4232            let default_snapshot_prefix = self.inner.db.options().default_snapshot_prefix();
4233            let default_wal_prefix = self.inner.db.options().default_wal_archive_prefix();
4234            let default_head_key = self.inner.db.options().default_backup_head_key();
4235            let snapshot_prefix = self.config_string(
4236                "red.config.backup.snapshot_prefix",
4237                &default_snapshot_prefix,
4238            );
4239            let wal_prefix =
4240                self.config_string("red.config.wal.archive.prefix", &default_wal_prefix);
4241            let head_key = self.config_string("red.config.backup.head_key", &default_head_key);
4242            let timeline_id = self.config_string("red.config.timeline.id", "main");
4243            let snapshot_key = crate::storage::wal::archive_snapshot(
4244                backend.as_ref(),
4245                path,
4246                snapshot.snapshot_id,
4247                &snapshot_prefix,
4248            )
4249            .map_err(|err| RedDBError::Internal(err.to_string()))?;
4250            let current_lsn = self
4251                .inner
4252                .db
4253                .replication
4254                .as_ref()
4255                .map(|repl| {
4256                    repl.logical_wal_spool
4257                        .as_ref()
4258                        .map(|spool| spool.current_lsn())
4259                        .unwrap_or_else(|| repl.wal_buffer.current_lsn())
4260                })
4261                .unwrap_or_else(|| self.inner.cdc.current_lsn());
4262            let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
4263            // Hash the local snapshot bytes so the manifest can carry
4264            // the digest for restore-side verification (PLAN.md
4265            // Phase 4). Failure to hash is non-fatal — we still
4266            // publish the manifest, just without a checksum, so a
4267            // future fix can backfill rather than losing the backup.
4268            let snapshot_sha256 =
4269                crate::storage::wal::SnapshotManifest::compute_snapshot_sha256(path)
4270                    .map_err(|err| {
4271                        tracing::warn!(
4272                            target: "reddb::backup",
4273                            error = %err,
4274                            snapshot_id = snapshot.snapshot_id,
4275                            "snapshot hash failed; manifest will lack checksum"
4276                        );
4277                    })
4278                    .ok();
4279            let manifest = crate::storage::wal::SnapshotManifest {
4280                timeline_id: timeline_id.clone(),
4281                snapshot_key: snapshot_key.clone(),
4282                snapshot_id: snapshot.snapshot_id,
4283                snapshot_time: snapshot.created_at_unix_ms as u64,
4284                base_lsn: current_lsn,
4285                schema_version: crate::api::REDDB_FORMAT_VERSION,
4286                format_version: crate::api::REDDB_FORMAT_VERSION,
4287                snapshot_sha256,
4288            };
4289            crate::storage::wal::publish_snapshot_manifest(backend.as_ref(), &manifest)
4290                .map_err(|err| RedDBError::Internal(err.to_string()))?;
4291
4292            // PLAN.md Phase 11.3 — read the head of the WAL hash chain
4293            // so the new segment can link back. `None` means we're
4294            // starting a fresh timeline (after a clean restore or on
4295            // first archive ever); the segment's `prev_hash` will be
4296            // `None` and restore-side validation accepts that only for
4297            // the first segment in `plan.wal_segments`.
4298            let prev_segment_hash = self.config_string("red.config.timeline.last_segment_hash", "");
4299            let prev_hash_arg = if prev_segment_hash.is_empty() {
4300                None
4301            } else {
4302                Some(prev_segment_hash)
4303            };
4304
4305            let archived_lsn = if let Some(primary) = &self.inner.db.replication {
4306                let oldest = primary
4307                    .logical_wal_spool
4308                    .as_ref()
4309                    .and_then(|spool| spool.oldest_lsn().ok().flatten())
4310                    .or_else(|| primary.wal_buffer.oldest_lsn())
4311                    .unwrap_or(last_archived_lsn);
4312                if last_archived_lsn > 0 && last_archived_lsn < oldest.saturating_sub(1) {
4313                    return Err(RedDBError::Internal(format!(
4314                        "logical WAL gap detected: last_archived_lsn={last_archived_lsn}, oldest_available_lsn={oldest}"
4315                    )));
4316                }
4317                let records = if let Some(spool) = &primary.logical_wal_spool {
4318                    spool
4319                        .read_since(last_archived_lsn, usize::MAX)
4320                        .map_err(|err| RedDBError::Internal(err.to_string()))?
4321                } else {
4322                    primary.wal_buffer.read_since(last_archived_lsn, usize::MAX)
4323                };
4324                if let Some(meta) = crate::storage::wal::archive_change_records(
4325                    backend.as_ref(),
4326                    &wal_prefix,
4327                    &records,
4328                    prev_hash_arg,
4329                )
4330                .map_err(|err| RedDBError::Internal(err.to_string()))?
4331                {
4332                    if let Some(spool) = &primary.logical_wal_spool {
4333                        let _ = spool.prune_through(meta.lsn_end);
4334                    }
4335                    // Advance the chain head so the next archive call
4336                    // links to this segment's hash. If the segment has
4337                    // no sha256 (legacy / hashing failed) we leave the
4338                    // head as-is — the next segment then carries the
4339                    // prior chain head, preserving continuity.
4340                    if let Some(sha) = &meta.sha256 {
4341                        self.inner.db.store().set_config_tree(
4342                            "red.config.timeline",
4343                            &crate::json!({ "last_segment_hash": sha }),
4344                        );
4345                    }
4346                    meta.lsn_end
4347                } else {
4348                    last_archived_lsn
4349                }
4350            } else {
4351                last_archived_lsn
4352            };
4353
4354            let head = crate::storage::wal::BackupHead {
4355                timeline_id,
4356                snapshot_key,
4357                snapshot_id: snapshot.snapshot_id,
4358                snapshot_time: snapshot.created_at_unix_ms as u64,
4359                current_lsn,
4360                last_archived_lsn: archived_lsn,
4361                wal_prefix,
4362            };
4363            crate::storage::wal::publish_backup_head(backend.as_ref(), &head_key, &head)
4364                .map_err(|err| RedDBError::Internal(err.to_string()))?;
4365            self.inner.db.store().set_config_tree(
4366                "red.config.timeline",
4367                &crate::json!({
4368                    "last_archived_lsn": archived_lsn,
4369                    "id": head.timeline_id
4370                }),
4371            );
4372
4373            // PLAN.md Phase 2.4 — refresh the unified `MANIFEST.json`
4374            // at the prefix root so external tooling sees a single
4375            // catalog of every snapshot + WAL segment with their
4376            // checksums. Best-effort: a manifest publish failure
4377            // doesn't fail the backup (the per-artifact sidecars
4378            // already give restore-side integrity), but it does log
4379            // so dashboards can flag stale catalogs.
4380            if let Err(err) = crate::storage::wal::publish_unified_manifest_for_prefix(
4381                backend.as_ref(),
4382                &snapshot_prefix,
4383            ) {
4384                tracing::warn!(
4385                    target: "reddb::backup",
4386                    error = %err,
4387                    snapshot_prefix = %snapshot_prefix,
4388                    "unified MANIFEST.json refresh failed; per-artifact sidecars unaffected"
4389                );
4390            }
4391
4392            // PLAN.md Phase 11.4 — when the operator picked a
4393            // commit policy that demands replica durability, block
4394            // until the configured count of replicas has acked the
4395            // archived LSN (or the timeout fires). For backup the
4396            // policy decides the *DR posture* — `local` returns
4397            // immediately, `ack_n` ensures at least N replicas saw
4398            // the new tail before we report success to the
4399            // operator. A `TimedOut` is logged but does NOT fail
4400            // the backup: the local WAL + remote upload are durable
4401            // regardless; the missing acks are reported via
4402            // /metrics and /admin/status so the operator can decide.
4403            match self.commit_policy() {
4404                crate::replication::CommitPolicy::AckN(n) if n > 0 => {
4405                    let timeout = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
4406                        .ok()
4407                        .and_then(|v| v.parse::<u64>().ok())
4408                        .unwrap_or(5_000);
4409                    let outcome = self.await_replica_acks(
4410                        archived_lsn,
4411                        n,
4412                        std::time::Duration::from_millis(timeout),
4413                    );
4414                    match outcome {
4415                        crate::replication::AwaitOutcome::Reached(count) => {
4416                            tracing::debug!(
4417                                target: "reddb::backup",
4418                                archived_lsn,
4419                                n,
4420                                count,
4421                                "ack_n: replicas synced before backup return"
4422                            );
4423                        }
4424                        crate::replication::AwaitOutcome::TimedOut { observed, required } => {
4425                            tracing::warn!(
4426                                target: "reddb::backup",
4427                                archived_lsn,
4428                                observed,
4429                                required,
4430                                timeout_ms = timeout,
4431                                "ack_n: timed out waiting for replicas; backup uploaded but DR posture degraded"
4432                            );
4433                        }
4434                        crate::replication::AwaitOutcome::NotRequired => {}
4435                    }
4436                }
4437                _ => {} // Local / RemoteWal / Quorum: no blocking yet
4438            }
4439
4440            // Issue #148 follow-up — opt-in archive of the L2 Blob Cache
4441            // directory tree. Default off so a standard backup stays
4442            // small; flip via `red.config.backup.include_blob_cache=true`
4443            // when warm-cache restore is required (per
4444            // docs/operations/blob-cache-backup-restore.md §1).
4445            //
4446            // The L2 tree is *derived* state (ADR 0006) — its absence
4447            // never causes data loss; it only affects post-restore
4448            // p99 latency until the cache re-warms. We therefore log
4449            // (not fail) on per-file upload errors so a partial L2
4450            // upload never aborts a healthy snapshot+WAL backup.
4451            if self.config_bool("red.config.backup.include_blob_cache", false) {
4452                let blob_cache_prefix = self.config_string(
4453                    "red.config.backup.blob_cache_prefix",
4454                    &format!("{snapshot_prefix}blob_cache/"),
4455                );
4456                if let Some(l2_path) = self.inner.result_blob_cache.l2_path() {
4457                    match crate::storage::cache::archive_blob_cache_l2(
4458                        backend.as_ref(),
4459                        l2_path,
4460                        &blob_cache_prefix,
4461                    ) {
4462                        Ok(count) => {
4463                            tracing::info!(
4464                                target: "reddb::backup",
4465                                files_uploaded = count,
4466                                blob_cache_prefix = %blob_cache_prefix,
4467                                "include_blob_cache: archived L2 directory"
4468                            );
4469                        }
4470                        Err(err) => {
4471                            tracing::warn!(
4472                                target: "reddb::backup",
4473                                error = %err,
4474                                blob_cache_prefix = %blob_cache_prefix,
4475                                "include_blob_cache: L2 archive failed; backup proceeding (cache is derived state)"
4476                            );
4477                        }
4478                    }
4479                } else {
4480                    tracing::debug!(
4481                        target: "reddb::backup",
4482                        "include_blob_cache=true but no L2 path configured; nothing to archive"
4483                    );
4484                }
4485            }
4486
4487            uploaded = true;
4488        }
4489
4490        Ok(crate::replication::scheduler::BackupResult {
4491            snapshot_id: snapshot.snapshot_id,
4492            uploaded,
4493            duration_ms: started.elapsed().as_millis() as u64,
4494            timestamp: snapshot.created_at_unix_ms as u64,
4495        })
4496    }
4497
4498    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
4499        let mut pool = self
4500            .inner
4501            .pool
4502            .lock()
4503            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
4504        if pool.active >= self.inner.pool_config.max_connections {
4505            return Err(RedDBError::Internal(
4506                "connection pool exhausted".to_string(),
4507            ));
4508        }
4509
4510        let id = if let Some(id) = pool.idle.pop() {
4511            id
4512        } else {
4513            let id = pool.next_id;
4514            pool.next_id += 1;
4515            id
4516        };
4517        pool.active += 1;
4518        pool.total_checkouts += 1;
4519        drop(pool);
4520
4521        Ok(RuntimeConnection {
4522            id,
4523            inner: Arc::clone(&self.inner),
4524        })
4525    }
4526
4527    pub fn checkpoint(&self) -> RedDBResult<()> {
4528        // Local fsync always allowed — losing the lease shouldn't
4529        // prevent us from durably persisting what's already in memory.
4530        // The remote upload is the side-effect that risks clobbering a
4531        // peer's state, so it's behind the lease gate.
4532        self.inner.db.flush_local_only().map_err(|err| {
4533            // Issue #205 — local flush failure is a CheckpointFailed
4534            // operator-grade event. The local-flush path also covers
4535            // the WAL fsync we depend on, so a failure here doubles as
4536            // the WalFsyncFailed signal for the runtime entry point.
4537            let msg = err.to_string();
4538            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
4539                lsn: 0,
4540                error: msg.clone(),
4541            }
4542            .emit_global();
4543            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
4544                path: "<flush_local_only>".to_string(),
4545                error: msg.clone(),
4546            }
4547            .emit_global();
4548            RedDBError::Engine(msg)
4549        })?;
4550        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
4551            tracing::warn!(
4552                target: "reddb::serverless::lease",
4553                error = %err,
4554                "checkpoint: skipping remote upload — lease not held"
4555            );
4556            return Ok(());
4557        }
4558        self.inner
4559            .db
4560            .upload_to_remote_backend()
4561            .map_err(|err| RedDBError::Engine(err.to_string()))
4562    }
4563
4564    /// Guard remote-mutating operations on the writer lease.
4565    /// Returns `Ok(())` when no remote backend is configured (the
4566    /// lease is irrelevant) or the lease state is `NotRequired` /
4567    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
4568    /// `NotHeld`, with an audit-friendly action label so the caller
4569    /// can record the rejection.
4570    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
4571        if self.inner.db.remote_backend.is_none() {
4572            return Ok(());
4573        }
4574        match self.inner.write_gate.lease_state() {
4575            crate::runtime::write_gate::LeaseGateState::NotHeld => {
4576                self.inner.audit_log.record(
4577                    action,
4578                    "system",
4579                    "remote_backend",
4580                    "err: writer lease not held",
4581                    crate::json::Value::Null,
4582                );
4583                Err(RedDBError::ReadOnly(format!(
4584                    "writer lease not held — {action} blocked (serverless fence)"
4585                )))
4586            }
4587            _ => Ok(()),
4588        }
4589    }
4590
4591    pub fn run_maintenance(&self) -> RedDBResult<()> {
4592        self.inner
4593            .db
4594            .run_maintenance()
4595            .map_err(|err| RedDBError::Internal(err.to_string()))
4596    }
4597
4598    pub fn scan_collection(
4599        &self,
4600        collection: &str,
4601        cursor: Option<ScanCursor>,
4602        limit: usize,
4603    ) -> RedDBResult<ScanPage> {
4604        let store = self.inner.db.store();
4605        let manager = store
4606            .get_collection(collection)
4607            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
4608
4609        let mut entities = manager.query_all(|_| true);
4610        entities.sort_by_key(|entity| entity.id.raw());
4611
4612        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
4613        let total = entities.len();
4614        let end = total.min(offset.saturating_add(limit.max(1)));
4615        let items = if offset >= total {
4616            Vec::new()
4617        } else {
4618            entities[offset..end].to_vec()
4619        };
4620        let next = (end < total).then_some(ScanCursor { offset: end });
4621
4622        Ok(ScanPage {
4623            collection: collection.to_string(),
4624            items,
4625            next,
4626            total,
4627        })
4628    }
4629
4630    pub fn catalog(&self) -> CatalogModelSnapshot {
4631        self.inner.db.catalog_model_snapshot()
4632    }
4633
4634    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
4635        self.inner.db.catalog_consistency_report()
4636    }
4637
4638    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
4639        crate::catalog::attention_summary(&self.catalog())
4640    }
4641
4642    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
4643        crate::catalog::collection_attention(&self.catalog())
4644    }
4645
4646    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
4647        crate::catalog::index_attention(&self.catalog())
4648    }
4649
4650    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
4651        crate::catalog::graph_projection_attention(&self.catalog())
4652    }
4653
4654    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
4655        crate::catalog::analytics_job_attention(&self.catalog())
4656    }
4657
4658    pub fn stats(&self) -> RuntimeStats {
4659        let pool = runtime_pool_lock(self);
4660        RuntimeStats {
4661            active_connections: pool.active,
4662            idle_connections: pool.idle.len(),
4663            total_checkouts: pool.total_checkouts,
4664            paged_mode: self.inner.db.is_paged(),
4665            started_at_unix_ms: self.inner.started_at_unix_ms,
4666            store: self.inner.db.stats(),
4667            system: SystemInfo::collect(),
4668            result_blob_cache: self.inner.result_blob_cache.stats(),
4669            kv: self.inner.kv_stats.snapshot(),
4670            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
4671        }
4672    }
4673
4674    pub(crate) fn record_metrics_ingest(
4675        &self,
4676        accepted_samples: u64,
4677        accepted_series: u64,
4678        rejected_samples: u64,
4679        rejected_series: u64,
4680    ) {
4681        self.inner.metrics_ingest_stats.record(
4682            accepted_samples,
4683            accepted_series,
4684            rejected_samples,
4685            rejected_series,
4686        );
4687    }
4688
4689    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
4690        self.inner
4691            .metrics_ingest_stats
4692            .record_cardinality_budget_rejections(rejected_series);
4693    }
4694
4695    pub(crate) fn record_metrics_tenant_activity(
4696        &self,
4697        tenant: &str,
4698        namespace: &str,
4699        operation: &str,
4700    ) {
4701        self.inner
4702            .metrics_tenant_activity_stats
4703            .record(tenant, namespace, operation);
4704    }
4705
4706    pub(crate) fn metrics_tenant_activity_snapshot(
4707        &self,
4708    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
4709        self.inner.metrics_tenant_activity_stats.snapshot()
4710    }
4711
4712    /// Execute a query under a typed scope override without embedding
4713    /// the tenant / user / role values into the SQL string. Use this
4714    /// from transport middleware (HTTP / gRPC / worker loops) where the
4715    /// scope is resolved from auth claims and the SQL is a parameterised
4716    /// template — avoids the string-concat injection risk of building
4717    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4718    /// prepared statements that didn't know about tenancy.
4719    ///
4720    /// Precedence matches the `WITHIN` clause: the passed `scope`
4721    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4722    /// The override is pushed on the thread-local scope stack for the
4723    /// duration of the call and popped on return — pool-shared
4724    /// connections cannot leak it across requests.
4725    pub fn execute_query_with_scope(
4726        &self,
4727        query: &str,
4728        scope: crate::runtime::within_clause::ScopeOverride,
4729    ) -> RedDBResult<RuntimeQueryResult> {
4730        if scope.is_empty() {
4731            return self.execute_query(query);
4732        }
4733        let _scope_guard = ScopeOverrideGuard::install(scope);
4734        self.execute_query(query)
4735    }
4736
4737    /// Issue #205 — single lifecycle exit for slow-query logging.
4738    ///
4739    /// `execute_query_inner` does the real work; this wrapper times it
4740    /// and, if elapsed exceeds the configured threshold, hands the
4741    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
4742    /// SlowQueryLogger. The threshold + sample_pct were captured at
4743    /// SlowQueryLogger construction (runtime startup), so the per-call
4744    /// cost on below-threshold paths is one relaxed atomic load.
4745    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4746        let started = std::time::Instant::now();
4747        let result = self.execute_query_inner(query);
4748        let elapsed_ms = started.elapsed().as_millis() as u64;
4749
4750        // Build EffectiveScope from the same thread-locals frame-build
4751        // consults — keeps the slow-log row consistent with the audit /
4752        // RLS view of "this statement". `ai_scope()` is the canonical
4753        // builder.
4754        let scope = self.ai_scope();
4755        let kind = match result
4756            .as_ref()
4757            .map(|r| r.statement_type)
4758            .unwrap_or("select")
4759        {
4760            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4761            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4762            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4763            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4764            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4765        };
4766        // SQL redaction: pass the raw query through. The slow-query
4767        // logger writes structured JSON so embedded literals stay
4768        // escape-safe at the JSON boundary (proven by
4769        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4770        // PII redaction (e.g. literal masking) is a follow-up.
4771        self.inner
4772            .slow_query_logger
4773            .record(kind, elapsed_ms, query.to_string(), &scope);
4774
4775        result
4776    }
4777
4778    #[inline(never)]
4779    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4780        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4781        //
4782        // Moved above every boot-cost the normal path pays (WITHIN
4783        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
4784        // guard, tracing span, tx_contexts read) because the bench's
4785        // `select_point` scenario was observed at 28× vs PostgreSQL —
4786        // the dominant cost wasn't the entity fetch but the ceremony
4787        // before it. Only fires when there's no ambient transaction
4788        // context or WITHIN override, so the snapshot install we skip
4789        // truly is a no-op for this query.
4790        if !has_scope_override_active()
4791            && !query.trim_start().starts_with("WITHIN")
4792            && !query.trim_start().starts_with("within")
4793            && !self
4794                .inner
4795                .tx_contexts
4796                .read()
4797                .contains_key(&current_connection_id())
4798        {
4799            if let Some(result) = self.try_fast_entity_lookup(query) {
4800                return result;
4801            }
4802        }
4803
4804        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4805        // strip the prefix, push a stack-scoped override, recurse on
4806        // the inner statement, pop on return. Stack lives in a
4807        // thread-local but is balanced by the RAII guard, so a
4808        // pool-shared connection cannot leak the override across
4809        // requests and an early `?` return still pops cleanly.
4810        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4811            Ok(Some((scope, inner))) => {
4812                let _scope_guard = ScopeOverrideGuard::install(scope);
4813                // Re-enter the inner path, NOT `execute_query`, so the
4814                // slow-query lifecycle hook records exactly one row per
4815                // top-level statement (the WITHIN-stripped form would
4816                // double-record).
4817                return self.execute_query_inner(inner);
4818            }
4819            Ok(None) => {}
4820            Err(msg) => return Err(RedDBError::Query(msg)),
4821        }
4822
4823        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4824        // inner statement (WITHOUT executing it) and returns the
4825        // CanonicalLogicalNode tree as rows so the caller can see the
4826        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4827        // is a distinct schema-diff command and continues down the
4828        // regular SQL path.
4829        if let Some(inner) = strip_explain_prefix(query) {
4830            return self.explain_as_rows(query, inner);
4831        }
4832
4833        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4834        // override and return. Outside a transaction the statement is
4835        // an error (matches PG semantics: SET LOCAL only takes effect
4836        // within an active transaction).
4837        if let Some(value) = parse_set_local_tenant(query)? {
4838            let conn_id = current_connection_id();
4839            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4840                return Err(RedDBError::Query(
4841                    "SET LOCAL TENANT requires an active transaction".to_string(),
4842                ));
4843            }
4844            self.inner
4845                .tx_local_tenants
4846                .write()
4847                .insert(conn_id, value.clone());
4848            return Ok(RuntimeQueryResult::ok_message(
4849                query.to_string(),
4850                &match &value {
4851                    Some(id) => format!("local tenant set: {id}"),
4852                    None => "local tenant cleared".to_string(),
4853                },
4854                "set_local_tenant",
4855            ));
4856        }
4857
4858        if super::red_schema::is_system_schema_write(query) {
4859            return Err(RedDBError::Query(
4860                super::red_schema::READ_ONLY_ERROR.to_string(),
4861            ));
4862        }
4863
4864        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4865        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4866
4867        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4868        let _frame_guards = frame.install(self);
4869
4870        // Phase 6 logging: enter a span stamped with conn_id / tenant
4871        // / query_len. Every downstream tracing::info!/warn!/error!
4872        // inherits these fields — no need to thread them manually
4873        // through storage/scan layers. Entered AFTER the WITHIN /
4874        // SET LOCAL TENANT resolution above so the span reflects the
4875        // effective scope for this statement.
4876        let _log_span = crate::telemetry::span::query_span(query).entered();
4877
4878        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4879        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4880            return self.execute_query_expr(rewritten);
4881        }
4882
4883        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4884        if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4885            return result;
4886        }
4887
4888        // ── Result cache: return cached result if still fresh (30s TTL) ──
4889        if let Some(result) = frame.read_result_cache(self) {
4890            return Ok(result);
4891        }
4892
4893        let prepared = frame.prepare_statement(self, execution_query)?;
4894        let mode = prepared.mode;
4895        let expr = prepared.expr;
4896
4897        let statement = query_expr_name(&expr);
4898        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4899
4900        let _lock_guard = frame.prepare_dispatch(self, &expr)?;
4901        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4902
4903        let query_result = match expr {
4904            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4905                // Apply MVCC visibility + RLS gate while materialising the
4906                // graph: every node entity is screened against the source
4907                // collection's policy chain (basic and `Nodes`-targeted)
4908                // and dropped when the caller's tenant / role doesn't
4909                // admit it. Edges are pruned automatically because the
4910                // graph builder skips edges whose endpoints aren't in
4911                // `allowed_nodes`.
4912                let (graph, node_properties, edge_properties) =
4913                    self.materialize_graph_with_rls()?;
4914                let result =
4915                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
4916                        &graph,
4917                        &expr,
4918                        node_properties,
4919                        edge_properties,
4920                    )
4921                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4922
4923                Ok(RuntimeQueryResult {
4924                    query: query.to_string(),
4925                    mode,
4926                    statement,
4927                    engine: "materialized-graph",
4928                    result,
4929                    affected_rows: 0,
4930                    statement_type: "select",
4931                })
4932            }
4933            QueryExpr::Table(table) => {
4934                let table = self.resolve_table_expr_subqueries(
4935                    table,
4936                    &frame as &dyn super::statement_frame::ReadFrame,
4937                )?;
4938                if super::red_schema::is_virtual_table(&table.table) {
4939                    return Ok(RuntimeQueryResult {
4940                        query: query.to_string(),
4941                        mode,
4942                        statement,
4943                        engine: "runtime-red-schema",
4944                        result: super::red_schema::red_query(
4945                            self,
4946                            &table.table,
4947                            &table,
4948                            &frame as &dyn super::statement_frame::ReadFrame,
4949                        )?,
4950                        affected_rows: 0,
4951                        statement_type: "select",
4952                    });
4953                }
4954
4955                if let Some(result) = self.execute_probabilistic_select(&table)? {
4956                    return Ok(RuntimeQueryResult {
4957                        query: query.to_string(),
4958                        mode,
4959                        statement,
4960                        engine: "runtime-probabilistic",
4961                        result,
4962                        affected_rows: 0,
4963                        statement_type: "select",
4964                    });
4965                }
4966
4967                // Foreign-table intercept (Phase 3.2.2 PG parity).
4968                //
4969                // When the referenced table matches a `CREATE FOREIGN TABLE`
4970                // registration, short-circuit into the FDW scan. Phase 3.2
4971                // wrappers don't yet support pushdown, so filters/projections
4972                // apply post-scan via `apply_foreign_table_filters` — good
4973                // enough for correctness; perf work lands in 3.2.3.
4974                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4975                    let records = self
4976                        .inner
4977                        .foreign_tables
4978                        .scan(&table.table)
4979                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4980                    let result = apply_foreign_table_filters(records, &table);
4981                    return Ok(RuntimeQueryResult {
4982                        query: query.to_string(),
4983                        mode,
4984                        statement,
4985                        engine: "runtime-fdw",
4986                        result,
4987                        affected_rows: 0,
4988                        statement_type: "select",
4989                    });
4990                }
4991
4992                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4993                //
4994                // When RLS is enabled on this table, fetch every policy
4995                // that applies to the current (role, SELECT) pair and
4996                // fold them into the query's WHERE clause: policies
4997                // OR-combine (any of them admitting the row is enough),
4998                // then AND into the caller's existing filter.
4999                //
5000                // Anonymous callers (no thread-local identity) pass
5001                // `role = None`; policies with a specific `TO role`
5002                // clause skip, but `TO PUBLIC` policies still apply.
5003                //
5004                // When `inject_rls_filters` returns `None` the table has
5005                // RLS enabled but no policy admits the caller's role —
5006                // short-circuit with an empty result set instead of
5007                // synthesising a contradiction filter.
5008                let Some(table_with_rls) = self.authorize_relational_table_select(
5009                    table,
5010                    &frame as &dyn super::statement_frame::ReadFrame,
5011                )?
5012                else {
5013                    let empty = crate::storage::query::unified::UnifiedResult::empty();
5014                    return Ok(RuntimeQueryResult {
5015                        query: query.to_string(),
5016                        mode,
5017                        statement,
5018                        engine: "runtime-table-rls",
5019                        result: empty,
5020                        affected_rows: 0,
5021                        statement_type: "select",
5022                    });
5023                };
5024                Ok(RuntimeQueryResult {
5025                    query: query.to_string(),
5026                    mode,
5027                    statement,
5028                    engine: "runtime-table",
5029                    result: execute_runtime_table_query(
5030                        &self.inner.db,
5031                        &table_with_rls,
5032                        Some(&self.inner.index_store),
5033                    )?,
5034                    affected_rows: 0,
5035                    statement_type: "select",
5036                })
5037            }
5038            QueryExpr::Join(join) => {
5039                // Fold per-table RLS filters into each `QueryExpr::Table`
5040                // leaf of the join tree before executing. Without this
5041                // the join executor scans both tables raw and ignores
5042                // policies — a `WITHIN TENANT 'x'` against a join of
5043                // two tenant-scoped tables would leak cross-tenant rows.
5044                // When any leaf has RLS enabled and zero matching policy,
5045                // short-circuit to an empty join result instead of
5046                // emitting a contradiction filter.
5047                let join_with_rls = match self.authorize_relational_join_select(
5048                    join,
5049                    &frame as &dyn super::statement_frame::ReadFrame,
5050                )? {
5051                    Some(j) => j,
5052                    None => {
5053                        return Ok(RuntimeQueryResult {
5054                            query: query.to_string(),
5055                            mode,
5056                            statement,
5057                            engine: "runtime-join-rls",
5058                            result: crate::storage::query::unified::UnifiedResult::empty(),
5059                            affected_rows: 0,
5060                            statement_type: "select",
5061                        });
5062                    }
5063                };
5064                Ok(RuntimeQueryResult {
5065                    query: query.to_string(),
5066                    mode,
5067                    statement,
5068                    engine: "runtime-join",
5069                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
5070                    affected_rows: 0,
5071                    statement_type: "select",
5072                })
5073            }
5074            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
5075                query: query.to_string(),
5076                mode,
5077                statement,
5078                engine: "runtime-vector",
5079                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
5080                affected_rows: 0,
5081                statement_type: "select",
5082            }),
5083            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
5084                query: query.to_string(),
5085                mode,
5086                statement,
5087                engine: "runtime-hybrid",
5088                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
5089                affected_rows: 0,
5090                statement_type: "select",
5091            }),
5092            // DML execution
5093            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
5094                Err(RedDBError::Query(
5095                    super::red_schema::READ_ONLY_ERROR.to_string(),
5096                ))
5097            }
5098            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
5099                Err(RedDBError::Query(
5100                    super::red_schema::READ_ONLY_ERROR.to_string(),
5101                ))
5102            }
5103            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
5104                Err(RedDBError::Query(
5105                    super::red_schema::READ_ONLY_ERROR.to_string(),
5106                ))
5107            }
5108            QueryExpr::Insert(ref insert) => self
5109                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
5110                    self.execute_insert(query, insert)
5111                }),
5112            QueryExpr::Update(ref update) => self
5113                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
5114                    self.execute_update(query, update)
5115                }),
5116            QueryExpr::Delete(ref delete) => self
5117                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
5118                    self.execute_delete(query, delete)
5119                }),
5120            // DDL execution
5121            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
5122            QueryExpr::CreateCollection(ref create) => {
5123                self.execute_create_collection(query, create)
5124            }
5125            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
5126            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
5127            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
5128            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
5129            QueryExpr::DropDocument(ref drop_document) => {
5130                self.execute_drop_document(query, drop_document)
5131            }
5132            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
5133            QueryExpr::DropCollection(ref drop_collection) => {
5134                self.execute_drop_collection(query, drop_collection)
5135            }
5136            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
5137            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
5138            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
5139            // Graph analytics commands
5140            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
5141            // Search commands
5142            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
5143            // ASK: RAG query with LLM synthesis
5144            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
5145            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
5146            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
5147            QueryExpr::ProbabilisticCommand(ref cmd) => {
5148                self.execute_probabilistic_command(query, cmd)
5149            }
5150            // Time-series DDL
5151            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
5152            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
5153            // Queue DDL and commands
5154            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
5155            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
5156            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
5157            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
5158            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
5159            QueryExpr::EventsBackfill(ref backfill) => {
5160                self.execute_events_backfill(query, backfill)
5161            }
5162            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
5163                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
5164            ))),
5165            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
5166            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
5167            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
5168            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
5169            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
5170            // SET CONFIG key = value
5171            QueryExpr::SetConfig { ref key, ref value } => {
5172                if key.starts_with("red.secret.") {
5173                    return Err(RedDBError::Query(
5174                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
5175                    ));
5176                }
5177                let store = self.inner.db.store();
5178                let json_val = match value {
5179                    Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
5180                    Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
5181                    Value::Float(n) => crate::serde_json::Value::Number(*n),
5182                    Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
5183                    _ => crate::serde_json::Value::String(value.to_string()),
5184                };
5185                store.set_config_tree(key, &json_val);
5186                update_current_config_value(key, value.clone());
5187                // Config changes can flip runtime behavior mid-session
5188                // (auto_decrypt, auto_encrypt, etc.) — invalidate the
5189                // result cache so subsequent reads re-execute against
5190                // the new config.
5191                self.invalidate_result_cache();
5192                Ok(RuntimeQueryResult::ok_message(
5193                    query.to_string(),
5194                    &format!("config set: {key}"),
5195                    "set",
5196                ))
5197            }
5198            // SET SECRET key = value
5199            QueryExpr::SetSecret { ref key, ref value } => {
5200                if key.starts_with("red.config.") {
5201                    return Err(RedDBError::Query(
5202                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
5203                    ));
5204                }
5205                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5206                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
5207                })?;
5208                if matches!(value, Value::Null) {
5209                    auth_store
5210                        .vault_kv_try_delete(key)
5211                        .map_err(|err| RedDBError::Query(err.to_string()))?;
5212                    update_current_secret_value(key, None);
5213                    self.invalidate_result_cache();
5214                    return Ok(RuntimeQueryResult::ok_message(
5215                        query.to_string(),
5216                        &format!("secret deleted: {key}"),
5217                        "delete_secret",
5218                    ));
5219                }
5220                let value = secret_sql_value_to_string(value)?;
5221                auth_store
5222                    .vault_kv_try_set(key.clone(), value.clone())
5223                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5224                update_current_secret_value(key, Some(value));
5225                self.invalidate_result_cache();
5226                Ok(RuntimeQueryResult::ok_message(
5227                    query.to_string(),
5228                    &format!("secret set: {key}"),
5229                    "set_secret",
5230                ))
5231            }
5232            // DELETE SECRET key
5233            QueryExpr::DeleteSecret { ref key } => {
5234                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5235                    RedDBError::Query(
5236                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
5237                    )
5238                })?;
5239                let deleted = auth_store
5240                    .vault_kv_try_delete(key)
5241                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5242                if deleted {
5243                    update_current_secret_value(key, None);
5244                }
5245                self.invalidate_result_cache();
5246                Ok(RuntimeQueryResult::ok_message(
5247                    query.to_string(),
5248                    &format!("secret deleted: {key}"),
5249                    if deleted {
5250                        "delete_secret"
5251                    } else {
5252                        "delete_secret_not_found"
5253                    },
5254                ))
5255            }
5256            // SHOW SECRET[S] [prefix]
5257            QueryExpr::ShowSecrets { ref prefix } => {
5258                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5259                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
5260                })?;
5261                if !auth_store.is_vault_backed() {
5262                    return Err(RedDBError::Query(
5263                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
5264                    ));
5265                }
5266                let mut keys = auth_store.vault_kv_keys();
5267                keys.sort();
5268                let mut result = UnifiedResult::with_columns(vec![
5269                    "key".into(),
5270                    "value".into(),
5271                    "status".into(),
5272                ]);
5273                for key in keys {
5274                    if let Some(ref pfx) = prefix {
5275                        if !key.starts_with(pfx) {
5276                            continue;
5277                        }
5278                    }
5279                    let mut record = UnifiedRecord::new();
5280                    record.set("key", Value::text(key));
5281                    record.set("value", Value::text("***"));
5282                    record.set("status", Value::text("active"));
5283                    result.push(record);
5284                }
5285                Ok(RuntimeQueryResult {
5286                    query: query.to_string(),
5287                    mode,
5288                    statement: "show_secrets",
5289                    engine: "runtime-secret",
5290                    result,
5291                    affected_rows: 0,
5292                    statement_type: "select",
5293                })
5294            }
5295            // SHOW CONFIG [prefix]
5296            QueryExpr::ShowConfig { ref prefix } => {
5297                let store = self.inner.db.store();
5298                let all_collections = store.list_collections();
5299                if !all_collections.contains(&"red_config".to_string()) {
5300                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5301                    return Ok(RuntimeQueryResult {
5302                        query: query.to_string(),
5303                        mode,
5304                        statement: "show_config",
5305                        engine: "runtime-config",
5306                        result,
5307                        affected_rows: 0,
5308                        statement_type: "select",
5309                    });
5310                }
5311                let manager = store
5312                    .get_collection("red_config")
5313                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
5314                let entities = manager.query_all(|_| true);
5315                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
5316                for entity in entities {
5317                    if let EntityData::Row(ref row) = entity.data {
5318                        if let Some(ref named) = row.named {
5319                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
5320                            let val = named.get("value").cloned().unwrap_or(Value::Null);
5321                            let key_str = match &key_val {
5322                                Value::Text(s) => s.as_ref(),
5323                                _ => continue,
5324                            };
5325                            if let Some(ref pfx) = prefix {
5326                                if !key_str.starts_with(pfx.as_str()) {
5327                                    continue;
5328                                }
5329                            }
5330                            let entity_id = entity.id.raw();
5331                            match latest.get(key_str) {
5332                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
5333                                _ => {
5334                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
5335                                }
5336                            }
5337                        }
5338                    }
5339                }
5340                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5341                for (_, key_val, val) in latest.into_values() {
5342                    let mut record = UnifiedRecord::new();
5343                    record.set("key", key_val);
5344                    record.set("value", val);
5345                    result.push(record);
5346                }
5347                Ok(RuntimeQueryResult {
5348                    query: query.to_string(),
5349                    mode,
5350                    statement: "show_config",
5351                    engine: "runtime-config",
5352                    result,
5353                    affected_rows: 0,
5354                    statement_type: "select",
5355                })
5356            }
5357            // Session-local multi-tenancy handle (Phase 2.5.3).
5358            //
5359            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
5360            // the thread-local; SHOW TENANT returns it. Paired with the
5361            // CURRENT_TENANT() scalar for use in RLS policies.
5362            QueryExpr::SetTenant(ref value) => {
5363                match value {
5364                    Some(id) => set_current_tenant(id.clone()),
5365                    None => clear_current_tenant(),
5366                }
5367                Ok(RuntimeQueryResult::ok_message(
5368                    query.to_string(),
5369                    &match value {
5370                        Some(id) => format!("tenant set: {id}"),
5371                        None => "tenant cleared".to_string(),
5372                    },
5373                    "set_tenant",
5374                ))
5375            }
5376            QueryExpr::ShowTenant => {
5377                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
5378                let mut record = UnifiedRecord::new();
5379                record.set(
5380                    "tenant",
5381                    current_tenant().map(Value::text).unwrap_or(Value::Null),
5382                );
5383                result.push(record);
5384                Ok(RuntimeQueryResult {
5385                    query: query.to_string(),
5386                    mode,
5387                    statement: "show_tenant",
5388                    engine: "runtime-tenant",
5389                    result,
5390                    affected_rows: 0,
5391                    statement_type: "select",
5392                })
5393            }
5394            // Transaction control (Phase 2.3 PG parity).
5395            //
5396            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
5397            // the current connection's id. COMMIT/ROLLBACK release it through
5398            // the `SnapshotManager` so future snapshots see the correct set of
5399            // active/aborted transactions.
5400            //
5401            // Tuple stamping (xmin/xmax) and read-path visibility filtering
5402            // land in Phase 2.3.2 — this dispatch only manages the snapshot
5403            // registry. Statements running outside a TxnContext still behave
5404            // as autocommit (xid=0 → visible to every snapshot).
5405            QueryExpr::TransactionControl(ref ctl) => {
5406                use crate::storage::query::ast::TxnControl;
5407                use crate::storage::transaction::snapshot::{TxnContext, Xid};
5408                use crate::storage::transaction::IsolationLevel;
5409
5410                // Phase 2.3 keys transactions by a thread-local connection id.
5411                // The stdio/gRPC paths wire a real per-connection id later;
5412                // for embedded use (one RedDBRuntime per process-ish caller)
5413                // we fall back to a deterministic placeholder.
5414                let conn_id = current_connection_id();
5415
5416                let (kind, msg) = match ctl {
5417                    TxnControl::Begin => {
5418                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5419                        let xid = mgr.begin();
5420                        let snapshot = mgr.snapshot(xid);
5421                        let ctx = TxnContext {
5422                            xid,
5423                            isolation: IsolationLevel::SnapshotIsolation,
5424                            snapshot,
5425                            savepoints: Vec::new(),
5426                            released_sub_xids: Vec::new(),
5427                        };
5428                        self.inner.tx_contexts.write().insert(conn_id, ctx);
5429                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
5430                    }
5431                    TxnControl::Commit => {
5432                        // SET LOCAL TENANT ends with the transaction.
5433                        self.inner.tx_local_tenants.write().remove(&conn_id);
5434                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5435                        match ctx {
5436                            Some(ctx) => {
5437                                let mut own_xids = std::collections::HashSet::new();
5438                                own_xids.insert(ctx.xid);
5439                                for (_, sub) in &ctx.savepoints {
5440                                    own_xids.insert(*sub);
5441                                }
5442                                for sub in &ctx.released_sub_xids {
5443                                    own_xids.insert(*sub);
5444                                }
5445                                if let Err(err) = self.check_table_row_write_conflicts(
5446                                    conn_id,
5447                                    &ctx.snapshot,
5448                                    &own_xids,
5449                                ) {
5450                                    for (_, sub) in &ctx.savepoints {
5451                                        self.inner.snapshot_manager.rollback(*sub);
5452                                    }
5453                                    for sub in &ctx.released_sub_xids {
5454                                        self.inner.snapshot_manager.rollback(*sub);
5455                                    }
5456                                    self.inner.snapshot_manager.rollback(ctx.xid);
5457                                    self.revive_pending_versioned_updates(conn_id);
5458                                    self.revive_pending_tombstones(conn_id);
5459                                    self.discard_pending_kv_watch_events(conn_id);
5460                                    self.discard_pending_store_wal_actions(conn_id);
5461                                    return Err(err);
5462                                }
5463                                self.restore_pending_write_stamps(conn_id);
5464                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
5465                                    for (_, sub) in &ctx.savepoints {
5466                                        self.inner.snapshot_manager.rollback(*sub);
5467                                    }
5468                                    for sub in &ctx.released_sub_xids {
5469                                        self.inner.snapshot_manager.rollback(*sub);
5470                                    }
5471                                    self.inner.snapshot_manager.rollback(ctx.xid);
5472                                    self.revive_pending_versioned_updates(conn_id);
5473                                    self.revive_pending_tombstones(conn_id);
5474                                    self.discard_pending_kv_watch_events(conn_id);
5475                                    return Err(err);
5476                                }
5477                                // Phase 2.3.2e: commit every open sub-xid
5478                                // so they also become visible. Their
5479                                // work is promoted to the parent txn's
5480                                // result exactly like a RELEASE would
5481                                // have done.
5482                                for (_, sub) in &ctx.savepoints {
5483                                    self.inner.snapshot_manager.commit(*sub);
5484                                }
5485                                for sub in &ctx.released_sub_xids {
5486                                    self.inner.snapshot_manager.commit(*sub);
5487                                }
5488                                self.inner.snapshot_manager.commit(ctx.xid);
5489                                self.finalize_pending_versioned_updates(conn_id);
5490                                self.finalize_pending_tombstones(conn_id);
5491                                self.finalize_pending_kv_watch_events(conn_id);
5492                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
5493                            }
5494                            None => (
5495                                "commit",
5496                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
5497                            ),
5498                        }
5499                    }
5500                    TxnControl::Rollback => {
5501                        self.inner.tx_local_tenants.write().remove(&conn_id);
5502                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5503                        match ctx {
5504                            Some(ctx) => {
5505                                // Phase 2.3.2e: abort every open sub-xid
5506                                // too so their writes stay hidden.
5507                                for (_, sub) in &ctx.savepoints {
5508                                    self.inner.snapshot_manager.rollback(*sub);
5509                                }
5510                                for sub in &ctx.released_sub_xids {
5511                                    self.inner.snapshot_manager.rollback(*sub);
5512                                }
5513                                self.inner.snapshot_manager.rollback(ctx.xid);
5514                                // Phase 2.3.2b: tuples that the txn had
5515                                // xmax-stamped become live again — wipe xmax
5516                                // back to 0 so later snapshots see them.
5517                                self.revive_pending_versioned_updates(conn_id);
5518                                self.revive_pending_tombstones(conn_id);
5519                                self.discard_pending_kv_watch_events(conn_id);
5520                                self.discard_pending_store_wal_actions(conn_id);
5521                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
5522                            }
5523                            None => (
5524                                "rollback",
5525                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
5526                            ),
5527                        }
5528                    }
5529                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
5530                    // SAVEPOINT allocates a fresh xid and pushes it
5531                    // onto the per-txn stack so subsequent writes can
5532                    // be selectively rolled back. RELEASE pops without
5533                    // aborting; ROLLBACK TO aborts the sub-xid (and
5534                    // any nested ones) + revives their tombstones.
5535                    TxnControl::Savepoint(name) => {
5536                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5537                        let mut guard = self.inner.tx_contexts.write();
5538                        match guard.get_mut(&conn_id) {
5539                            Some(ctx) => {
5540                                let sub = mgr.begin();
5541                                ctx.savepoints.push((name.clone(), sub));
5542                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
5543                            }
5544                            None => (
5545                                "savepoint",
5546                                "SAVEPOINT outside transaction — no-op".to_string(),
5547                            ),
5548                        }
5549                    }
5550                    TxnControl::ReleaseSavepoint(name) => {
5551                        let mut guard = self.inner.tx_contexts.write();
5552                        match guard.get_mut(&conn_id) {
5553                            Some(ctx) => {
5554                                let pos = ctx
5555                                    .savepoints
5556                                    .iter()
5557                                    .position(|(n, _)| n == name)
5558                                    .ok_or_else(|| {
5559                                        RedDBError::Internal(format!(
5560                                            "savepoint {name} does not exist"
5561                                        ))
5562                                    })?;
5563                                // RELEASE pops the named savepoint and
5564                                // any nested ones. Their sub-xids move
5565                                // to `released_sub_xids` so they commit
5566                                // (or roll back) alongside the parent
5567                                // xid — PG semantics: released
5568                                // savepoints still contribute their
5569                                // work, but their names are gone.
5570                                let released = ctx.savepoints.len() - pos;
5571                                let popped: Vec<Xid> = ctx
5572                                    .savepoints
5573                                    .split_off(pos)
5574                                    .into_iter()
5575                                    .map(|(_, x)| x)
5576                                    .collect();
5577                                ctx.released_sub_xids.extend(popped);
5578                                (
5579                                    "release_savepoint",
5580                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
5581                                )
5582                            }
5583                            None => (
5584                                "release_savepoint",
5585                                "RELEASE outside transaction — no-op".to_string(),
5586                            ),
5587                        }
5588                    }
5589                    TxnControl::RollbackToSavepoint(name) => {
5590                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5591                        // Splice out the savepoint + nested ones under
5592                        // a narrow lock, then run the snapshot-manager
5593                        // + tombstone side-effects without the tx map
5594                        // held so nothing re-enters.
5595                        let drop_result: Option<(Xid, Vec<Xid>)> = {
5596                            let mut guard = self.inner.tx_contexts.write();
5597                            if let Some(ctx) = guard.get_mut(&conn_id) {
5598                                let pos = ctx
5599                                    .savepoints
5600                                    .iter()
5601                                    .position(|(n, _)| n == name)
5602                                    .ok_or_else(|| {
5603                                        RedDBError::Internal(format!(
5604                                            "savepoint {name} does not exist"
5605                                        ))
5606                                    })?;
5607                                let savepoint_xid = ctx.savepoints[pos].1;
5608                                let aborted: Vec<Xid> = ctx
5609                                    .savepoints
5610                                    .split_off(pos)
5611                                    .into_iter()
5612                                    .map(|(_, x)| x)
5613                                    .collect();
5614                                Some((savepoint_xid, aborted))
5615                            } else {
5616                                None
5617                            }
5618                        };
5619
5620                        match drop_result {
5621                            Some((savepoint_xid, aborted)) => {
5622                                for x in &aborted {
5623                                    mgr.rollback(*x);
5624                                }
5625                                let reverted_updates =
5626                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
5627                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
5628                                (
5629                                    "rollback_to_savepoint",
5630                                    format!(
5631                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
5632                                        aborted.len(),
5633                                    ),
5634                                )
5635                            }
5636                            None => (
5637                                "rollback_to_savepoint",
5638                                "ROLLBACK TO outside transaction — no-op".to_string(),
5639                            ),
5640                        }
5641                    }
5642                };
5643                Ok(RuntimeQueryResult::ok_message(
5644                    query.to_string(),
5645                    &msg,
5646                    kind,
5647                ))
5648            }
5649            // Schema + Sequence DDL (Phase 1.3 PG parity).
5650            //
5651            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5652            // just registers the name in `red_config` under `schema.{name}`.
5653            // Table lookups still happen by collection name; clients using
5654            // `schema.table` qualified names collapse to collection `schema.table`.
5655            //
5656            // Sequences persist a 64-bit counter + metadata (start, increment)
5657            // in `red_config` under `sequence.{name}.*`. Scalar callers
5658            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
5659            // once we have a proper mutating-function dispatch path; for now the
5660            // DDL just establishes the catalog entry so clients don't error.
5661            QueryExpr::CreateSchema(ref q) => {
5662                let store = self.inner.db.store();
5663                let key = format!("schema.{}", q.name);
5664                if store.get_config(&key).is_some() {
5665                    if q.if_not_exists {
5666                        return Ok(RuntimeQueryResult::ok_message(
5667                            query.to_string(),
5668                            &format!("schema {} already exists — skipped", q.name),
5669                            "create_schema",
5670                        ));
5671                    }
5672                    return Err(RedDBError::Internal(format!(
5673                        "schema {} already exists",
5674                        q.name
5675                    )));
5676                }
5677                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5678                Ok(RuntimeQueryResult::ok_message(
5679                    query.to_string(),
5680                    &format!("schema {} created", q.name),
5681                    "create_schema",
5682                ))
5683            }
5684            QueryExpr::DropSchema(ref q) => {
5685                let store = self.inner.db.store();
5686                let key = format!("schema.{}", q.name);
5687                let existed = store.get_config(&key).is_some();
5688                if !existed && !q.if_exists {
5689                    return Err(RedDBError::Internal(format!(
5690                        "schema {} does not exist",
5691                        q.name
5692                    )));
5693                }
5694                // Remove marker from red_config via set to null.
5695                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5696                let suffix = if q.cascade {
5697                    " (CASCADE accepted — tables untouched)"
5698                } else {
5699                    ""
5700                };
5701                Ok(RuntimeQueryResult::ok_message(
5702                    query.to_string(),
5703                    &format!("schema {} dropped{}", q.name, suffix),
5704                    "drop_schema",
5705                ))
5706            }
5707            QueryExpr::CreateSequence(ref q) => {
5708                let store = self.inner.db.store();
5709                let base = format!("sequence.{}", q.name);
5710                let start_key = format!("{base}.start");
5711                let incr_key = format!("{base}.increment");
5712                let curr_key = format!("{base}.current");
5713                if store.get_config(&start_key).is_some() {
5714                    if q.if_not_exists {
5715                        return Ok(RuntimeQueryResult::ok_message(
5716                            query.to_string(),
5717                            &format!("sequence {} already exists — skipped", q.name),
5718                            "create_sequence",
5719                        ));
5720                    }
5721                    return Err(RedDBError::Internal(format!(
5722                        "sequence {} already exists",
5723                        q.name
5724                    )));
5725                }
5726                // Persist start + increment, and set current so the first
5727                // nextval returns `start`.
5728                let initial_current = q.start - q.increment;
5729                store.set_config_tree(
5730                    &start_key,
5731                    &crate::serde_json::Value::Number(q.start as f64),
5732                );
5733                store.set_config_tree(
5734                    &incr_key,
5735                    &crate::serde_json::Value::Number(q.increment as f64),
5736                );
5737                store.set_config_tree(
5738                    &curr_key,
5739                    &crate::serde_json::Value::Number(initial_current as f64),
5740                );
5741                Ok(RuntimeQueryResult::ok_message(
5742                    query.to_string(),
5743                    &format!(
5744                        "sequence {} created (start={}, increment={})",
5745                        q.name, q.start, q.increment
5746                    ),
5747                    "create_sequence",
5748                ))
5749            }
5750            QueryExpr::DropSequence(ref q) => {
5751                let store = self.inner.db.store();
5752                let base = format!("sequence.{}", q.name);
5753                let existed = store.get_config(&format!("{base}.start")).is_some();
5754                if !existed && !q.if_exists {
5755                    return Err(RedDBError::Internal(format!(
5756                        "sequence {} does not exist",
5757                        q.name
5758                    )));
5759                }
5760                for k in ["start", "increment", "current"] {
5761                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5762                }
5763                Ok(RuntimeQueryResult::ok_message(
5764                    query.to_string(),
5765                    &format!("sequence {} dropped", q.name),
5766                    "drop_sequence",
5767                ))
5768            }
5769            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5770            //
5771            // The view definition is stored in-memory on RuntimeInner (not
5772            // persisted). SELECTs that reference the view name will substitute
5773            // the stored `QueryExpr` via `resolve_view_reference` during
5774            // planning (same entry point used by table-name resolution).
5775            //
5776            // Materialized views additionally allocate a slot in
5777            // `MaterializedViewCache`; a REFRESH repopulates that slot.
5778            QueryExpr::CreateView(ref q) => {
5779                let mut views = self.inner.views.write();
5780                if views.contains_key(&q.name) && !q.or_replace {
5781                    if q.if_not_exists {
5782                        return Ok(RuntimeQueryResult::ok_message(
5783                            query.to_string(),
5784                            &format!("view {} already exists — skipped", q.name),
5785                            "create_view",
5786                        ));
5787                    }
5788                    return Err(RedDBError::Internal(format!(
5789                        "view {} already exists",
5790                        q.name
5791                    )));
5792                }
5793                views.insert(q.name.clone(), Arc::new(q.clone()));
5794                drop(views);
5795
5796                // Materialized view: register cache slot (data is empty until REFRESH).
5797                if q.materialized {
5798                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5799                    let refresh = match q.refresh_every_ms {
5800                        Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
5801                        None => RefreshPolicy::Manual,
5802                    };
5803                    let dependencies = collect_table_refs(&q.query);
5804                    let def = MaterializedViewDef {
5805                        name: q.name.clone(),
5806                        query: format!("<parsed view {}>", q.name),
5807                        dependencies: dependencies.clone(),
5808                        refresh,
5809                        retention_duration_ms: q.retention_duration_ms,
5810                    };
5811                    self.inner.materialized_views.write().register(def);
5812
5813                    // Issue #593 slice 9a — persist the descriptor to
5814                    // the system catalog so the definition survives a
5815                    // restart. Upsert semantics (delete-then-insert by
5816                    // name) keep the catalog free of duplicate rows
5817                    // across `CREATE OR REPLACE` churn.
5818                    let descriptor =
5819                        crate::runtime::continuous_materialized_view::MaterializedViewDescriptor {
5820                            name: q.name.clone(),
5821                            source_sql: query.to_string(),
5822                            source_collections: dependencies,
5823                            refresh_every_ms: q.refresh_every_ms,
5824                            retention_duration_ms: q.retention_duration_ms,
5825                        };
5826                    let store = self.inner.db.store();
5827                    crate::runtime::continuous_materialized_view::persist_descriptor(
5828                        store.as_ref(),
5829                        &descriptor,
5830                    )?;
5831
5832                    // Issue #594 slice 9b — provision a Table-shaped
5833                    // backing collection named after the view. The
5834                    // rewriter skips materialized views (see
5835                    // `rewrite_view_refs_inner`) so `SELECT FROM v`
5836                    // resolves to this collection directly. Empty
5837                    // until REFRESH wires through it in 9c.
5838                    self.ensure_materialized_view_backing(&q.name)?;
5839                }
5840                // Plan cache may have cached a plan that didn't know about this
5841                // view — invalidate so future references pick up the new binding.
5842                // Result cache gets flushed too: OR REPLACE must not serve a
5843                // prior execution of the obsolete body.
5844                self.invalidate_plan_cache();
5845                self.invalidate_result_cache();
5846
5847                Ok(RuntimeQueryResult::ok_message(
5848                    query.to_string(),
5849                    &format!(
5850                        "{}view {} created",
5851                        if q.materialized { "materialized " } else { "" },
5852                        q.name
5853                    ),
5854                    "create_view",
5855                ))
5856            }
5857            QueryExpr::DropView(ref q) => {
5858                let mut views = self.inner.views.write();
5859                let removed = views.remove(&q.name);
5860                let existed = removed.is_some();
5861                let removed_materialized =
5862                    removed.as_ref().map(|v| v.materialized).unwrap_or(false);
5863                drop(views);
5864                if q.materialized || existed {
5865                    // Try the materialised cache too — silent if absent.
5866                    self.inner.materialized_views.write().remove(&q.name);
5867                    // Issue #593 slice 9a — remove any persisted
5868                    // catalog row. Idempotent: a no-op when the view
5869                    // was never materialized (no row was ever written).
5870                    let store = self.inner.db.store();
5871                    crate::runtime::continuous_materialized_view::remove_by_name(
5872                        store.as_ref(),
5873                        &q.name,
5874                    )?;
5875                }
5876                // Issue #594 slice 9b — drop the backing collection
5877                // that was provisioned at CREATE time. Only mat views
5878                // ever had one; regular views never did.
5879                if removed_materialized || q.materialized {
5880                    self.drop_materialized_view_backing(&q.name)?;
5881                }
5882                // Drop any plan / result cache entries that baked the
5883                // view body into their QueryExpr.
5884                self.invalidate_plan_cache();
5885                self.invalidate_result_cache();
5886                if !existed && !q.if_exists {
5887                    return Err(RedDBError::Internal(format!(
5888                        "view {} does not exist",
5889                        q.name
5890                    )));
5891                }
5892                self.invalidate_plan_cache();
5893                Ok(RuntimeQueryResult::ok_message(
5894                    query.to_string(),
5895                    &format!("view {} dropped", q.name),
5896                    "drop_view",
5897                ))
5898            }
5899            QueryExpr::RefreshMaterializedView(ref q) => {
5900                // Look up the view definition, execute its underlying query,
5901                // and stash the serialized result in the materialised cache.
5902                let view = {
5903                    let views = self.inner.views.read();
5904                    views.get(&q.name).cloned()
5905                };
5906                let view = match view {
5907                    Some(v) => v,
5908                    None => {
5909                        return Err(RedDBError::Internal(format!(
5910                            "view {} does not exist",
5911                            q.name
5912                        )))
5913                    }
5914                };
5915                if !view.materialized {
5916                    return Err(RedDBError::Internal(format!(
5917                        "view {} is not materialized — REFRESH requires \
5918                         CREATE MATERIALIZED VIEW",
5919                        q.name
5920                    )));
5921                }
5922                // Execute the underlying query fresh.
5923                let started = std::time::Instant::now();
5924                let now_ms = std::time::SystemTime::now()
5925                    .duration_since(std::time::UNIX_EPOCH)
5926                    .map(|d| d.as_millis() as u64)
5927                    .unwrap_or(0);
5928                match self.execute_query_expr((*view.query).clone()) {
5929                    Ok(inner_result) => {
5930                        // Issue #595 slice 9c — atomically replace the
5931                        // backing collection's contents under a single
5932                        // WAL group. Concurrent SELECT from the view
5933                        // sees either the prior or new contents, never
5934                        // partial. A crash before the WAL commit lands
5935                        // leaves the prior contents intact on recovery.
5936                        let entities =
5937                            view_records_to_entities(&q.name, &inner_result.result.records);
5938                        let row_count = entities.len() as u64;
5939                        let store = self.inner.db.store();
5940                        let serialized_records = match store.refresh_collection(&q.name, entities) {
5941                            Ok(records) => records,
5942                            Err(err) => {
5943                                let duration_ms = started.elapsed().as_millis() as u64;
5944                                let msg = err.to_string();
5945                                self.inner
5946                                    .materialized_views
5947                                    .write()
5948                                    .record_refresh_failure(
5949                                        &q.name,
5950                                        msg.clone(),
5951                                        duration_ms,
5952                                        now_ms,
5953                                    );
5954                                return Err(RedDBError::Internal(format!(
5955                                    "REFRESH MATERIALIZED VIEW {}: {msg}",
5956                                    q.name
5957                                )));
5958                            }
5959                        };
5960
5961                        // Issue #596 slice 9d — emit a Refresh
5962                        // ChangeRecord into the logical-WAL spool so
5963                        // replicas deterministically replay the same
5964                        // backing-collection contents via
5965                        // `LogicalChangeApplier::apply_record`.
5966                        if let Some(ref primary) = self.inner.db.replication {
5967                            let lsn = self.inner.cdc.emit(
5968                                crate::replication::cdc::ChangeOperation::Refresh,
5969                                &q.name,
5970                                0,
5971                                "refresh",
5972                            );
5973                            self.invalidate_result_cache_for_table(&q.name);
5974                            let timestamp = std::time::SystemTime::now()
5975                                .duration_since(std::time::UNIX_EPOCH)
5976                                .unwrap_or_default()
5977                                .as_millis() as u64;
5978                            let record = ChangeRecord::for_refresh(
5979                                lsn,
5980                                timestamp,
5981                                q.name.clone(),
5982                                serialized_records,
5983                            );
5984                            let encoded = record.encode();
5985                            primary.wal_buffer.append(record.lsn, encoded.clone());
5986                            if let Some(spool) = &primary.logical_wal_spool {
5987                                let _ = spool.append(record.lsn, &encoded);
5988                            }
5989                        }
5990
5991                        let duration_ms = started.elapsed().as_millis() as u64;
5992                        let serialized = format!("{:?}", inner_result.result);
5993                        self.inner
5994                            .materialized_views
5995                            .write()
5996                            .record_refresh_success(
5997                                &q.name,
5998                                serialized.into_bytes(),
5999                                row_count,
6000                                duration_ms,
6001                                now_ms,
6002                            );
6003                        // SELECT FROM v now reads through the rewriter
6004                        // skip into the backing collection — drop the
6005                        // result cache so prior empty-backing reads
6006                        // don't shadow the new contents.
6007                        self.invalidate_result_cache();
6008                        Ok(RuntimeQueryResult::ok_message(
6009                            query.to_string(),
6010                            &format!("materialized view {} refreshed", q.name),
6011                            "refresh_materialized_view",
6012                        ))
6013                    }
6014                    Err(err) => {
6015                        let duration_ms = started.elapsed().as_millis() as u64;
6016                        let msg = err.to_string();
6017                        self.inner
6018                            .materialized_views
6019                            .write()
6020                            .record_refresh_failure(&q.name, msg.clone(), duration_ms, now_ms);
6021                        Err(err)
6022                    }
6023                }
6024            }
6025            // Row Level Security (Phase 2.5 PG parity).
6026            //
6027            // Policies live in an in-memory registry keyed by (table, name).
6028            // Enforcement (AND-ing the policy's USING clause into every
6029            // query's WHERE for the table) arrives in Phase 2.5.2 via the
6030            // filter compiler; this dispatch only manages the catalog.
6031            QueryExpr::CreatePolicy(ref q) => {
6032                let key = (q.table.clone(), q.name.clone());
6033                self.inner
6034                    .rls_policies
6035                    .write()
6036                    .insert(key, Arc::new(q.clone()));
6037                self.invalidate_plan_cache();
6038                // Issue #120 — surface policy names in the
6039                // schema-vocabulary so AskPipeline (#121) can resolve
6040                // a policy reference back to its table.
6041                self.schema_vocabulary_apply(
6042                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
6043                        collection: q.table.clone(),
6044                        policy: q.name.clone(),
6045                    },
6046                );
6047                Ok(RuntimeQueryResult::ok_message(
6048                    query.to_string(),
6049                    &format!("policy {} on {} created", q.name, q.table),
6050                    "create_policy",
6051                ))
6052            }
6053            QueryExpr::DropPolicy(ref q) => {
6054                let removed = self
6055                    .inner
6056                    .rls_policies
6057                    .write()
6058                    .remove(&(q.table.clone(), q.name.clone()))
6059                    .is_some();
6060                if !removed && !q.if_exists {
6061                    return Err(RedDBError::Internal(format!(
6062                        "policy {} on {} does not exist",
6063                        q.name, q.table
6064                    )));
6065                }
6066                self.invalidate_plan_cache();
6067                // Issue #120 — keep the schema-vocabulary policy
6068                // entry in sync.
6069                self.schema_vocabulary_apply(
6070                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
6071                        collection: q.table.clone(),
6072                        policy: q.name.clone(),
6073                    },
6074                );
6075                Ok(RuntimeQueryResult::ok_message(
6076                    query.to_string(),
6077                    &format!("policy {} on {} dropped", q.name, q.table),
6078                    "drop_policy",
6079                ))
6080            }
6081            // Foreign Data Wrappers (Phase 3.2 PG parity).
6082            //
6083            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
6084            // `ForeignTableRegistry`. The read path consults that registry
6085            // before dispatching a SELECT — when the table name matches a
6086            // registered foreign table, we forward the scan to the wrapper
6087            // and skip the normal collection lookup.
6088            //
6089            // Phase 3.2 is in-memory only; persistence across restarts is a
6090            // 3.2.2 follow-up that mirrors the view registry pattern.
6091            QueryExpr::CreateServer(ref q) => {
6092                use crate::storage::fdw::FdwOptions;
6093                let registry = Arc::clone(&self.inner.foreign_tables);
6094                if registry.server(&q.name).is_some() {
6095                    if q.if_not_exists {
6096                        return Ok(RuntimeQueryResult::ok_message(
6097                            query.to_string(),
6098                            &format!("server {} already exists — skipped", q.name),
6099                            "create_server",
6100                        ));
6101                    }
6102                    return Err(RedDBError::Internal(format!(
6103                        "server {} already exists",
6104                        q.name
6105                    )));
6106                }
6107                let mut opts = FdwOptions::new();
6108                for (k, v) in &q.options {
6109                    opts.values.insert(k.clone(), v.clone());
6110                }
6111                registry
6112                    .create_server(&q.name, &q.wrapper, opts)
6113                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
6114                Ok(RuntimeQueryResult::ok_message(
6115                    query.to_string(),
6116                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
6117                    "create_server",
6118                ))
6119            }
6120            QueryExpr::DropServer(ref q) => {
6121                let existed = self.inner.foreign_tables.drop_server(&q.name);
6122                if !existed && !q.if_exists {
6123                    return Err(RedDBError::Internal(format!(
6124                        "server {} does not exist",
6125                        q.name
6126                    )));
6127                }
6128                Ok(RuntimeQueryResult::ok_message(
6129                    query.to_string(),
6130                    &format!(
6131                        "server {} dropped{}",
6132                        q.name,
6133                        if q.cascade { " (cascade)" } else { "" }
6134                    ),
6135                    "drop_server",
6136                ))
6137            }
6138            QueryExpr::CreateForeignTable(ref q) => {
6139                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
6140                let registry = Arc::clone(&self.inner.foreign_tables);
6141                if registry.foreign_table(&q.name).is_some() {
6142                    if q.if_not_exists {
6143                        return Ok(RuntimeQueryResult::ok_message(
6144                            query.to_string(),
6145                            &format!("foreign table {} already exists — skipped", q.name),
6146                            "create_foreign_table",
6147                        ));
6148                    }
6149                    return Err(RedDBError::Internal(format!(
6150                        "foreign table {} already exists",
6151                        q.name
6152                    )));
6153                }
6154                let mut opts = FdwOptions::new();
6155                for (k, v) in &q.options {
6156                    opts.values.insert(k.clone(), v.clone());
6157                }
6158                let columns: Vec<ForeignColumn> = q
6159                    .columns
6160                    .iter()
6161                    .map(|c| ForeignColumn {
6162                        name: c.name.clone(),
6163                        data_type: c.data_type.clone(),
6164                        not_null: c.not_null,
6165                    })
6166                    .collect();
6167                registry
6168                    .create_foreign_table(ForeignTable {
6169                        name: q.name.clone(),
6170                        server_name: q.server.clone(),
6171                        columns,
6172                        options: opts,
6173                    })
6174                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
6175                self.invalidate_plan_cache();
6176                Ok(RuntimeQueryResult::ok_message(
6177                    query.to_string(),
6178                    &format!("foreign table {} created (server {})", q.name, q.server),
6179                    "create_foreign_table",
6180                ))
6181            }
6182            QueryExpr::DropForeignTable(ref q) => {
6183                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
6184                if !existed && !q.if_exists {
6185                    return Err(RedDBError::Internal(format!(
6186                        "foreign table {} does not exist",
6187                        q.name
6188                    )));
6189                }
6190                self.invalidate_plan_cache();
6191                Ok(RuntimeQueryResult::ok_message(
6192                    query.to_string(),
6193                    &format!("foreign table {} dropped", q.name),
6194                    "drop_foreign_table",
6195                ))
6196            }
6197            // COPY table FROM 'path' (Phase 1.5 PG parity).
6198            //
6199            // Stream CSV rows through the shared `CsvImporter`. The collection
6200            // is auto-created on first insert (via `insert_auto`-style path);
6201            // VACUUM/ANALYZE afterwards is up to the caller.
6202            QueryExpr::CopyFrom(ref q) => {
6203                use crate::storage::import::{CsvConfig, CsvImporter};
6204                let store = self.inner.db.store();
6205                let cfg = CsvConfig {
6206                    collection: q.table.clone(),
6207                    has_header: q.has_header,
6208                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
6209                    ..CsvConfig::default()
6210                };
6211                let importer = CsvImporter::new(cfg);
6212                let stats = importer
6213                    .import_file(&q.path, store.as_ref())
6214                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
6215                // Tables are written → invalidate cached plans / result cache.
6216                self.note_table_write(&q.table);
6217                Ok(RuntimeQueryResult::ok_message(
6218                    query.to_string(),
6219                    &format!(
6220                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
6221                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
6222                    ),
6223                    "copy_from",
6224                ))
6225            }
6226            // Maintenance commands (Phase 1.2 PG parity).
6227            //
6228            // - VACUUM [FULL] [table]: refreshes planner stats for the target
6229            //   collection(s) and — when FULL — triggers a full pager persist
6230            //   (flushes dirty pages + fsync). Also invalidates the result cache
6231            //   so subsequent reads re-execute against the freshly compacted
6232            //   storage. RedDB's segment/btree GC runs continuously via the
6233            //   background lifecycle; explicit space reclamation for sealed
6234            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
6235            // - ANALYZE [table]: reruns `analyze_collection` +
6236            //   `persist_table_stats` via `refresh_table_planner_stats` so the
6237            //   planner has fresh histograms, distinct estimates, null counts.
6238            //
6239            // Both commands accept an optional target; omitting the target
6240            // iterates every collection in the store.
6241            QueryExpr::MaintenanceCommand(ref cmd) => {
6242                use crate::storage::query::ast::MaintenanceCommand as Mc;
6243                let store = self.inner.db.store();
6244                let (kind, msg) = match cmd {
6245                    Mc::Analyze { target } => {
6246                        let targets: Vec<String> = match target {
6247                            Some(t) => vec![t.clone()],
6248                            None => store.list_collections(),
6249                        };
6250                        for t in &targets {
6251                            self.refresh_table_planner_stats(t);
6252                        }
6253                        (
6254                            "analyze",
6255                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
6256                        )
6257                    }
6258                    Mc::Vacuum { target, full } => {
6259                        let targets: Vec<String> = match target {
6260                            Some(t) => vec![t.clone()],
6261                            None => store.list_collections(),
6262                        };
6263                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
6264                        let mut vacuum_stats =
6265                            crate::storage::unified::store::MvccVacuumStats::default();
6266                        for t in &targets {
6267                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
6268                                RedDBError::Internal(format!(
6269                                    "VACUUM MVCC history failed for {t}: {e}"
6270                                ))
6271                            })?;
6272                            if stats.reclaimed_versions > 0 {
6273                                self.rebuild_runtime_indexes_for_table(t)?;
6274                            }
6275                            vacuum_stats.add(&stats);
6276                        }
6277                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
6278                        // Stats refresh covers every target (same as ANALYZE).
6279                        for t in &targets {
6280                            self.refresh_table_planner_stats(t);
6281                        }
6282                        // FULL forces a pager persist (dirty-page flush + fsync).
6283                        // Regular VACUUM relies on the background writer / segment
6284                        // lifecycle so the command is non-blocking.
6285                        let persisted = if *full {
6286                            match store.persist() {
6287                                Ok(()) => true,
6288                                Err(e) => {
6289                                    return Err(RedDBError::Internal(format!(
6290                                        "VACUUM FULL persist failed: {e:?}"
6291                                    )));
6292                                }
6293                            }
6294                        } else {
6295                            false
6296                        };
6297                        // Result cache depended on pre-vacuum state.
6298                        self.invalidate_result_cache();
6299                        (
6300                            "vacuum",
6301                            format!(
6302                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
6303                                if *full { " FULL" } else { "" },
6304                                targets.len(),
6305                                vacuum_stats.scanned_versions,
6306                                vacuum_stats.retained_versions,
6307                                vacuum_stats.reclaimed_versions,
6308                                vacuum_stats.retained_history_versions,
6309                                vacuum_stats.reclaimed_history_versions,
6310                                vacuum_stats.retained_tombstones,
6311                                vacuum_stats.reclaimed_tombstones,
6312                                if persisted {
6313                                    " (pages flushed to disk)"
6314                                } else {
6315                                    ""
6316                                }
6317                            ),
6318                        )
6319                    }
6320                };
6321                Ok(RuntimeQueryResult::ok_message(
6322                    query.to_string(),
6323                    &msg,
6324                    kind,
6325                ))
6326            }
6327            // GRANT / REVOKE / ALTER USER (RBAC milestone).
6328            //
6329            // These hit the AuthStore directly. The privilege-check
6330            // gate at the top of `execute_query_expr` already decided
6331            // whether the caller may even run the statement; here we
6332            // just translate the AST into AuthStore calls.
6333            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
6334            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
6335            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
6336            QueryExpr::CreateIamPolicy { ref id, ref json } => {
6337                self.execute_create_iam_policy(query, id, json)
6338            }
6339            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
6340            QueryExpr::AttachPolicy {
6341                ref policy_id,
6342                ref principal,
6343            } => self.execute_attach_policy(query, policy_id, principal),
6344            QueryExpr::DetachPolicy {
6345                ref policy_id,
6346                ref principal,
6347            } => self.execute_detach_policy(query, policy_id, principal),
6348            QueryExpr::ShowPolicies { ref filter } => {
6349                self.execute_show_policies(query, filter.as_ref())
6350            }
6351            QueryExpr::ShowEffectivePermissions {
6352                ref user,
6353                ref resource,
6354            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
6355            QueryExpr::SimulatePolicy {
6356                ref user,
6357                ref action,
6358                ref resource,
6359            } => self.execute_simulate_policy(query, user, action, resource),
6360            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
6361            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
6362            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
6363            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
6364        };
6365
6366        // Decrypt Value::Secret columns in-place before caching, so
6367        // cached results match the post-decrypt shape and repeat
6368        // queries skip the per-row AES-GCM pass.
6369        let mut query_result = query_result;
6370        if let Ok(ref mut result) = query_result {
6371            if result.statement_type == "select" {
6372                self.apply_secret_decryption(result);
6373            }
6374        }
6375
6376        // Cache SELECT results for 30s.
6377        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
6378        // Large multi-row results (range scans, filtered scans) are rarely
6379        // repeated with the same literal values so the cache hit rate is near
6380        // zero while the clone cost (100 records × ~16 fields each) is high.
6381        // Aggregations (1 row) and point lookups (1 row) still benefit.
6382        if let Ok(ref result) = query_result {
6383            frame.write_result_cache(self, result, result_cache_scopes);
6384        }
6385
6386        query_result
6387    }
6388
6389    /// Snapshot of every registered materialized view's runtime
6390    /// state — feeds the `red.materialized_views` virtual table.
6391    /// Issue #583 slice 10.
6392    pub fn materialized_view_metadata(
6393        &self,
6394    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
6395        // Issue #595 slice 9c — `current_row_count` is now scraped
6396        // live from the backing collection rather than read from the
6397        // cache slot. Mirrors the slice-10 invariant on
6398        // `queue_pending_gauge` in #527: the live store is the source
6399        // of truth, the cache slot only carries last-refresh telemetry
6400        // (timing, error, refresh cadence).
6401        let store = self.inner.db.store();
6402        let mut entries = self.inner.materialized_views.read().metadata();
6403        for entry in &mut entries {
6404            if let Some(manager) = store.get_collection(&entry.name) {
6405                entry.current_row_count = manager.count() as u64;
6406            }
6407        }
6408        entries
6409    }
6410
6411    /// Drive scheduled refreshes for materialized views with a
6412    /// `REFRESH EVERY <duration>` clause. Called from the background
6413    /// scheduler thread (and from unit tests with a fake clock via
6414    /// `claim_due_at`). Each invocation atomically claims the set of
6415    /// due views (so two concurrent ticks never double-fire the same
6416    /// view) and runs each refresh through the standard execution
6417    /// path — failures are captured in `last_error` and the prior
6418    /// content stays intact. Issue #583 slice 10.
6419    /// Snapshot of every tracked retention sweeper state — feeds the
6420    /// three extra columns on `red.retention`. Issue #584 slice 12.
6421    pub(crate) fn retention_sweeper_snapshot(
6422        &self,
6423    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
6424        self.inner.retention_sweeper.read().snapshot()
6425    }
6426
6427    /// Drive one tick of the retention sweeper. Iterates collections
6428    /// with a retention policy set, physically deletes at most
6429    /// `batch_size` expired rows per collection, and records the
6430    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
6431    /// `red.retention` exposes. Called from the background sweeper
6432    /// thread; safe to invoke directly from tests with a small batch
6433    /// size to drain rows deterministically. Issue #584 slice 12.
6434    ///
6435    /// Deletes are issued as `DELETE FROM <collection> WHERE
6436    /// <ts_column> < <cutoff>` through the standard `execute_query`
6437    /// chokepoint so WAL participation and snapshot guards apply
6438    /// exactly as for a user-issued DELETE — replicas replay the
6439    /// sweeper's deletes via the same WAL stream with no special
6440    /// handling on the replication side.
6441    ///
6442    /// Batching is enforced by tightening the cutoff: if more than
6443    /// `batch_size` rows are expired, the cutoff is dropped to the
6444    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
6445    /// matches roughly `batch_size` rows; the remainder is reported
6446    /// as `current_rows_pending_sweep_estimate` and drained on the
6447    /// next tick.
6448    pub fn sweep_retention_tick(&self, batch_size: usize) {
6449        if batch_size == 0 {
6450            return;
6451        }
6452        let now_ms = std::time::SystemTime::now()
6453            .duration_since(std::time::UNIX_EPOCH)
6454            .map(|d| d.as_millis() as u64)
6455            .unwrap_or(0);
6456
6457        let store = self.inner.db.store();
6458        let collections = store.list_collections();
6459        for name in collections {
6460            let Some(contract) = self.inner.db.collection_contract(&name) else {
6461                continue;
6462            };
6463            let Some(retention_ms) = contract.retention_duration_ms else {
6464                continue;
6465            };
6466            let Some(ts_column) =
6467                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
6468            else {
6469                continue;
6470            };
6471            let Some(manager) = store.get_collection(&name) else {
6472                continue;
6473            };
6474            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
6475
6476            // Single pass: collect expired timestamps. We keep the
6477            // full Vec rather than a bounded heap because the partial
6478            // sort below is the simplest correct way to find the
6479            // batch-th oldest; for the slice's "1000-row default
6480            // batch" target this is bounded enough for production
6481            // operation, and the alternative (in-place heap of size
6482            // batch+1) is a follow-up optimisation.
6483            let mut expired_ts: Vec<i64> = Vec::new();
6484            manager.for_each_entity(|entity| {
6485                let ts = match ts_column.as_str() {
6486                    "created_at" => Some(entity.created_at as i64),
6487                    "updated_at" => Some(entity.updated_at as i64),
6488                    other => entity
6489                        .data
6490                        .as_row()
6491                        .and_then(|row| row.get_field(other))
6492                        .and_then(|v| match v {
6493                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
6494                            crate::storage::schema::Value::Timestamp(t) => {
6495                                Some(t.saturating_mul(1_000))
6496                            }
6497                            crate::storage::schema::Value::BigInt(t) => Some(*t),
6498                            crate::storage::schema::Value::UnsignedInteger(t) => {
6499                                i64::try_from(*t).ok()
6500                            }
6501                            crate::storage::schema::Value::Integer(t) => Some(*t),
6502                            _ => None,
6503                        }),
6504                };
6505                if let Some(t) = ts {
6506                    if t < cutoff {
6507                        expired_ts.push(t);
6508                    }
6509                }
6510                true
6511            });
6512
6513            let total_expired = expired_ts.len() as u64;
6514            if total_expired == 0 {
6515                self.inner
6516                    .retention_sweeper
6517                    .write()
6518                    .record_tick(&name, 0, 0, now_ms);
6519                continue;
6520            }
6521
6522            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
6523                (cutoff, 0u64)
6524            } else {
6525                // Tighten the cutoff to the (batch_size)-th oldest
6526                // expired timestamp + 1 so DELETE matches roughly
6527                // `batch_size` rows.
6528                expired_ts.sort_unstable();
6529                let nth = expired_ts[batch_size - 1];
6530                (
6531                    nth.saturating_add(1),
6532                    total_expired.saturating_sub(batch_size as u64),
6533                )
6534            };
6535
6536            let stmt = format!(
6537                "DELETE FROM {} WHERE {} < {}",
6538                name, ts_column, effective_cutoff
6539            );
6540            let deleted = match self.execute_query(&stmt) {
6541                Ok(r) => r.affected_rows,
6542                Err(_) => 0,
6543            };
6544
6545            self.inner
6546                .retention_sweeper
6547                .write()
6548                .record_tick(&name, deleted, pending, now_ms);
6549        }
6550    }
6551
6552    pub fn refresh_due_materialized_views(&self) {
6553        let due = {
6554            let mut cache = self.inner.materialized_views.write();
6555            cache.claim_due_at(std::time::Instant::now())
6556        };
6557        for name in due {
6558            // Round-trip through `execute_query` (rather than the
6559            // prepared-statement `execute_query_expr` fast path, which
6560            // explicitly rejects DDL/maintenance statements). Failures
6561            // are captured inside the RefreshMaterializedView handler
6562            // via `record_refresh_failure`; the scheduler ignores the
6563            // Result so one bad view doesn't halt the loop.
6564            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
6565            let _ = self.execute_query(&stmt);
6566        }
6567    }
6568
6569    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
6570    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
6571    /// calls pay zero parse + cache overhead.
6572    ///
6573    /// Applies secret decryption on SELECT results, identical to `execute_query`.
6574    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
6575        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
6576        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
6577        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
6578        // whose `tq.table` matches a registered view with the view's
6579        // underlying query. Safe to call even when no views are registered.
6580        let expr = self.rewrite_view_refs(expr);
6581
6582        self.validate_model_operations_before_auth(&expr)?;
6583        // Granular RBAC privilege check. Runs before dispatch so a
6584        // denied caller never reaches storage. Fail-closed: any error
6585        // resolving the action / resource produces PermissionDenied.
6586        if let Err(err) = self.check_query_privilege(&expr) {
6587            return Err(RedDBError::Query(format!("permission denied: {err}")));
6588        }
6589
6590        let statement = query_expr_name(&expr);
6591        let mode = detect_mode(statement);
6592        let query_str = statement;
6593
6594        let result = self.dispatch_expr(expr, query_str, mode)?;
6595        let mut r = result;
6596        if r.statement_type == "select" {
6597            self.apply_secret_decryption(&mut r);
6598        }
6599        Ok(r)
6600    }
6601
6602    pub(super) fn validate_model_operations_before_auth(
6603        &self,
6604        expr: &QueryExpr,
6605    ) -> RedDBResult<()> {
6606        use crate::catalog::CollectionModel;
6607        use crate::runtime::ddl::polymorphic_resolver;
6608        use crate::storage::query::ast::KvCommand;
6609
6610        let system_schema_target = match expr {
6611            QueryExpr::DropTable(q) => Some(q.name.as_str()),
6612            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
6613            QueryExpr::DropVector(q) => Some(q.name.as_str()),
6614            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
6615            QueryExpr::DropKv(q) => Some(q.name.as_str()),
6616            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
6617            QueryExpr::Truncate(q) => Some(q.name.as_str()),
6618            _ => None,
6619        };
6620        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
6621            return Err(RedDBError::Query("system schema is read-only".to_string()));
6622        }
6623
6624        let expected = match expr {
6625            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
6626            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
6627            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
6628            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
6629            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
6630            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
6631            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
6632            QueryExpr::KvCommand(cmd) => {
6633                let (collection, model) = match cmd {
6634                    KvCommand::Put {
6635                        collection, model, ..
6636                    }
6637                    | KvCommand::Get {
6638                        collection, model, ..
6639                    }
6640                    | KvCommand::Incr {
6641                        collection, model, ..
6642                    }
6643                    | KvCommand::Cas {
6644                        collection, model, ..
6645                    }
6646                    | KvCommand::Delete {
6647                        collection, model, ..
6648                    } => (collection.as_str(), *model),
6649                    KvCommand::Rotate { collection, .. }
6650                    | KvCommand::History { collection, .. }
6651                    | KvCommand::List { collection, .. }
6652                    | KvCommand::Purge { collection, .. } => {
6653                        (collection.as_str(), CollectionModel::Vault)
6654                    }
6655                    KvCommand::InvalidateTags { collection, .. } => {
6656                        (collection.as_str(), CollectionModel::Kv)
6657                    }
6658                    KvCommand::Watch {
6659                        collection, model, ..
6660                    } => (collection.as_str(), *model),
6661                    KvCommand::Unseal { collection, .. } => {
6662                        (collection.as_str(), CollectionModel::Vault)
6663                    }
6664                };
6665                Some((collection, model))
6666            }
6667            QueryExpr::ConfigCommand(cmd) => {
6668                self.validate_config_command_before_auth(cmd)?;
6669                None
6670            }
6671            _ => None,
6672        };
6673
6674        let Some((name, expected_model)) = expected else {
6675            return Ok(());
6676        };
6677        let snapshot = self.inner.db.catalog_model_snapshot();
6678        let Some(actual_model) = snapshot
6679            .collections
6680            .iter()
6681            .find(|collection| collection.name == name)
6682            .map(|collection| collection.declared_model.unwrap_or(collection.model))
6683        else {
6684            return Ok(());
6685        };
6686        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
6687    }
6688
6689    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
6690    /// `tq.table` matches a registered view name with the view's stored
6691    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
6692    /// resolves correctly. Pure operation — no side effects.
6693    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
6694        // Fast path: no views registered → return original expression.
6695        if self.inner.views.read().is_empty() {
6696            return expr;
6697        }
6698        self.rewrite_view_refs_inner(expr)
6699    }
6700
6701    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
6702        use crate::storage::query::ast::{Filter, TableSource};
6703        match expr {
6704            QueryExpr::Table(mut tq) => {
6705                // 1. If the TableSource is a subquery, recurse into it so
6706                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
6707                //    The legacy `table` field (set to a synthetic
6708                //    "__subq_NNNN" sentinel) stays as-is so callers that
6709                //    read it keep compiling.
6710                if let Some(TableSource::Subquery(body)) = tq.source.take() {
6711                    tq.source = Some(TableSource::Subquery(Box::new(
6712                        self.rewrite_view_refs_inner(*body),
6713                    )));
6714                    return QueryExpr::Table(tq);
6715                }
6716
6717                // 2. Restore the source field (took it above for match).
6718                // When the source was `None` or `TableSource::Name(_)`, the
6719                // real lookup key is `tq.table` — check the view registry.
6720                let maybe_view = {
6721                    let views = self.inner.views.read();
6722                    views.get(&tq.table).cloned()
6723                };
6724                let Some(view) = maybe_view else {
6725                    return QueryExpr::Table(tq);
6726                };
6727
6728                // Issue #594 slice 9b — materialized views are read
6729                // from their backing collection, not by substituting
6730                // the body. Returning the TableQuery as-is lets the
6731                // normal table-read path resolve `SELECT FROM v`
6732                // against the collection provisioned at CREATE time.
6733                if view.materialized {
6734                    return QueryExpr::Table(tq);
6735                }
6736
6737                // Recurse into the view body — views may reference other
6738                // views. The recursion yields the final QueryExpr we need
6739                // to merge the outer's filter / limit / offset into.
6740                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
6741
6742                // Phase 5: when the body is a Table we merge the outer
6743                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
6744                // views filter recursively. Non-table bodies (Search,
6745                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
6746                // with an outer Table query today — return the body
6747                // verbatim; outer predicates are lost. Full projection
6748                // merge lands in Phase 5.2.
6749                match inner_expr {
6750                    QueryExpr::Table(mut inner_tq) => {
6751                        if let Some(outer_filter) = tq.filter.take() {
6752                            inner_tq.filter = Some(match inner_tq.filter.take() {
6753                                Some(existing) => {
6754                                    Filter::And(Box::new(existing), Box::new(outer_filter))
6755                                }
6756                                None => outer_filter,
6757                            });
6758                            // Keep the `Expr` form in lock-step with the
6759                            // merged `Filter`. The executor prefers
6760                            // `where_expr` and nulls `filter` when it is
6761                            // present (see `execute_query_inner`), so a
6762                            // stacked view whose outer predicate was only
6763                            // merged into `filter` would silently drop that
6764                            // predicate at eval time (#635).
6765                            inner_tq.where_expr = inner_tq
6766                                .filter
6767                                .as_ref()
6768                                .map(crate::storage::query::sql_lowering::filter_to_expr);
6769                        }
6770                        if let Some(outer_limit) = tq.limit {
6771                            inner_tq.limit = Some(match inner_tq.limit {
6772                                Some(existing) => existing.min(outer_limit),
6773                                None => outer_limit,
6774                            });
6775                        }
6776                        if let Some(outer_offset) = tq.offset {
6777                            inner_tq.offset = Some(match inner_tq.offset {
6778                                Some(existing) => existing + outer_offset,
6779                                None => outer_offset,
6780                            });
6781                        }
6782                        QueryExpr::Table(inner_tq)
6783                    }
6784                    other => other,
6785                }
6786            }
6787            QueryExpr::Join(mut jq) => {
6788                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
6789                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
6790                QueryExpr::Join(jq)
6791            }
6792            // Other variants don't carry nested QueryExpr that can reference
6793            // a view by table name. Return as-is.
6794            other => other,
6795        }
6796    }
6797
6798    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
6799    /// Shared by `execute_query` (after parse/cache) and `execute_query_expr`
6800    /// (direct call from prepared-statement handler).
6801    fn authorize_relational_table_select(
6802        &self,
6803        mut table: TableQuery,
6804        frame: &dyn super::statement_frame::ReadFrame,
6805    ) -> RedDBResult<Option<TableQuery>> {
6806        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6807            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6808            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6809            return Ok(Some(table));
6810        }
6811
6812        self.check_table_column_projection_authz(&table, frame)?;
6813
6814        if self.inner.rls_enabled_tables.read().contains(&table.table) {
6815            return Ok(inject_rls_filters(self, frame, table));
6816        }
6817
6818        Ok(Some(table))
6819    }
6820
6821    fn authorize_relational_join_select(
6822        &self,
6823        mut join: JoinQuery,
6824        frame: &dyn super::statement_frame::ReadFrame,
6825    ) -> RedDBResult<Option<JoinQuery>> {
6826        self.check_join_column_projection_authz(&join, frame)?;
6827        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
6828        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
6829        Ok(inject_rls_into_join(self, frame, join))
6830    }
6831
6832    fn authorize_relational_join_child(
6833        &self,
6834        expr: QueryExpr,
6835        frame: &dyn super::statement_frame::ReadFrame,
6836    ) -> RedDBResult<QueryExpr> {
6837        match expr {
6838            QueryExpr::Table(mut table) => {
6839                if let Some(TableSource::Subquery(inner)) = table.source.take() {
6840                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6841                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6842                }
6843                Ok(QueryExpr::Table(table))
6844            }
6845            QueryExpr::Join(join) => self
6846                .authorize_relational_join_select(join, frame)?
6847                .map(QueryExpr::Join)
6848                .ok_or_else(|| {
6849                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6850                }),
6851            other => Ok(other),
6852        }
6853    }
6854
6855    fn authorize_relational_select_expr(
6856        &self,
6857        expr: QueryExpr,
6858        frame: &dyn super::statement_frame::ReadFrame,
6859    ) -> RedDBResult<QueryExpr> {
6860        match expr {
6861            QueryExpr::Table(table) => self
6862                .authorize_relational_table_select(table, frame)?
6863                .map(QueryExpr::Table)
6864                .ok_or_else(|| {
6865                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6866                }),
6867            QueryExpr::Join(join) => self
6868                .authorize_relational_join_select(join, frame)?
6869                .map(QueryExpr::Join)
6870                .ok_or_else(|| {
6871                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6872                }),
6873            other => Ok(other),
6874        }
6875    }
6876
6877    fn check_table_column_projection_authz(
6878        &self,
6879        table: &TableQuery,
6880        frame: &dyn super::statement_frame::ReadFrame,
6881    ) -> RedDBResult<()> {
6882        let Some((username, role)) = frame.identity() else {
6883            return Ok(());
6884        };
6885        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6886            return Ok(());
6887        };
6888
6889        let columns = self.resolved_table_projection_columns(table)?;
6890        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6891        let principal = UserId::from_parts(frame.effective_scope(), username);
6892        let ctx = runtime_iam_context(
6893            role,
6894            frame.effective_scope(),
6895            auth_store.principal_is_system_owned(&principal),
6896        );
6897        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6898        if outcome.allowed() {
6899            return Ok(());
6900        }
6901
6902        if let Some(denied) = outcome.first_denied_column() {
6903            return Err(RedDBError::Query(format!(
6904                "permission denied: principal=`{username}` cannot select column `{}`",
6905                denied.resource.name
6906            )));
6907        }
6908        Err(RedDBError::Query(format!(
6909            "permission denied: principal=`{username}` cannot select table `{}`",
6910            table.table
6911        )))
6912    }
6913
6914    fn check_join_column_projection_authz(
6915        &self,
6916        join: &JoinQuery,
6917        frame: &dyn super::statement_frame::ReadFrame,
6918    ) -> RedDBResult<()> {
6919        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6920        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6921        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6922
6923        for (table, columns) in by_table {
6924            let query = TableQuery {
6925                table,
6926                source: None,
6927                alias: None,
6928                select_items: Vec::new(),
6929                columns: columns.into_iter().map(Projection::Column).collect(),
6930                where_expr: None,
6931                filter: None,
6932                group_by_exprs: Vec::new(),
6933                group_by: Vec::new(),
6934                having_expr: None,
6935                having: None,
6936                order_by: Vec::new(),
6937                limit: None,
6938                limit_param: None,
6939                offset: None,
6940                offset_param: None,
6941                expand: None,
6942                as_of: None,
6943                sessionize: None,
6944            };
6945            self.check_table_column_projection_authz(&query, frame)?;
6946        }
6947        Ok(())
6948    }
6949
6950    fn collect_join_projection_columns(
6951        &self,
6952        join: &JoinQuery,
6953        projections: &[Projection],
6954        out: &mut HashMap<String, BTreeSet<String>>,
6955    ) -> RedDBResult<()> {
6956        let left = table_side_context(join.left.as_ref());
6957        let right = table_side_context(join.right.as_ref());
6958
6959        if projections
6960            .iter()
6961            .any(|projection| matches!(projection, Projection::All))
6962        {
6963            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6964                out.entry(side.table.clone())
6965                    .or_default()
6966                    .extend(self.table_all_projection_columns(&side.table)?);
6967            }
6968            return Ok(());
6969        }
6970
6971        for projection in projections {
6972            collect_projection_columns_for_join_side(
6973                projection,
6974                left.as_ref(),
6975                right.as_ref(),
6976                out,
6977            )?;
6978        }
6979        Ok(())
6980    }
6981
6982    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6983        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6984        if projections
6985            .iter()
6986            .any(|projection| matches!(projection, Projection::All))
6987        {
6988            return self.table_all_projection_columns(&table.table);
6989        }
6990
6991        let mut columns = BTreeSet::new();
6992        for projection in &projections {
6993            collect_projection_columns_for_table(
6994                projection,
6995                &table.table,
6996                table.alias.as_deref(),
6997                &mut columns,
6998            );
6999        }
7000        Ok(columns.into_iter().collect())
7001    }
7002
7003    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
7004        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
7005            let columns: Vec<String> = contract
7006                .declared_columns
7007                .iter()
7008                .map(|column| column.name.clone())
7009                .collect();
7010            if !columns.is_empty() {
7011                return Ok(columns);
7012            }
7013        }
7014
7015        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
7016        Ok(records
7017            .first()
7018            .map(|record| {
7019                record
7020                    .column_names()
7021                    .into_iter()
7022                    .map(|column| column.to_string())
7023                    .collect()
7024            })
7025            .unwrap_or_default())
7026    }
7027
7028    fn resolve_table_expr_subqueries(
7029        &self,
7030        mut table: TableQuery,
7031        frame: &dyn super::statement_frame::ReadFrame,
7032    ) -> RedDBResult<TableQuery> {
7033        if let Some(TableSource::Subquery(inner)) = table.source.take() {
7034            let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
7035            table.source = Some(TableSource::Subquery(Box::new(inner)));
7036        }
7037
7038        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
7039        for item in &mut table.select_items {
7040            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
7041                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
7042            }
7043        }
7044        if let Some(where_expr) = table.where_expr.take() {
7045            table.where_expr =
7046                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
7047            table.filter = None;
7048        }
7049        if let Some(having_expr) = table.having_expr.take() {
7050            table.having_expr =
7051                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
7052            table.having = None;
7053        }
7054        for expr in &mut table.group_by_exprs {
7055            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
7056        }
7057        for clause in &mut table.order_by {
7058            if let Some(expr) = clause.expr.take() {
7059                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
7060            }
7061        }
7062        Ok(table)
7063    }
7064
7065    fn resolve_select_expr_subqueries(
7066        &self,
7067        expr: QueryExpr,
7068        frame: &dyn super::statement_frame::ReadFrame,
7069    ) -> RedDBResult<QueryExpr> {
7070        match expr {
7071            QueryExpr::Table(table) => self
7072                .resolve_table_expr_subqueries(table, frame)
7073                .map(QueryExpr::Table),
7074            QueryExpr::Join(mut join) => {
7075                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
7076                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
7077                Ok(QueryExpr::Join(join))
7078            }
7079            other => Ok(other),
7080        }
7081    }
7082
7083    fn resolve_expr_subqueries(
7084        &self,
7085        expr: crate::storage::query::ast::Expr,
7086        outer_scopes: &[String],
7087        frame: &dyn super::statement_frame::ReadFrame,
7088    ) -> RedDBResult<crate::storage::query::ast::Expr> {
7089        use crate::storage::query::ast::Expr;
7090
7091        match expr {
7092            Expr::Subquery { query, span } => {
7093                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
7094                if values.len() > 1 {
7095                    return Err(RedDBError::Query(
7096                        "scalar subquery returned more than one row".to_string(),
7097                    ));
7098                }
7099                Ok(Expr::Literal {
7100                    value: values.into_iter().next().unwrap_or(Value::Null),
7101                    span,
7102                })
7103            }
7104            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
7105                op,
7106                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
7107                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
7108                span,
7109            }),
7110            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
7111                op,
7112                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
7113                span,
7114            }),
7115            Expr::Cast {
7116                inner,
7117                target,
7118                span,
7119            } => Ok(Expr::Cast {
7120                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
7121                target,
7122                span,
7123            }),
7124            Expr::FunctionCall { name, args, span } => {
7125                let args = args
7126                    .into_iter()
7127                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
7128                    .collect::<RedDBResult<Vec<_>>>()?;
7129                Ok(Expr::FunctionCall { name, args, span })
7130            }
7131            Expr::Case {
7132                branches,
7133                else_,
7134                span,
7135            } => {
7136                let branches = branches
7137                    .into_iter()
7138                    .map(|(cond, value)| {
7139                        Ok((
7140                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
7141                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
7142                        ))
7143                    })
7144                    .collect::<RedDBResult<Vec<_>>>()?;
7145                let else_ = else_
7146                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
7147                    .transpose()?
7148                    .map(Box::new);
7149                Ok(Expr::Case {
7150                    branches,
7151                    else_,
7152                    span,
7153                })
7154            }
7155            Expr::IsNull {
7156                operand,
7157                negated,
7158                span,
7159            } => Ok(Expr::IsNull {
7160                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
7161                negated,
7162                span,
7163            }),
7164            Expr::InList {
7165                target,
7166                values,
7167                negated,
7168                span,
7169            } => {
7170                let target =
7171                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
7172                let mut resolved = Vec::new();
7173                for value in values {
7174                    if let Expr::Subquery { query, .. } = value {
7175                        resolved.extend(
7176                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
7177                                .into_iter()
7178                                .map(Expr::lit),
7179                        );
7180                    } else {
7181                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
7182                    }
7183                }
7184                Ok(Expr::InList {
7185                    target,
7186                    values: resolved,
7187                    negated,
7188                    span,
7189                })
7190            }
7191            Expr::Between {
7192                target,
7193                low,
7194                high,
7195                negated,
7196                span,
7197            } => Ok(Expr::Between {
7198                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
7199                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
7200                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
7201                negated,
7202                span,
7203            }),
7204            other => Ok(other),
7205        }
7206    }
7207
7208    fn execute_expr_subquery_values(
7209        &self,
7210        subquery: crate::storage::query::ast::ExprSubquery,
7211        outer_scopes: &[String],
7212        frame: &dyn super::statement_frame::ReadFrame,
7213    ) -> RedDBResult<Vec<Value>> {
7214        let query = *subquery.query;
7215        if query_references_outer_scope(&query, outer_scopes) {
7216            return Err(RedDBError::Query(
7217                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
7218            ));
7219        }
7220        let query = self.rewrite_view_refs(query);
7221        let query = self.resolve_select_expr_subqueries(query, frame)?;
7222        let query = self.authorize_relational_select_expr(query, frame)?;
7223        let result = match query {
7224            QueryExpr::Table(table) => {
7225                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
7226            }
7227            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
7228            other => {
7229                return Err(RedDBError::Query(format!(
7230                    "expression subquery must be a SELECT query, got {}",
7231                    query_expr_name(&other)
7232                )))
7233            }
7234        };
7235        first_column_values(result)
7236    }
7237
7238    fn dispatch_expr(
7239        &self,
7240        expr: QueryExpr,
7241        query_str: &str,
7242        mode: QueryMode,
7243    ) -> RedDBResult<RuntimeQueryResult> {
7244        let statement = query_expr_name(&expr);
7245        match expr {
7246            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
7247                // Graph queries are not cacheable as prepared statements.
7248                Err(RedDBError::Query(
7249                    "graph queries cannot be used as prepared statements".to_string(),
7250                ))
7251            }
7252            QueryExpr::Table(table) => {
7253                let scope = self.ai_scope();
7254                let table = self.resolve_table_expr_subqueries(
7255                    table,
7256                    &scope as &dyn super::statement_frame::ReadFrame,
7257                )?;
7258                if super::red_schema::is_virtual_table(&table.table) {
7259                    return Ok(RuntimeQueryResult {
7260                        query: query_str.to_string(),
7261                        mode,
7262                        statement,
7263                        engine: "runtime-red-schema",
7264                        result: super::red_schema::red_query(
7265                            self,
7266                            &table.table,
7267                            &table,
7268                            &scope as &dyn super::statement_frame::ReadFrame,
7269                        )?,
7270                        affected_rows: 0,
7271                        statement_type: "select",
7272                    });
7273                }
7274                let Some(table_with_rls) = self.authorize_relational_table_select(
7275                    table,
7276                    &scope as &dyn super::statement_frame::ReadFrame,
7277                )?
7278                else {
7279                    return Ok(RuntimeQueryResult {
7280                        query: query_str.to_string(),
7281                        mode,
7282                        statement,
7283                        engine: "runtime-table-rls",
7284                        result: crate::storage::query::unified::UnifiedResult::empty(),
7285                        affected_rows: 0,
7286                        statement_type: "select",
7287                    });
7288                };
7289                Ok(RuntimeQueryResult {
7290                    query: query_str.to_string(),
7291                    mode,
7292                    statement,
7293                    engine: "runtime-table",
7294                    result: execute_runtime_table_query(
7295                        &self.inner.db,
7296                        &table_with_rls,
7297                        Some(&self.inner.index_store),
7298                    )?,
7299                    affected_rows: 0,
7300                    statement_type: "select",
7301                })
7302            }
7303            QueryExpr::Join(join) => {
7304                let scope = self.ai_scope();
7305                let Some(join_with_rls) = self.authorize_relational_join_select(
7306                    join,
7307                    &scope as &dyn super::statement_frame::ReadFrame,
7308                )?
7309                else {
7310                    return Ok(RuntimeQueryResult {
7311                        query: query_str.to_string(),
7312                        mode,
7313                        statement,
7314                        engine: "runtime-join-rls",
7315                        result: crate::storage::query::unified::UnifiedResult::empty(),
7316                        affected_rows: 0,
7317                        statement_type: "select",
7318                    });
7319                };
7320                Ok(RuntimeQueryResult {
7321                    query: query_str.to_string(),
7322                    mode,
7323                    statement,
7324                    engine: "runtime-join",
7325                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
7326                    affected_rows: 0,
7327                    statement_type: "select",
7328                })
7329            }
7330            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
7331                query: query_str.to_string(),
7332                mode,
7333                statement,
7334                engine: "runtime-vector",
7335                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
7336                affected_rows: 0,
7337                statement_type: "select",
7338            }),
7339            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
7340                query: query_str.to_string(),
7341                mode,
7342                statement,
7343                engine: "runtime-hybrid",
7344                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
7345                affected_rows: 0,
7346                statement_type: "select",
7347            }),
7348            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
7349                Err(RedDBError::Query(
7350                    super::red_schema::READ_ONLY_ERROR.to_string(),
7351                ))
7352            }
7353            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
7354                Err(RedDBError::Query(
7355                    super::red_schema::READ_ONLY_ERROR.to_string(),
7356                ))
7357            }
7358            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
7359                Err(RedDBError::Query(
7360                    super::red_schema::READ_ONLY_ERROR.to_string(),
7361                ))
7362            }
7363            QueryExpr::Insert(ref insert) => self
7364                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
7365                    self.execute_insert(query_str, insert)
7366                }),
7367            QueryExpr::Update(ref update) => self
7368                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
7369                    self.execute_update(query_str, update)
7370                }),
7371            QueryExpr::Delete(ref delete) => self
7372                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
7373                    self.execute_delete(query_str, delete)
7374                }),
7375            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
7376            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
7377            _ => Err(RedDBError::Query(format!(
7378                "prepared-statement execution does not support {statement} statements"
7379            ))),
7380        }
7381    }
7382
7383    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
7384    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
7385    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
7386        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
7387        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
7388        let q = query.trim();
7389        if !q.starts_with("SELECT") && !q.starts_with("select") {
7390            return None;
7391        }
7392
7393        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
7394        let where_pos = q
7395            .find("WHERE _entity_id")
7396            .or_else(|| q.find("where _entity_id"))?;
7397        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
7398        let after_eq = after_field.strip_prefix('=')?.trim_start();
7399
7400        // Parse the entity ID number
7401        let id_str = after_eq.trim();
7402        let entity_id: u64 = id_str.parse().ok()?;
7403
7404        // Extract table name: between "FROM " and " WHERE"
7405        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
7406        let table = q[from_pos..where_pos].trim();
7407        if table.is_empty()
7408            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
7409        {
7410            return None; // complex query, fall through
7411        }
7412        let table_name = table.split_whitespace().next()?;
7413
7414        // Direct entity lookup — skips SQL parse, plan cache, result
7415        // cache, view rewriter, RLS gate. Safe because the gating in
7416        // `execute_query` guarantees no scope override / no
7417        // transaction context is active. MVCC visibility is still
7418        // honoured against the current snapshot.
7419        let store = self.inner.db.store();
7420        let entity = store
7421            .get(
7422                table_name,
7423                crate::storage::unified::EntityId::new(entity_id),
7424            )
7425            .filter(entity_visible_under_current_snapshot);
7426
7427        let count = if entity.is_some() { 1u64 } else { 0 };
7428
7429        // Materialize a record so downstream consumers that walk
7430        // `result.records` (embedded runtime API, decrypt pass, CLI)
7431        // see the row. Previously only `pre_serialized_json` was
7432        // filled, which caused those consumers to see zero rows and
7433        // skewed benchmarks.
7434        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
7435            .as_ref()
7436            .and_then(|e| runtime_table_record_from_entity(e.clone()))
7437            .into_iter()
7438            .collect();
7439
7440        let json = match entity {
7441            Some(ref e) => execute_runtime_serialize_single_entity(e),
7442            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
7443                .to_string(),
7444        };
7445
7446        Some(Ok(RuntimeQueryResult {
7447            query: query.to_string(),
7448            mode: crate::storage::query::modes::QueryMode::Sql,
7449            statement: "select",
7450            engine: "fast-entity-lookup",
7451            result: crate::storage::query::unified::UnifiedResult {
7452                columns: Vec::new(),
7453                records,
7454                stats: crate::storage::query::unified::QueryStats {
7455                    rows_scanned: count,
7456                    ..Default::default()
7457                },
7458                pre_serialized_json: Some(json),
7459            },
7460            affected_rows: 0,
7461            statement_type: "select",
7462        }))
7463    }
7464
7465    fn result_cache_backend(&self) -> RuntimeResultCacheBackend {
7466        match self
7467            .config_string(RESULT_CACHE_BACKEND_KEY, RESULT_CACHE_DEFAULT_BACKEND)
7468            .as_str()
7469        {
7470            "blob_cache" => RuntimeResultCacheBackend::BlobCache,
7471            "shadow" => RuntimeResultCacheBackend::Shadow,
7472            _ => RuntimeResultCacheBackend::Legacy,
7473        }
7474    }
7475
7476    pub(super) fn get_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
7477        match self.result_cache_backend() {
7478            RuntimeResultCacheBackend::Legacy => self.get_legacy_result_cache_entry(key),
7479            RuntimeResultCacheBackend::BlobCache => self.get_blob_result_cache_entry(key),
7480            RuntimeResultCacheBackend::Shadow => {
7481                let legacy = self.get_legacy_result_cache_entry(key);
7482                let blob = self.get_blob_result_cache_entry(key);
7483                if let (Some(ref legacy), Some(ref blob)) = (&legacy, &blob) {
7484                    if result_cache_fingerprint(legacy) != result_cache_fingerprint(blob) {
7485                        self.inner
7486                            .result_cache_shadow_divergences
7487                            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
7488                        tracing::warn!(
7489                            key,
7490                            metric = crate::runtime::METRIC_CACHE_SHADOW_DIVERGENCE_TOTAL,
7491                            "result cache shadow backend diverged from legacy"
7492                        );
7493                    }
7494                }
7495                legacy
7496            }
7497        }
7498    }
7499
7500    fn get_legacy_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
7501        let cache = self.inner.result_cache.read();
7502        cache.0.get(key).and_then(|entry| {
7503            if entry.cached_at.elapsed().as_secs() < RESULT_CACHE_TTL_SECS {
7504                Some(entry.result.clone())
7505            } else {
7506                None
7507            }
7508        })
7509    }
7510
7511    fn get_blob_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
7512        let hit = self
7513            .inner
7514            .result_blob_cache
7515            .get(RESULT_CACHE_BLOB_NAMESPACE, key)?;
7516        {
7517            let cache = self.inner.result_blob_entries.read();
7518            if let Some(entry) = cache.0.get(key) {
7519                return Some(entry.result.clone());
7520            }
7521        }
7522
7523        let (result, scopes) = decode_result_cache_payload(hit.value())?;
7524        let mut cache = self.inner.result_blob_entries.write();
7525        let (ref mut map, ref mut order) = *cache;
7526        if !map.contains_key(key) {
7527            order.push_back(key.to_string());
7528        }
7529        map.insert(
7530            key.to_string(),
7531            RuntimeResultCacheEntry {
7532                result: result.clone(),
7533                cached_at: std::time::Instant::now(),
7534                scopes,
7535            },
7536        );
7537        trim_result_cache(map, order);
7538        Some(result)
7539    }
7540
7541    pub(super) fn put_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
7542        match self.result_cache_backend() {
7543            RuntimeResultCacheBackend::Legacy => self.put_legacy_result_cache_entry(key, entry),
7544            RuntimeResultCacheBackend::BlobCache => self.put_blob_result_cache_entry(key, entry),
7545            RuntimeResultCacheBackend::Shadow => {
7546                self.put_legacy_result_cache_entry(key, entry.clone());
7547                self.put_blob_result_cache_entry(key, entry);
7548            }
7549        }
7550    }
7551
7552    fn put_legacy_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
7553        let mut cache = self.inner.result_cache.write();
7554        let (ref mut map, ref mut order) = *cache;
7555        if !map.contains_key(key) {
7556            order.push_back(key.to_string());
7557        }
7558        map.insert(key.to_string(), entry);
7559        trim_result_cache(map, order);
7560    }
7561
7562    fn put_blob_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
7563        let policy = crate::storage::cache::BlobCachePolicy::default()
7564            .ttl_ms(RESULT_CACHE_TTL_SECS * 1000)
7565            .priority(200);
7566        let dependencies = entry.scopes.iter().cloned().collect::<Vec<_>>();
7567        let bytes = encode_result_cache_payload(&entry)
7568            .unwrap_or_else(|| result_cache_fingerprint(&entry.result).into_bytes());
7569        let put = crate::storage::cache::BlobCachePut::new(bytes)
7570            .with_dependencies(dependencies)
7571            .with_policy(policy);
7572        if self
7573            .inner
7574            .result_blob_cache
7575            .put(RESULT_CACHE_BLOB_NAMESPACE, key, put)
7576            .is_err()
7577        {
7578            return;
7579        }
7580
7581        let mut cache = self.inner.result_blob_entries.write();
7582        let (ref mut map, ref mut order) = *cache;
7583        if !map.contains_key(key) {
7584            order.push_back(key.to_string());
7585        }
7586        map.insert(key.to_string(), entry);
7587        trim_result_cache(map, order);
7588    }
7589
7590    pub fn result_cache_shadow_divergences(&self) -> u64 {
7591        self.inner
7592            .result_cache_shadow_divergences
7593            .load(std::sync::atomic::Ordering::Relaxed)
7594    }
7595
7596    /// Invalidate the result cache (call after any write operation).
7597    /// Full clear — use for DDL (DROP TABLE, schema changes) or when table is unknown.
7598    pub fn invalidate_result_cache(&self) {
7599        let mut cache = self.inner.result_cache.write();
7600        cache.0.clear();
7601        cache.1.clear();
7602        let mut blob_entries = self.inner.result_blob_entries.write();
7603        blob_entries.0.clear();
7604        blob_entries.1.clear();
7605        self.inner
7606            .result_blob_cache
7607            .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
7608        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
7609        ask_entries.0.clear();
7610        ask_entries.1.clear();
7611        self.inner
7612            .result_blob_cache
7613            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
7614    }
7615
7616    /// Invalidate only result cache entries that declared a dependency on `table`.
7617    /// Cheaper than a full clear: unrelated tables keep their cached results.
7618    pub(crate) fn invalidate_result_cache_for_table(&self, table: &str) {
7619        // Hot-path probe both backends before taking write locks. The blob
7620        // backend is node-local, same as the legacy result cache.
7621        let legacy_has_match = {
7622            let cache = self.inner.result_cache.read();
7623            let (ref map, _) = *cache;
7624            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
7625        };
7626        let blob_has_match = {
7627            let cache = self.inner.result_blob_entries.read();
7628            let (ref map, _) = *cache;
7629            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
7630        };
7631        if legacy_has_match {
7632            let mut cache = self.inner.result_cache.write();
7633            let (ref mut map, ref mut order) = *cache;
7634            map.retain(|_, entry| !entry.scopes.contains(table));
7635            order.retain(|key| map.contains_key(key));
7636        }
7637
7638        if matches!(
7639            self.result_cache_backend(),
7640            RuntimeResultCacheBackend::BlobCache | RuntimeResultCacheBackend::Shadow
7641        ) {
7642            let mut blob_entries = self.inner.result_blob_entries.write();
7643            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
7644            blob_map.clear();
7645            blob_order.clear();
7646            self.inner
7647                .result_blob_cache
7648                .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
7649        } else if blob_has_match {
7650            let mut blob_entries = self.inner.result_blob_entries.write();
7651            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
7652            blob_map.retain(|_, entry| !entry.scopes.contains(table));
7653            blob_order.retain(|key| blob_map.contains_key(key));
7654        }
7655        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
7656        ask_entries.0.clear();
7657        ask_entries.1.clear();
7658        self.inner
7659            .result_blob_cache
7660            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
7661    }
7662
7663    pub(crate) fn invalidate_plan_cache(&self) {
7664        self.inner.query_cache.write().clear();
7665        self.inner
7666            .ddl_epoch
7667            .fetch_add(1, std::sync::atomic::Ordering::Release);
7668    }
7669
7670    /// Read the monotonic DDL epoch counter. Bumped by every
7671    /// `invalidate_plan_cache` call so prepared-statement holders can
7672    /// detect schema drift between PREPARE and EXECUTE.
7673    pub fn ddl_epoch(&self) -> u64 {
7674        self.inner
7675            .ddl_epoch
7676            .load(std::sync::atomic::Ordering::Acquire)
7677    }
7678
7679    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
7680        let store = self.inner.db.store();
7681        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7682        self.invalidate_plan_cache();
7683    }
7684
7685    /// Replay `tenant_tables.*.column` keys from red_config at boot so
7686    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
7687    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
7688    /// collection, picks the keys matching the tenant-marker shape,
7689    /// and calls `register_tenant_table` for each.
7690    ///
7691    /// Safe no-op when `red_config` doesn't exist (first boot on a
7692    /// fresh datadir).
7693    pub(crate) fn rehydrate_tenant_tables(&self) {
7694        let store = self.inner.db.store();
7695        let Some(manager) = store.get_collection("red_config") else {
7696            return;
7697        };
7698        // Replay in insertion order (SegmentManager iteration). Multiple
7699        // toggles on the same table leave several rows behind — the
7700        // last one processed wins because each register/unregister
7701        // call overwrites the in-memory state.
7702        for entity in manager.query_all(|_| true) {
7703            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
7704                continue;
7705            };
7706            let Some(named) = &row.named else { continue };
7707            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
7708                continue;
7709            };
7710            // Shape: tenant_tables.{table}.column
7711            let Some(rest) = key.strip_prefix("tenant_tables.") else {
7712                continue;
7713            };
7714            let Some((table, suffix)) = rest.rsplit_once('.') else {
7715                // Issue #205 — a `tenant_tables.*` row that doesn't
7716                // split cleanly is a schema-shape regression: the
7717                // metadata writer must always emit the `.column`
7718                // suffix, so reaching this branch means an upgrade
7719                // with incompatible state or external tampering.
7720                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7721                    collection: "red_config".to_string(),
7722                    detail: format!("malformed tenant_tables key: {key}"),
7723                }
7724                .emit_global();
7725                continue;
7726            };
7727            if suffix != "column" {
7728                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7729                    collection: "red_config".to_string(),
7730                    detail: format!("unexpected tenant_tables suffix: {key}"),
7731                }
7732                .emit_global();
7733                continue;
7734            }
7735            match named.get("value") {
7736                Some(crate::storage::schema::Value::Text(column)) => {
7737                    self.register_tenant_table(table, column);
7738                }
7739                // Null / missing value = DISABLE TENANCY marker.
7740                Some(crate::storage::schema::Value::Null) | None => {
7741                    self.unregister_tenant_table(table);
7742                }
7743                _ => {}
7744            }
7745        }
7746    }
7747
7748    /// Replay every persisted `MaterializedViewDescriptor` from the
7749    /// `red_materialized_view_defs` system collection (issue #593
7750    /// slice 9a). For each descriptor, re-parse the original SQL,
7751    /// extract the `QueryExpr::CreateView` it produced, and populate
7752    /// the in-memory registries (`inner.views` and
7753    /// `inner.materialized_views`) directly — no write paths run, so
7754    /// rehydrate does not re-persist what it just read.
7755    ///
7756    /// Malformed rows (missing `name`/`source_sql`, parse errors) are
7757    /// skipped with a `SchemaCorruption` operator event so a single
7758    /// bad entry does not block startup.
7759    pub(crate) fn rehydrate_materialized_view_descriptors(&self) {
7760        let store = self.inner.db.store();
7761        let descriptors = crate::runtime::continuous_materialized_view::load_all(store.as_ref());
7762        for descriptor in descriptors {
7763            let parsed = match crate::storage::query::parser::parse(&descriptor.source_sql) {
7764                Ok(qc) => qc,
7765                Err(err) => {
7766                    crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7767                        collection:
7768                            crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7769                                .to_string(),
7770                        detail: format!(
7771                            "failed to re-parse materialized-view source for {}: {err}",
7772                            descriptor.name
7773                        ),
7774                    }
7775                    .emit_global();
7776                    continue;
7777                }
7778            };
7779            let crate::storage::query::ast::QueryExpr::CreateView(create) = parsed.query else {
7780                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7781                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7782                        .to_string(),
7783                    detail: format!(
7784                        "materialized-view source for {} did not re-parse as CREATE VIEW",
7785                        descriptor.name
7786                    ),
7787                }
7788                .emit_global();
7789                continue;
7790            };
7791            // Populate in-memory view registry.
7792            let view_name = create.name.clone();
7793            self.inner
7794                .views
7795                .write()
7796                .insert(view_name.clone(), Arc::new(create));
7797            // Materialized cache slot (data empty until next REFRESH).
7798            use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
7799            let refresh = match descriptor.refresh_every_ms {
7800                Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
7801                None => RefreshPolicy::Manual,
7802            };
7803            let def = MaterializedViewDef {
7804                name: view_name.clone(),
7805                query: format!("<parsed view {}>", view_name),
7806                dependencies: descriptor.source_collections.clone(),
7807                refresh,
7808                retention_duration_ms: descriptor.retention_duration_ms,
7809            };
7810            self.inner.materialized_views.write().register(def);
7811        }
7812        // A rehydrated view shape may differ from any plans the cache
7813        // bootstrapped before this method ran — flush to be safe.
7814        self.invalidate_plan_cache();
7815    }
7816
7817    pub(crate) fn rehydrate_declared_column_schemas(&self) {
7818        let store = self.inner.db.store();
7819        for contract in self.inner.db.collection_contracts() {
7820            let columns: Vec<String> = contract
7821                .declared_columns
7822                .iter()
7823                .map(|column| column.name.clone())
7824                .collect();
7825            let Some(manager) = store.get_collection(&contract.name) else {
7826                continue;
7827            };
7828            manager.set_column_schema_if_empty(columns);
7829        }
7830    }
7831
7832    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
7833    /// in-memory column mapping, the implicit RLS policy, and enables
7834    /// row-level security on the table. Idempotent — re-registering
7835    /// the same `(table, column)` replaces the prior auto-policy.
7836    pub fn register_tenant_table(&self, table: &str, column: &str) {
7837        use crate::storage::query::ast::{
7838            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
7839        };
7840        self.inner
7841            .tenant_tables
7842            .write()
7843            .insert(table.to_string(), column.to_string());
7844
7845        // Build the policy: col = CURRENT_TENANT()
7846        // Uses CompareExpr so the comparison happens at runtime against
7847        // the thread-local tenant value read by the CURRENT_TENANT
7848        // scalar. Spans are synthetic — there's no source location for
7849        // an auto-generated policy.
7850        let lhs = Expr::Column {
7851            field: FieldRef::TableColumn {
7852                table: table.to_string(),
7853                column: column.to_string(),
7854            },
7855            span: Span::synthetic(),
7856        };
7857        let rhs = Expr::FunctionCall {
7858            name: "CURRENT_TENANT".to_string(),
7859            args: Vec::new(),
7860            span: Span::synthetic(),
7861        };
7862        let policy_filter = Filter::CompareExpr {
7863            lhs,
7864            op: CompareOp::Eq,
7865            rhs,
7866        };
7867
7868        let policy = CreatePolicyQuery {
7869            name: "__tenant_iso".to_string(),
7870            table: table.to_string(),
7871            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
7872            role: None,   // None = every role
7873            using: Box::new(policy_filter),
7874            // Auto-tenancy defaults to Table targets. Collections of
7875            // other kinds (graph / vector / queue / timeseries) that
7876            // opt in via `ALTER ... ENABLE TENANCY` should use the
7877            // matching kind — but for now we keep the auto-policy
7878            // kind-agnostic so the evaluator can apply it to any
7879            // entity living in the collection.
7880            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
7881        };
7882
7883        // Replace any prior auto-policy for this table (column rename).
7884        self.inner.rls_policies.write().insert(
7885            (table.to_string(), "__tenant_iso".to_string()),
7886            Arc::new(policy),
7887        );
7888        self.inner
7889            .rls_enabled_tables
7890            .write()
7891            .insert(table.to_string());
7892
7893        // Auto-build a hash index on the tenant column. Every read/write
7894        // against a tenant-scoped table carries an implicit
7895        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
7896        // index on that column is on the hot path of every query. Without
7897        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
7898        self.ensure_tenant_index(table, column);
7899    }
7900
7901    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
7902    /// Skipped when:
7903    ///   * the column is dotted (nested path — flat secondary indices
7904    ///     don't cover those today; RLS still works via the policy)
7905    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
7906    ///   * the user already registered an index whose first column matches
7907    ///     (avoids redundant duplicates of a user-defined composite)
7908    fn ensure_tenant_index(&self, table: &str, column: &str) {
7909        if column.contains('.') {
7910            return;
7911        }
7912        let index_name = format!("__tenant_idx_{table}");
7913        let registry = self.inner.index_store.list_indices(table);
7914        if registry.iter().any(|idx| idx.name == index_name) {
7915            return;
7916        }
7917        if registry
7918            .iter()
7919            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
7920        {
7921            return;
7922        }
7923
7924        let store = self.inner.db.store();
7925        let Some(manager) = store.get_collection(table) else {
7926            return;
7927        };
7928        let entities = manager.query_all(|_| true);
7929        let entity_fields: Vec<(
7930            crate::storage::unified::EntityId,
7931            Vec<(String, crate::storage::schema::Value)>,
7932        )> = entities
7933            .iter()
7934            .map(|e| {
7935                let fields = match &e.data {
7936                    crate::storage::EntityData::Row(row) => {
7937                        if let Some(ref named) = row.named {
7938                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
7939                        } else if let Some(ref schema) = row.schema {
7940                            schema
7941                                .iter()
7942                                .zip(row.columns.iter())
7943                                .map(|(k, v)| (k.clone(), v.clone()))
7944                                .collect()
7945                        } else {
7946                            Vec::new()
7947                        }
7948                    }
7949                    crate::storage::EntityData::Node(node) => node
7950                        .properties
7951                        .iter()
7952                        .map(|(k, v)| (k.clone(), v.clone()))
7953                        .collect(),
7954                    _ => Vec::new(),
7955                };
7956                (e.id, fields)
7957            })
7958            .collect();
7959
7960        let columns = vec![column.to_string()];
7961        if self
7962            .inner
7963            .index_store
7964            .create_index(
7965                &index_name,
7966                table,
7967                &columns,
7968                super::index_store::IndexMethodKind::Hash,
7969                false,
7970                &entity_fields,
7971            )
7972            .is_err()
7973        {
7974            return;
7975        }
7976        self.inner
7977            .index_store
7978            .register(super::index_store::RegisteredIndex {
7979                name: index_name,
7980                collection: table.to_string(),
7981                columns,
7982                method: super::index_store::IndexMethodKind::Hash,
7983                unique: false,
7984            });
7985        self.invalidate_plan_cache();
7986    }
7987
7988    /// Drop the auto-generated tenant index, if one exists. Called from
7989    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
7990    fn drop_tenant_index(&self, table: &str) {
7991        let index_name = format!("__tenant_idx_{table}");
7992        self.inner.index_store.drop_index(&index_name, table);
7993    }
7994
7995    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
7996    /// Used by the INSERT auto-fill path to know which column to
7997    /// populate with `current_tenant()` when the user didn't name it.
7998    pub fn tenant_column(&self, table: &str) -> Option<String> {
7999        self.inner.tenant_tables.read().get(table).cloned()
8000    }
8001
8002    /// Remove a table's tenant registration (Phase 2.5.4). Called by
8003    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
8004    /// but leaves any user-installed explicit policies intact.
8005    pub fn unregister_tenant_table(&self, table: &str) {
8006        self.inner.tenant_tables.write().remove(table);
8007        self.inner
8008            .rls_policies
8009            .write()
8010            .remove(&(table.to_string(), "__tenant_iso".to_string()));
8011        self.drop_tenant_index(table);
8012        // Only clear RLS enablement if no other policies remain.
8013        let has_other_policies = self
8014            .inner
8015            .rls_policies
8016            .read()
8017            .keys()
8018            .any(|(t, _)| t == table);
8019        if !has_other_policies {
8020            self.inner.rls_enabled_tables.write().remove(table);
8021        }
8022    }
8023
8024    /// Record that the running transaction has marked `id` in `collection`
8025    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
8026    /// xid that was written into `xmax` — either the parent txn xid or
8027    /// the innermost savepoint sub-xid. Savepoint rollback filters by
8028    /// this xid to revive only its own tombstones.
8029    pub(crate) fn record_pending_tombstone(
8030        &self,
8031        conn_id: u64,
8032        collection: &str,
8033        id: crate::storage::unified::entity::EntityId,
8034        stamper_xid: crate::storage::transaction::snapshot::Xid,
8035        previous_xmax: crate::storage::transaction::snapshot::Xid,
8036    ) {
8037        self.inner
8038            .pending_tombstones
8039            .write()
8040            .entry(conn_id)
8041            .or_default()
8042            .push((collection.to_string(), id, stamper_xid, previous_xmax));
8043    }
8044
8045    pub(crate) fn record_pending_versioned_update(
8046        &self,
8047        conn_id: u64,
8048        collection: &str,
8049        old_id: crate::storage::unified::entity::EntityId,
8050        new_id: crate::storage::unified::entity::EntityId,
8051        stamper_xid: crate::storage::transaction::snapshot::Xid,
8052        previous_xmax: crate::storage::transaction::snapshot::Xid,
8053    ) {
8054        self.inner
8055            .pending_versioned_updates
8056            .write()
8057            .entry(conn_id)
8058            .or_default()
8059            .push((
8060                collection.to_string(),
8061                old_id,
8062                new_id,
8063                stamper_xid,
8064                previous_xmax,
8065            ));
8066    }
8067
8068    fn with_deferred_store_wal_if_transaction<T>(
8069        &self,
8070        f: impl FnOnce() -> RedDBResult<T>,
8071    ) -> RedDBResult<T> {
8072        let conn_id = current_connection_id();
8073        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
8074            return f();
8075        }
8076
8077        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8078        let result = f();
8079        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8080        match result {
8081            Ok(value) => {
8082                self.record_pending_store_wal_actions(conn_id, captured);
8083                Ok(value)
8084            }
8085            Err(err) => Err(err),
8086        }
8087    }
8088
8089    fn with_deferred_store_wal_for_dml<T>(
8090        &self,
8091        capture_autocommit_events: bool,
8092        f: impl FnOnce() -> RedDBResult<T>,
8093    ) -> RedDBResult<T> {
8094        let conn_id = current_connection_id();
8095        if self.inner.tx_contexts.read().contains_key(&conn_id) {
8096            return self.with_deferred_store_wal_if_transaction(f);
8097        }
8098        if !capture_autocommit_events {
8099            return f();
8100        }
8101
8102        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8103        let result = f();
8104        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8105        self.inner
8106            .db
8107            .store()
8108            .append_deferred_store_wal_actions(captured)
8109            .map_err(|err| RedDBError::Internal(err.to_string()))?;
8110        result
8111    }
8112
8113    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
8114        !query.suppress_events
8115            && self.collection_has_event_subscriptions_for_operation(
8116                &query.table,
8117                crate::catalog::SubscriptionOperation::Insert,
8118            )
8119    }
8120
8121    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
8122        !query.suppress_events
8123            && self.collection_has_event_subscriptions_for_operation(
8124                &query.table,
8125                crate::catalog::SubscriptionOperation::Update,
8126            )
8127    }
8128
8129    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
8130        !query.suppress_events
8131            && self.collection_has_event_subscriptions_for_operation(
8132                &query.table,
8133                crate::catalog::SubscriptionOperation::Delete,
8134            )
8135    }
8136
8137    fn collection_has_event_subscriptions_for_operation(
8138        &self,
8139        collection: &str,
8140        operation: crate::catalog::SubscriptionOperation,
8141    ) -> bool {
8142        let Some(contract) = self.db().collection_contract_arc(collection) else {
8143            return false;
8144        };
8145        contract.subscriptions.iter().any(|subscription| {
8146            subscription.enabled
8147                && (subscription.ops_filter.is_empty()
8148                    || subscription.ops_filter.contains(&operation))
8149        })
8150    }
8151
8152    fn record_pending_store_wal_actions(
8153        &self,
8154        conn_id: u64,
8155        actions: crate::storage::unified::DeferredStoreWalActions,
8156    ) {
8157        if actions.is_empty() {
8158            return;
8159        }
8160        let mut guard = self.inner.pending_store_wal_actions.write();
8161        guard.entry(conn_id).or_default().extend(actions);
8162    }
8163
8164    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
8165        let Some(actions) = self
8166            .inner
8167            .pending_store_wal_actions
8168            .write()
8169            .remove(&conn_id)
8170        else {
8171            return Ok(());
8172        };
8173        self.inner
8174            .db
8175            .store()
8176            .append_deferred_store_wal_actions(actions)
8177            .map_err(|err| RedDBError::Internal(err.to_string()))
8178    }
8179
8180    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
8181        self.inner
8182            .pending_store_wal_actions
8183            .write()
8184            .remove(&conn_id);
8185    }
8186
8187    fn xid_conflicts_with_snapshot(
8188        &self,
8189        xid: crate::storage::transaction::snapshot::Xid,
8190        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8191        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8192    ) -> bool {
8193        xid != 0
8194            && !own_xids.contains(&xid)
8195            && !self.inner.snapshot_manager.is_aborted(xid)
8196            && !self.inner.snapshot_manager.is_active(xid)
8197            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
8198    }
8199
8200    fn conflict_error(
8201        collection: &str,
8202        logical_id: crate::storage::unified::entity::EntityId,
8203        xid: crate::storage::transaction::snapshot::Xid,
8204    ) -> RedDBError {
8205        RedDBError::Query(format!(
8206            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
8207            logical_id.raw()
8208        ))
8209    }
8210
8211    fn check_logical_row_conflict(
8212        &self,
8213        collection: &str,
8214        logical_id: crate::storage::unified::entity::EntityId,
8215        excluded_ids: &[crate::storage::unified::entity::EntityId],
8216        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8217        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8218    ) -> RedDBResult<()> {
8219        let store = self.inner.db.store();
8220        let Some(manager) = store.get_collection(collection) else {
8221            return Ok(());
8222        };
8223
8224        for candidate in manager.query_all(|_| true) {
8225            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
8226                continue;
8227            }
8228            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
8229                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
8230            }
8231            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
8232                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
8233            }
8234        }
8235        Ok(())
8236    }
8237
8238    pub(crate) fn check_table_row_write_conflicts(
8239        &self,
8240        conn_id: u64,
8241        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8242        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8243    ) -> RedDBResult<()> {
8244        let versioned_updates = self
8245            .inner
8246            .pending_versioned_updates
8247            .read()
8248            .get(&conn_id)
8249            .cloned()
8250            .unwrap_or_default();
8251        let tombstones = self
8252            .inner
8253            .pending_tombstones
8254            .read()
8255            .get(&conn_id)
8256            .cloned()
8257            .unwrap_or_default();
8258
8259        let store = self.inner.db.store();
8260        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
8261            let Some(manager) = store.get_collection(&collection) else {
8262                continue;
8263            };
8264            let Some(old) = manager.get(old_id) else {
8265                continue;
8266            };
8267            let logical_id = old.logical_id();
8268            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8269                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8270            }
8271            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
8272                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
8273            }
8274            self.check_logical_row_conflict(
8275                &collection,
8276                logical_id,
8277                &[old_id, new_id],
8278                snapshot,
8279                own_xids,
8280            )?;
8281        }
8282
8283        for (collection, id, xid, previous_xmax) in tombstones {
8284            let Some(manager) = store.get_collection(&collection) else {
8285                continue;
8286            };
8287            let Some(entity) = manager.get(id) else {
8288                continue;
8289            };
8290            let logical_id = entity.logical_id();
8291            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8292                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8293            }
8294            if entity.xmax != xid
8295                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
8296            {
8297                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
8298            }
8299            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
8300        }
8301
8302        Ok(())
8303    }
8304
8305    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
8306        let versioned_updates = self
8307            .inner
8308            .pending_versioned_updates
8309            .read()
8310            .get(&conn_id)
8311            .cloned()
8312            .unwrap_or_default();
8313        let tombstones = self
8314            .inner
8315            .pending_tombstones
8316            .read()
8317            .get(&conn_id)
8318            .cloned()
8319            .unwrap_or_default();
8320
8321        let store = self.inner.db.store();
8322        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
8323            if let Some(manager) = store.get_collection(&collection) {
8324                if let Some(mut entity) = manager.get(old_id) {
8325                    entity.set_xmax(xid);
8326                    let _ = manager.update(entity);
8327                }
8328            }
8329        }
8330        for (collection, id, xid, _previous_xmax) in tombstones {
8331            if let Some(manager) = store.get_collection(&collection) {
8332                if let Some(mut entity) = manager.get(id) {
8333                    entity.set_xmax(xid);
8334                    let _ = manager.update(entity);
8335                }
8336            }
8337        }
8338    }
8339
8340    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
8341        self.inner
8342            .pending_versioned_updates
8343            .write()
8344            .remove(&conn_id);
8345    }
8346
8347    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
8348        let Some(pending) = self
8349            .inner
8350            .pending_versioned_updates
8351            .write()
8352            .remove(&conn_id)
8353        else {
8354            return;
8355        };
8356
8357        let store = self.inner.db.store();
8358        for (collection, old_id, new_id, xid, previous_xmax) in pending {
8359            if let Some(manager) = store.get_collection(&collection) {
8360                if let Some(mut old) = manager.get(old_id) {
8361                    if old.xmax == xid {
8362                        old.set_xmax(previous_xmax);
8363                        let _ = manager.update(old);
8364                    }
8365                }
8366            }
8367            let _ = store.delete_batch(&collection, &[new_id]);
8368        }
8369    }
8370
8371    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8372        let mut guard = self.inner.pending_versioned_updates.write();
8373        let Some(pending) = guard.get_mut(&conn_id) else {
8374            return 0;
8375        };
8376
8377        let store = self.inner.db.store();
8378        let mut reverted = 0usize;
8379        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
8380            if *xid < stamper_xid {
8381                return true;
8382            }
8383            if let Some(manager) = store.get_collection(collection) {
8384                if let Some(mut old) = manager.get(*old_id) {
8385                    if old.xmax == *xid {
8386                        old.set_xmax(*previous_xmax);
8387                        let _ = manager.update(old);
8388                    }
8389                }
8390            }
8391            let _ = store.delete_batch(collection, &[*new_id]);
8392            reverted += 1;
8393            false
8394        });
8395        if pending.is_empty() {
8396            guard.remove(&conn_id);
8397        }
8398        reverted
8399    }
8400
8401    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
8402    /// delete marker; commit only drops the rollback journal and emits
8403    /// side effects. Physical reclamation is left for VACUUM so old
8404    /// snapshots can still resolve the pre-delete row version.
8405    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
8406        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8407            return;
8408        };
8409        if pending.is_empty() {
8410            return;
8411        }
8412
8413        let store = self.inner.db.store();
8414        for (collection, id, _xid, _previous_xmax) in pending {
8415            store.context_index().remove_entity(id);
8416            self.cdc_emit(
8417                crate::replication::cdc::ChangeOperation::Delete,
8418                &collection,
8419                id.raw(),
8420                "entity",
8421            );
8422        }
8423    }
8424
8425    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
8426    /// become visible again to future snapshots. Best-effort: a row
8427    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
8428    /// never reclaims tuples whose xmax is still referenced by any
8429    /// active snapshot, so this case is only reachable via external
8430    /// storage corruption.
8431    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
8432        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8433            return;
8434        };
8435
8436        let store = self.inner.db.store();
8437        for (collection, id, xid, previous_xmax) in pending {
8438            let Some(manager) = store.get_collection(&collection) else {
8439                continue;
8440            };
8441            if let Some(mut entity) = manager.get(id) {
8442                if entity.xmax == xid {
8443                    entity.set_xmax(previous_xmax);
8444                    let _ = manager.update(entity);
8445                }
8446            }
8447        }
8448    }
8449
8450    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
8451        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
8452            return;
8453        };
8454        for event in pending {
8455            self.cdc_emit_kv(
8456                event.op,
8457                &event.collection,
8458                &event.key,
8459                0,
8460                event.before,
8461                event.after,
8462            );
8463        }
8464    }
8465
8466    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
8467        self.inner.pending_kv_watch_events.write().remove(&conn_id);
8468    }
8469
8470    /// Materialise the entire graph store while applying MVCC visibility
8471    /// AND per-collection RLS to each candidate node and edge. Mirrors
8472    /// `materialize_graph` but routes every entity through the same
8473    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
8474    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
8475    /// edges). Returns the filtered `GraphStore` plus the
8476    /// `node_id → properties` map the executor needs for `RETURN n.*`
8477    /// projections.
8478    fn materialize_graph_with_rls(
8479        &self,
8480    ) -> RedDBResult<(
8481        crate::storage::engine::GraphStore,
8482        std::collections::HashMap<
8483            String,
8484            std::collections::HashMap<String, crate::storage::schema::Value>,
8485        >,
8486        crate::storage::query::unified::EdgeProperties,
8487    )> {
8488        use crate::storage::engine::GraphStore;
8489        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
8490        use crate::storage::unified::entity::{EntityData, EntityKind};
8491        use std::collections::{HashMap, HashSet};
8492
8493        let store = self.inner.db.store();
8494        let snap_ctx = capture_current_snapshot();
8495        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
8496
8497        let graph = GraphStore::new();
8498        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
8499            HashMap::new();
8500        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
8501        let mut allowed_nodes: HashSet<String> = HashSet::new();
8502
8503        // Per-collection cached compiled filters — Nodes-kind for
8504        // first pass, Edges-kind for the second. None entries mean
8505        // "RLS enabled, zero matching policy → deny all of this kind".
8506        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8507            HashMap::new();
8508        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8509            HashMap::new();
8510
8511        let collections = store.list_collections();
8512
8513        // First pass — gather nodes.
8514        for collection in &collections {
8515            let Some(manager) = store.get_collection(collection) else {
8516                continue;
8517            };
8518            let entities = manager.query_all(|_| true);
8519            for entity in entities {
8520                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8521                    continue;
8522                }
8523                let EntityKind::GraphNode(ref node) = entity.kind else {
8524                    continue;
8525                };
8526                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
8527                    continue;
8528                }
8529                let id_str = entity.id.raw().to_string();
8530                graph
8531                    .add_node_with_label(
8532                        &id_str,
8533                        &node.label,
8534                        &super::graph_node_label(&node.node_type),
8535                    )
8536                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8537                allowed_nodes.insert(id_str.clone());
8538                if let EntityData::Node(node_data) = &entity.data {
8539                    node_properties.insert(id_str, node_data.properties.clone());
8540                }
8541            }
8542        }
8543
8544        // Second pass — gather edges. An edge appears only when both
8545        // endpoint nodes survived the RLS pass AND the edge itself
8546        // passes its own RLS gate.
8547        for collection in &collections {
8548            let Some(manager) = store.get_collection(collection) else {
8549                continue;
8550            };
8551            let entities = manager.query_all(|_| true);
8552            for entity in entities {
8553                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8554                    continue;
8555                }
8556                let EntityKind::GraphEdge(ref edge) = entity.kind else {
8557                    continue;
8558                };
8559                if !allowed_nodes.contains(&edge.from_node)
8560                    || !allowed_nodes.contains(&edge.to_node)
8561                {
8562                    continue;
8563                }
8564                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
8565                    continue;
8566                }
8567                let weight = match &entity.data {
8568                    EntityData::Edge(e) => e.weight,
8569                    _ => edge.weight as f32 / 1000.0,
8570                };
8571                let edge_label = super::graph_edge_label(&edge.label);
8572                graph
8573                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
8574                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8575                if let EntityData::Edge(edge_data) = &entity.data {
8576                    edge_properties.insert(
8577                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
8578                        edge_data.properties.clone(),
8579                    );
8580                }
8581            }
8582        }
8583
8584        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
8585        // are used inside the helper closures via the per-kind helpers
8586        // declared at the bottom of this file.
8587        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
8588
8589        Ok((graph, node_properties, edge_properties))
8590    }
8591
8592    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
8593    /// freshly-inserted entity when the current connection holds an
8594    /// open transaction. Used by graph / vector / queue / timeseries
8595    /// write paths that go through the DevX builder API (`db.node(...)
8596    /// .save()` and friends) — those live in the storage crate and
8597    /// can't reach `current_xid()` without crossing layers, so the
8598    /// application layer calls this helper right after `save()` to
8599    /// finalise the MVCC stamp.
8600    ///
8601    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
8602    /// write, so the non-transactional hot path stays untouched.
8603    ///
8604    /// Best-effort: if the collection or entity disappears between
8605    /// the save and the stamp (concurrent DROP), we silently skip.
8606    pub(crate) fn stamp_xmin_if_in_txn(
8607        &self,
8608        collection: &str,
8609        id: crate::storage::unified::entity::EntityId,
8610    ) {
8611        let Some(xid) = self.current_xid() else {
8612            return;
8613        };
8614        let store = self.inner.db.store();
8615        let Some(manager) = store.get_collection(collection) else {
8616            return;
8617        };
8618        if let Some(mut entity) = manager.get(id) {
8619            entity.set_xmin(xid);
8620            let _ = manager.update(entity);
8621        }
8622    }
8623
8624    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
8625    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
8626    /// pending entries with `xid < stamper_xid` stay queued because
8627    /// they belong to the enclosing scope — they'll either flush on
8628    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
8629    ///
8630    /// Returns the number of tuples whose `xmax` was wiped back to 0.
8631    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8632        let mut guard = self.inner.pending_tombstones.write();
8633        let Some(pending) = guard.get_mut(&conn_id) else {
8634            return 0;
8635        };
8636
8637        let store = self.inner.db.store();
8638        let mut revived = 0usize;
8639        pending.retain(|(collection, id, xid, previous_xmax)| {
8640            if *xid < stamper_xid {
8641                // Stamped before the savepoint — keep in queue.
8642                return true;
8643            }
8644            if let Some(manager) = store.get_collection(collection) {
8645                if let Some(mut entity) = manager.get(*id) {
8646                    if entity.xmax == *xid {
8647                        entity.set_xmax(*previous_xmax);
8648                        let _ = manager.update(entity);
8649                        revived += 1;
8650                    }
8651                }
8652            }
8653            false
8654        });
8655        if pending.is_empty() {
8656            guard.remove(&conn_id);
8657        }
8658        revived
8659    }
8660
8661    /// Return the snapshot the current connection should use for visibility
8662    /// checks (Phase 2.3 PG parity).
8663    ///
8664    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
8665    ///   the snapshot stored in its `TxnContext`.
8666    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
8667    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
8668    ///   visible so this degrades to "see everything committed".
8669    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
8670        let conn_id = current_connection_id();
8671        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
8672            return ctx.snapshot;
8673        }
8674        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
8675        // every already-committed xid (which is strictly less) passes the
8676        // `xmin <= snap.xid` gate, while concurrently-active xids land in
8677        // the `in_progress` set and stay hidden until they commit. Using
8678        // xid=0 would incorrectly hide every MVCC-stamped tuple.
8679        let high_water = self.inner.snapshot_manager.peek_next_xid();
8680        self.inner.snapshot_manager.snapshot(high_water)
8681    }
8682
8683    /// Xid of the current connection's active transaction, or `None` when
8684    /// running outside a BEGIN/COMMIT block. Write paths call this to
8685    /// decide whether to stamp `xmin`/`xmax` on tuples.
8686    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
8687    /// sub-xid so new writes can be selectively rolled back. Otherwise
8688    /// the parent txn's xid is returned, matching pre-savepoint
8689    /// behaviour. Callers that need the enclosing *transaction* xid
8690    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
8691    /// directly.
8692    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
8693        let conn_id = current_connection_id();
8694        self.inner
8695            .tx_contexts
8696            .read()
8697            .get(&conn_id)
8698            .map(|ctx| ctx.writer_xid())
8699    }
8700
8701    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
8702    /// the oldest-active xid when reclaiming dead tuples.
8703    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
8704        Arc::clone(&self.inner.snapshot_manager)
8705    }
8706
8707    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
8708        let manager = &self.inner.snapshot_manager;
8709        let next_xid = manager.peek_next_xid();
8710        let mut cutoff = next_xid;
8711        if let Some(oldest_active) = manager.oldest_active_xid() {
8712            cutoff = cutoff.min(oldest_active);
8713        }
8714        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
8715            cutoff = cutoff.min(oldest_pinned);
8716        }
8717        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
8718        if retention_xids > 0 {
8719            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
8720        }
8721        cutoff
8722    }
8723
8724    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
8725        let registered = self.inner.index_store.list_indices(table);
8726        if registered.is_empty() {
8727            return Ok(());
8728        }
8729        let store = self.inner.db.store();
8730        let Some(manager) = store.get_collection(table) else {
8731            return Ok(());
8732        };
8733        let entity_fields = manager
8734            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
8735            .into_iter()
8736            .map(|entity| (entity.id, table_row_index_fields(&entity)))
8737            .collect::<Vec<_>>();
8738
8739        for index in registered {
8740            self.inner.index_store.drop_index(&index.name, table);
8741            self.inner
8742                .index_store
8743                .create_index(
8744                    &index.name,
8745                    table,
8746                    &index.columns,
8747                    index.method,
8748                    index.unique,
8749                    &entity_fields,
8750                )
8751                .map_err(RedDBError::Internal)?;
8752            self.inner.index_store.register(index);
8753        }
8754        self.invalidate_plan_cache();
8755        Ok(())
8756    }
8757
8758    /// Own-tx xids (parent + open/released savepoints) for the current
8759    /// connection. Transports + tests that build a `SnapshotContext`
8760    /// manually (outside the `execute_query` scope) need this set so
8761    /// the writer's own uncommitted tuples stay visible to self.
8762    pub fn current_txn_own_xids(
8763        &self,
8764    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
8765        let mut set = std::collections::HashSet::new();
8766        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
8767            set.insert(ctx.xid);
8768            for (_, sub) in &ctx.savepoints {
8769                set.insert(*sub);
8770            }
8771            for sub in &ctx.released_sub_xids {
8772                set.insert(*sub);
8773            }
8774        }
8775        set
8776    }
8777
8778    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
8779    ///
8780    /// Callers use this to check whether a table name is a registered
8781    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
8782    /// scan it (`registry.scan(name)`). The read-path rewriter consults
8783    /// this before dispatching into native-collection lookup.
8784    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
8785        Arc::clone(&self.inner.foreign_tables)
8786    }
8787
8788    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
8789    pub fn is_rls_enabled(&self, table: &str) -> bool {
8790        self.inner.rls_enabled_tables.read().contains(table)
8791    }
8792
8793    /// Collect the USING predicates that apply to this `(table, role, action)`.
8794    ///
8795    /// Returned filters should be OR-combined (a row passes RLS when *any*
8796    /// matching policy accepts it) and then AND-ed into the query's WHERE.
8797    /// When the table has RLS disabled this returns an empty Vec — callers
8798    /// can fast-path back to the unfiltered read.
8799    pub fn matching_rls_policies(
8800        &self,
8801        table: &str,
8802        role: Option<&str>,
8803        action: crate::storage::query::ast::PolicyAction,
8804    ) -> Vec<crate::storage::query::ast::Filter> {
8805        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
8806        // callers that don't name a kind only see Table-scoped
8807        // policies (which is what execute SELECT / UPDATE / DELETE
8808        // expect).
8809        self.matching_rls_policies_for_kind(
8810            table,
8811            role,
8812            action,
8813            crate::storage::query::ast::PolicyTargetKind::Table,
8814        )
8815    }
8816
8817    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
8818    ///
8819    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
8820    /// `Vectors`, queue consumers request `Messages`, and timeseries
8821    /// range scans request `Points`. Policies tagged with a
8822    /// different kind are skipped so a graph-scoped policy doesn't
8823    /// accidentally gate a table SELECT on the same collection.
8824    pub fn matching_rls_policies_for_kind(
8825        &self,
8826        table: &str,
8827        role: Option<&str>,
8828        action: crate::storage::query::ast::PolicyAction,
8829        kind: crate::storage::query::ast::PolicyTargetKind,
8830    ) -> Vec<crate::storage::query::ast::Filter> {
8831        if !self.is_rls_enabled(table) {
8832            return Vec::new();
8833        }
8834        let policies = self.inner.rls_policies.read();
8835        policies
8836            .iter()
8837            .filter_map(|((t, _), p)| {
8838                if t != table {
8839                    return None;
8840                }
8841                // Kind gate — Table policies also apply to every
8842                // other kind *iff* the policy predicate evaluates
8843                // against entity fields that exist uniformly; the
8844                // caller's kind filter is the stricter check, so
8845                // match literally. Auto-tenancy policies stamp
8846                // Table and the caller passes the concrete kind —
8847                // we allow Table policies to apply cross-kind for
8848                // backwards compat.
8849                if p.target_kind != kind
8850                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
8851                {
8852                    return None;
8853                }
8854                // Action gate — `None` means "ALL" actions.
8855                if let Some(a) = p.action {
8856                    if a != action {
8857                        return None;
8858                    }
8859                }
8860                // Role gate — `None` means "any role".
8861                if let Some(p_role) = p.role.as_deref() {
8862                    match role {
8863                        Some(r) if r == p_role => {}
8864                        _ => return None,
8865                    }
8866                }
8867                Some((*p.using).clone())
8868            })
8869            .collect()
8870    }
8871
8872    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
8873        let store = self.inner.db.store();
8874        if let Some(stats) =
8875            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
8876        {
8877            crate::storage::query::planner::stats_catalog::persist_table_stats(
8878                store.as_ref(),
8879                &stats,
8880            );
8881        } else {
8882            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
8883        }
8884        self.invalidate_plan_cache();
8885    }
8886
8887    pub(crate) fn note_table_write(&self, table: &str) {
8888        // Skip the write lock when the table is already marked
8889        // dirty. With single-row UPDATEs in a loop this used to
8890        // grab the planner_dirty_tables write lock N times even
8891        // though the first call already flipped the flag.
8892        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
8893        if !already_dirty {
8894            self.inner
8895                .planner_dirty_tables
8896                .write()
8897                .insert(table.to_string());
8898        }
8899        self.invalidate_result_cache_for_table(table);
8900    }
8901
8902    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
8903    /// `RuntimeQueryResult` so callers over the SQL interface see the
8904    /// plan tree in the same shape a SELECT produces.
8905    ///
8906    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
8907    /// Nodes are walked depth-first; `depth` counts from 0 at the
8908    /// root so a text renderer can indent without re-walking.
8909    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
8910        let explain = self.explain_query(inner_sql)?;
8911
8912        let columns = vec![
8913            "op".to_string(),
8914            "source".to_string(),
8915            "est_rows".to_string(),
8916            "est_cost".to_string(),
8917            "depth".to_string(),
8918        ];
8919
8920        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
8921
8922        // Prepend `CteScan` markers when the query carried a leading
8923        // WITH clause. The CTE bodies are already inlined into the
8924        // main plan tree, but operators reading EXPLAIN need to see
8925        // which named CTEs were resolved — without this row the plan
8926        // would look indistinguishable from a hand-inlined query.
8927        for name in &explain.cte_materializations {
8928            use std::sync::Arc;
8929            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
8930            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
8931            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
8932            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
8933            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
8934            rec.set_arc(Arc::from("depth"), Value::Integer(0));
8935            records.push(rec);
8936        }
8937
8938        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
8939
8940        let result = crate::storage::query::unified::UnifiedResult {
8941            columns,
8942            records,
8943            stats: Default::default(),
8944            pre_serialized_json: None,
8945        };
8946
8947        Ok(RuntimeQueryResult {
8948            query: raw_query.to_string(),
8949            mode: explain.mode,
8950            statement: "explain",
8951            engine: "runtime-explain",
8952            result,
8953            affected_rows: 0,
8954            statement_type: "select",
8955        })
8956    }
8957
8958    // -----------------------------------------------------------------
8959    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
8960    // -----------------------------------------------------------------
8961
8962    /// Project a `QueryExpr` to the (action, resource) pair the
8963    /// privilege engine cares about. Returns `Ok(())` for statements
8964    /// that don't touch user data (transaction control, SHOW, SET, etc.).
8965    pub(super) fn check_query_privilege(
8966        &self,
8967        expr: &crate::storage::query::ast::QueryExpr,
8968    ) -> Result<(), String> {
8969        use crate::auth::privileges::{Action, AuthzContext, Resource};
8970        use crate::auth::UserId;
8971        use crate::storage::query::ast::QueryExpr;
8972
8973        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
8974        // The bootstrap path itself goes through `execute_query` so this
8975        // is the only sensible default; once auth is wired, the gate
8976        // becomes active.
8977        let auth_store = match self.inner.auth_store.read().clone() {
8978            Some(s) => s,
8979            None => return Ok(()),
8980        };
8981
8982        // Resolve principal + role from the thread-local identity.
8983        // Anonymous (no identity) is allowed to read the bootstrap path
8984        // only when auth_store says so; we treat missing identity as
8985        // platform-admin-equivalent here so embedded test harnesses
8986        // continue to work without setting an identity.
8987        let (username, role) = match current_auth_identity() {
8988            Some(p) => p,
8989            None => return Ok(()),
8990        };
8991        let tenant = current_tenant();
8992
8993        let ctx = AuthzContext {
8994            principal: &username,
8995            effective_role: role,
8996            tenant: tenant.as_deref(),
8997        };
8998        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
8999
9000        // Map QueryExpr → (Action, Resource).
9001        let (action, resource) = match expr {
9002            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
9003            QueryExpr::QueueSelect(q) => (Action::Select, Resource::table_from_name(&q.queue)),
9004            QueryExpr::Graph(g) => {
9005                if auth_store.iam_authorization_enabled() {
9006                    self.check_graph_property_projection_privilege(
9007                        &auth_store,
9008                        &principal_id,
9009                        role,
9010                        tenant.as_deref(),
9011                        g,
9012                    )?;
9013                    return Ok(());
9014                }
9015                return Ok(());
9016            }
9017            QueryExpr::Vector(v) => {
9018                if auth_store.iam_authorization_enabled() {
9019                    self.check_table_like_column_projection_privilege(
9020                        &auth_store,
9021                        &principal_id,
9022                        role,
9023                        tenant.as_deref(),
9024                        &v.collection,
9025                        &["content".to_string()],
9026                    )?;
9027                    return Ok(());
9028                }
9029                return Ok(());
9030            }
9031            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
9032            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
9033            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
9034            // Joins inherit the read privilege from any constituent
9035            // table — for now we emit a single Select on the database
9036            // (admins bypass; non-admins need a Database/Schema grant).
9037            QueryExpr::Join(_) => (Action::Select, Resource::Database),
9038            // GRANT / REVOKE / ALTER USER are authority statements;
9039            // require Admin (the helper methods enforce).
9040            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
9041                return if role == crate::auth::Role::Admin {
9042                    Ok(())
9043                } else {
9044                    Err(format!(
9045                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9046                        username, role
9047                    ))
9048                };
9049            }
9050            QueryExpr::CreateIamPolicy { id, .. } => {
9051                return self.check_policy_management_privilege(
9052                    &auth_store,
9053                    &principal_id,
9054                    role,
9055                    tenant.as_deref(),
9056                    "policy:put",
9057                    "policy",
9058                    id,
9059                );
9060            }
9061            QueryExpr::DropIamPolicy { id } => {
9062                return self.check_policy_management_privilege(
9063                    &auth_store,
9064                    &principal_id,
9065                    role,
9066                    tenant.as_deref(),
9067                    "policy:drop",
9068                    "policy",
9069                    id,
9070                );
9071            }
9072            QueryExpr::AttachPolicy { policy_id, .. } => {
9073                return self.check_policy_management_privilege(
9074                    &auth_store,
9075                    &principal_id,
9076                    role,
9077                    tenant.as_deref(),
9078                    "policy:attach",
9079                    "policy",
9080                    policy_id,
9081                );
9082            }
9083            QueryExpr::DetachPolicy { policy_id, .. } => {
9084                return self.check_policy_management_privilege(
9085                    &auth_store,
9086                    &principal_id,
9087                    role,
9088                    tenant.as_deref(),
9089                    "policy:detach",
9090                    "policy",
9091                    policy_id,
9092                );
9093            }
9094            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
9095                return Ok(());
9096            }
9097            QueryExpr::SimulatePolicy { .. } => {
9098                return self.check_policy_management_privilege(
9099                    &auth_store,
9100                    &principal_id,
9101                    role,
9102                    tenant.as_deref(),
9103                    "policy:simulate",
9104                    "policy",
9105                    "*",
9106                );
9107            }
9108            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
9109            // when IAM mode is active. Other DDL stays role-only for now.
9110            QueryExpr::DropTable(q) => {
9111                return self.check_ddl_collection_privilege(
9112                    &auth_store,
9113                    &principal_id,
9114                    role,
9115                    tenant.as_deref(),
9116                    &username,
9117                    "drop",
9118                    &q.name,
9119                );
9120            }
9121            QueryExpr::DropGraph(q) => {
9122                return self.check_ddl_collection_privilege(
9123                    &auth_store,
9124                    &principal_id,
9125                    role,
9126                    tenant.as_deref(),
9127                    &username,
9128                    "drop",
9129                    &q.name,
9130                );
9131            }
9132            QueryExpr::DropVector(q) => {
9133                return self.check_ddl_collection_privilege(
9134                    &auth_store,
9135                    &principal_id,
9136                    role,
9137                    tenant.as_deref(),
9138                    &username,
9139                    "drop",
9140                    &q.name,
9141                );
9142            }
9143            QueryExpr::DropDocument(q) => {
9144                return self.check_ddl_collection_privilege(
9145                    &auth_store,
9146                    &principal_id,
9147                    role,
9148                    tenant.as_deref(),
9149                    &username,
9150                    "drop",
9151                    &q.name,
9152                );
9153            }
9154            QueryExpr::DropKv(q) => {
9155                return self.check_ddl_collection_privilege(
9156                    &auth_store,
9157                    &principal_id,
9158                    role,
9159                    tenant.as_deref(),
9160                    &username,
9161                    "drop",
9162                    &q.name,
9163                );
9164            }
9165            QueryExpr::DropCollection(q) => {
9166                return self.check_ddl_collection_privilege(
9167                    &auth_store,
9168                    &principal_id,
9169                    role,
9170                    tenant.as_deref(),
9171                    &username,
9172                    "drop",
9173                    &q.name,
9174                );
9175            }
9176            QueryExpr::Truncate(q) => {
9177                return self.check_ddl_collection_privilege(
9178                    &auth_store,
9179                    &principal_id,
9180                    role,
9181                    tenant.as_deref(),
9182                    &username,
9183                    "truncate",
9184                    &q.name,
9185                );
9186            }
9187            // Remaining DDL — gate on Write role. Fine-grained grants TBD.
9188            QueryExpr::CreateTable(_)
9189            | QueryExpr::CreateCollection(_)
9190            | QueryExpr::CreateVector(_)
9191            | QueryExpr::AlterTable(_)
9192            | QueryExpr::CreateIndex(_)
9193            | QueryExpr::DropIndex(_)
9194            | QueryExpr::CreateSchema(_)
9195            | QueryExpr::DropSchema(_)
9196            | QueryExpr::CreateSequence(_)
9197            | QueryExpr::DropSequence(_)
9198            | QueryExpr::CreateView(_)
9199            | QueryExpr::DropView(_)
9200            | QueryExpr::RefreshMaterializedView(_)
9201            | QueryExpr::CreatePolicy(_)
9202            | QueryExpr::DropPolicy(_)
9203            | QueryExpr::CreateServer(_)
9204            | QueryExpr::DropServer(_)
9205            | QueryExpr::CreateForeignTable(_)
9206            | QueryExpr::DropForeignTable(_)
9207            | QueryExpr::CreateTimeSeries(_)
9208            | QueryExpr::DropTimeSeries(_)
9209            | QueryExpr::CreateQueue(_)
9210            | QueryExpr::AlterQueue(_)
9211            | QueryExpr::DropQueue(_)
9212            | QueryExpr::CreateTree(_)
9213            | QueryExpr::DropTree(_) => {
9214                return if role >= crate::auth::Role::Write {
9215                    Ok(())
9216                } else {
9217                    Err(format!(
9218                        "principal=`{}` role=`{:?}` cannot issue DDL",
9219                        username, role
9220                    ))
9221                };
9222            }
9223            // Migration DDL — CREATE MIGRATION requires Write role (schema author).
9224            QueryExpr::CreateMigration(_) => {
9225                return if role >= crate::auth::Role::Write {
9226                    Ok(())
9227                } else {
9228                    Err(format!(
9229                        "principal=`{}` role=`{:?}` cannot issue CREATE MIGRATION",
9230                        username, role
9231                    ))
9232                };
9233            }
9234            // APPLY / ROLLBACK change data and schema — require Admin.
9235            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
9236                return if role == crate::auth::Role::Admin {
9237                    Ok(())
9238                } else {
9239                    Err(format!(
9240                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
9241                        username, role
9242                    ))
9243                };
9244            }
9245            // EXPLAIN MIGRATION is read-only — any authenticated principal.
9246            QueryExpr::ExplainMigration(_) => return Ok(()),
9247            // Everything else (SET, SHOW, transaction control, graph
9248            // commands, queue/tree commands, MaintenanceCommand …)
9249            // is allowed for any authenticated principal.
9250            _ => return Ok(()),
9251        };
9252
9253        if auth_store.iam_authorization_enabled() {
9254            let iam_action = legacy_action_to_iam(action);
9255            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
9256            let iam_ctx = runtime_iam_context(
9257                role,
9258                tenant.as_deref(),
9259                auth_store.principal_is_system_owned(&principal_id),
9260            );
9261            if !auth_store.check_policy_authz(&principal_id, iam_action, &iam_resource, &iam_ctx) {
9262                return Err(format!(
9263                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
9264                    username, iam_action, iam_resource.kind, iam_resource.name
9265                ));
9266            }
9267
9268            if let QueryExpr::Table(table) = expr {
9269                self.check_table_column_projection_privilege(
9270                    &auth_store,
9271                    &principal_id,
9272                    &iam_ctx,
9273                    table,
9274                )?;
9275            }
9276
9277            if let QueryExpr::Update(update) = expr {
9278                let columns = update_set_target_columns(update);
9279                if !columns.is_empty() {
9280                    let request = column_access_request_for_table_update(&update.table, columns);
9281                    let outcome =
9282                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
9283                    if let Some(denied) = outcome.first_denied_column() {
9284                        return Err(format!(
9285                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
9286                            username, iam_action, denied.resource.kind, denied.resource.name
9287                        ));
9288                    }
9289                    if !outcome.allowed() {
9290                        return Err(format!(
9291                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
9292                            username,
9293                            iam_action,
9294                            outcome.table_resource.kind,
9295                            outcome.table_resource.name
9296                        ));
9297                    }
9298                }
9299
9300                if let Some(columns) = update_returning_columns_for_policy(self, update) {
9301                    let request = column_access_request_for_table_select(&update.table, columns);
9302                    let outcome =
9303                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
9304                    if let Some(denied) = outcome.first_denied_column() {
9305                        return Err(format!(
9306                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
9307                            username, denied.resource.kind, denied.resource.name
9308                        ));
9309                    }
9310                    if !outcome.allowed() {
9311                        return Err(format!(
9312                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9313                            username, outcome.table_resource.kind, outcome.table_resource.name
9314                        ));
9315                    }
9316                }
9317            }
9318
9319            Ok(())
9320        } else {
9321            auth_store
9322                .check_grant(&ctx, action, &resource)
9323                .map_err(|e| e.to_string())
9324        }
9325    }
9326
9327    fn check_table_column_projection_privilege(
9328        &self,
9329        auth_store: &Arc<crate::auth::store::AuthStore>,
9330        principal: &crate::auth::UserId,
9331        ctx: &crate::auth::policies::EvalContext,
9332        table: &crate::storage::query::ast::TableQuery,
9333    ) -> Result<(), String> {
9334        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
9335
9336        let columns = requested_table_columns_for_policy(table);
9337        if columns.is_empty() {
9338            return Ok(());
9339        }
9340
9341        let request = ColumnAccessRequest::select(table.table.clone(), columns);
9342        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
9343        if outcome.allowed() {
9344            return Ok(());
9345        }
9346
9347        if !matches!(
9348            outcome.table_decision,
9349            crate::auth::policies::Decision::Allow { .. }
9350                | crate::auth::policies::Decision::AdminBypass
9351        ) {
9352            return Err(format!(
9353                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9354                principal, outcome.table_resource.kind, outcome.table_resource.name
9355            ));
9356        }
9357
9358        let denied = outcome
9359            .first_denied_column()
9360            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
9361        match denied {
9362            Some(decision) => Err(format!(
9363                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9364                principal, decision.resource.kind, decision.resource.name
9365            )),
9366            None => Ok(()),
9367        }
9368    }
9369
9370    fn check_graph_property_projection_privilege(
9371        &self,
9372        auth_store: &Arc<crate::auth::store::AuthStore>,
9373        principal: &crate::auth::UserId,
9374        role: crate::auth::Role,
9375        tenant: Option<&str>,
9376        query: &crate::storage::query::ast::GraphQuery,
9377    ) -> Result<(), String> {
9378        let columns = explicit_graph_projection_properties(query);
9379        if columns.is_empty() {
9380            return Ok(());
9381        }
9382        self.check_table_like_column_projection_privilege(
9383            auth_store, principal, role, tenant, "graph", &columns,
9384        )
9385    }
9386
9387    fn check_table_like_column_projection_privilege(
9388        &self,
9389        auth_store: &Arc<crate::auth::store::AuthStore>,
9390        principal: &crate::auth::UserId,
9391        role: crate::auth::Role,
9392        tenant: Option<&str>,
9393        table: &str,
9394        columns: &[String],
9395    ) -> Result<(), String> {
9396        let iam_ctx = runtime_iam_context(
9397            role,
9398            tenant,
9399            auth_store.principal_is_system_owned(principal),
9400        );
9401        let request =
9402            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
9403        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
9404        if outcome.allowed() {
9405            return Ok(());
9406        }
9407        let denied = outcome
9408            .first_denied_column()
9409            .map(|d| d.resource.name.clone())
9410            .unwrap_or_else(|| format!("{table}.<unknown>"));
9411        Err(format!(
9412            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
9413            principal, denied
9414        ))
9415    }
9416
9417    fn check_policy_management_privilege(
9418        &self,
9419        auth_store: &Arc<crate::auth::store::AuthStore>,
9420        principal: &crate::auth::UserId,
9421        role: crate::auth::Role,
9422        tenant: Option<&str>,
9423        action: &str,
9424        resource_kind: &str,
9425        resource_name: &str,
9426    ) -> Result<(), String> {
9427        if !auth_store.iam_authorization_enabled() {
9428            return if role == crate::auth::Role::Admin {
9429                Ok(())
9430            } else {
9431                Err(format!(
9432                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9433                    principal, role
9434                ))
9435            };
9436        }
9437
9438        let mut resource = crate::auth::policies::ResourceRef::new(
9439            resource_kind.to_string(),
9440            resource_name.to_string(),
9441        );
9442        if let Some(t) = tenant {
9443            resource = resource.with_tenant(t.to_string());
9444        }
9445        let ctx = runtime_iam_context(
9446            role,
9447            tenant,
9448            auth_store.principal_is_system_owned(principal),
9449        );
9450        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
9451            Ok(())
9452        } else {
9453            Err(format!(
9454                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
9455                principal, action, resource.kind, resource.name
9456            ))
9457        }
9458    }
9459
9460    /// IAM privilege check for DROP / TRUNCATE on a named collection.
9461    ///
9462    /// In legacy mode (IAM not enabled): requires Write role.
9463    /// In IAM mode: requires an explicit `drop` / `truncate` policy on
9464    /// `collection:<name>`; admin authority allows the action only when no
9465    /// explicit Deny matches.
9466    /// Records an audit log entry for both allow and deny outcomes.
9467    fn check_ddl_collection_privilege(
9468        &self,
9469        auth_store: &Arc<crate::auth::store::AuthStore>,
9470        principal: &crate::auth::UserId,
9471        role: crate::auth::Role,
9472        tenant: Option<&str>,
9473        username: &str,
9474        action: &str,
9475        collection: &str,
9476    ) -> Result<(), String> {
9477        if role < crate::auth::Role::Write {
9478            let msg = format!(
9479                "principal=`{}` role=`{:?}` cannot issue DDL",
9480                username, role
9481            );
9482            self.inner.audit_log.record(
9483                action,
9484                username,
9485                collection,
9486                "denied",
9487                crate::json::Value::Null,
9488            );
9489            return Err(msg);
9490        }
9491
9492        if !auth_store.iam_authorization_enabled() {
9493            self.inner.audit_log.record(
9494                action,
9495                username,
9496                collection,
9497                "ok",
9498                crate::json::Value::Null,
9499            );
9500            return Ok(());
9501        }
9502
9503        let resource_name = collection.to_string();
9504        let mut resource = crate::auth::policies::ResourceRef::new(
9505            "collection".to_string(),
9506            resource_name.clone(),
9507        );
9508        if let Some(t) = tenant {
9509            resource = resource.with_tenant(t.to_string());
9510        }
9511        let ctx = runtime_iam_context(
9512            role,
9513            tenant,
9514            auth_store.principal_is_system_owned(principal),
9515        );
9516        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
9517            self.inner.audit_log.record(
9518                action,
9519                username,
9520                &resource_name,
9521                "ok",
9522                crate::json::Value::Null,
9523            );
9524            Ok(())
9525        } else {
9526            self.inner.audit_log.record(
9527                action,
9528                username,
9529                &resource_name,
9530                "denied",
9531                crate::json::Value::Null,
9532            );
9533            Err(format!(
9534                "principal=`{}` action=`{}` resource=`collection:{}` denied by IAM policy",
9535                username, action, resource_name
9536            ))
9537        }
9538    }
9539
9540    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
9541    fn execute_grant_statement(
9542        &self,
9543        query: &str,
9544        stmt: &crate::storage::query::ast::GrantStmt,
9545    ) -> RedDBResult<RuntimeQueryResult> {
9546        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
9547        use crate::auth::UserId;
9548        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
9549
9550        let auth_store = self
9551            .inner
9552            .auth_store
9553            .read()
9554            .clone()
9555            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9556
9557        // Granter identity + role.
9558        let (gname, grole) = current_auth_identity().ok_or_else(|| {
9559            RedDBError::Query("GRANT requires an authenticated principal".to_string())
9560        })?;
9561        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
9562        let granter_role = grole;
9563
9564        // Build the action set.
9565        let mut actions: Vec<Action> = Vec::new();
9566        if stmt.all {
9567            actions.push(Action::All);
9568        } else {
9569            for kw in &stmt.actions {
9570                let a = Action::from_keyword(kw).ok_or_else(|| {
9571                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
9572                })?;
9573                actions.push(a);
9574            }
9575        }
9576
9577        // Audit emit (printed; structured emission is Agent #4's lane).
9578        let mut applied = 0usize;
9579        for obj in &stmt.objects {
9580            let resource = match stmt.object_kind {
9581                GrantObjectKind::Table => Resource::Table {
9582                    schema: obj.schema.clone(),
9583                    table: obj.name.clone(),
9584                },
9585                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
9586                GrantObjectKind::Database => Resource::Database,
9587                GrantObjectKind::Function => Resource::Function {
9588                    schema: obj.schema.clone(),
9589                    name: obj.name.clone(),
9590                },
9591            };
9592            for principal in &stmt.principals {
9593                let p = match principal {
9594                    GrantPrincipalRef::Public => GrantPrincipal::Public,
9595                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
9596                    GrantPrincipalRef::User { tenant, name } => {
9597                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
9598                    }
9599                };
9600                // Tenant of the grant follows the granter's tenant
9601                // (cross-tenant guard inside `AuthStore::grant`).
9602                let tenant = granter.tenant.clone();
9603                auth_store
9604                    .grant(
9605                        &granter,
9606                        granter_role,
9607                        p.clone(),
9608                        resource.clone(),
9609                        actions.clone(),
9610                        stmt.with_grant_option,
9611                        tenant.clone(),
9612                    )
9613                    .map_err(|e| RedDBError::Query(e.to_string()))?;
9614
9615                // IAM policy translation: every GRANT also lands as a
9616                // synthetic `_grant_<id>` policy attached to the
9617                // principal so the new evaluator sees it.
9618                if let Some(policy) =
9619                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
9620                {
9621                    let pid = policy.id.clone();
9622                    auth_store
9623                        .put_policy_internal(policy)
9624                        .map_err(|e| RedDBError::Query(e.to_string()))?;
9625                    let attachment = match &p {
9626                        GrantPrincipal::User(uid) => {
9627                            crate::auth::store::PrincipalRef::User(uid.clone())
9628                        }
9629                        GrantPrincipal::Group(group) => {
9630                            crate::auth::store::PrincipalRef::Group(group.clone())
9631                        }
9632                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
9633                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
9634                        ),
9635                    };
9636                    auth_store
9637                        .attach_policy(attachment, &pid)
9638                        .map_err(|e| RedDBError::Query(e.to_string()))?;
9639                }
9640                applied += 1;
9641                tracing::info!(
9642                    target: "audit",
9643                    principal = %granter,
9644                    action = "grant",
9645                    "GRANT applied"
9646                );
9647            }
9648        }
9649
9650        self.invalidate_result_cache();
9651        Ok(RuntimeQueryResult::ok_message(
9652            query.to_string(),
9653            &format!("GRANT applied to {} target(s)", applied),
9654            "grant",
9655        ))
9656    }
9657
9658    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
9659    fn execute_revoke_statement(
9660        &self,
9661        query: &str,
9662        stmt: &crate::storage::query::ast::RevokeStmt,
9663    ) -> RedDBResult<RuntimeQueryResult> {
9664        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
9665        use crate::auth::UserId;
9666        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
9667
9668        let auth_store = self
9669            .inner
9670            .auth_store
9671            .read()
9672            .clone()
9673            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9674
9675        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
9676            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
9677        })?;
9678        let granter_role = grole;
9679
9680        let actions: Vec<Action> = if stmt.all {
9681            vec![Action::All]
9682        } else {
9683            stmt.actions
9684                .iter()
9685                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
9686                .collect()
9687        };
9688
9689        let mut total_removed = 0usize;
9690        for obj in &stmt.objects {
9691            let resource = match stmt.object_kind {
9692                GrantObjectKind::Table => Resource::Table {
9693                    schema: obj.schema.clone(),
9694                    table: obj.name.clone(),
9695                },
9696                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
9697                GrantObjectKind::Database => Resource::Database,
9698                GrantObjectKind::Function => Resource::Function {
9699                    schema: obj.schema.clone(),
9700                    name: obj.name.clone(),
9701                },
9702            };
9703            for principal in &stmt.principals {
9704                let p = match principal {
9705                    GrantPrincipalRef::Public => GrantPrincipal::Public,
9706                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
9707                    GrantPrincipalRef::User { tenant, name } => {
9708                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
9709                    }
9710                };
9711                let removed = auth_store
9712                    .revoke(granter_role, &p, &resource, &actions)
9713                    .map_err(|e| RedDBError::Query(e.to_string()))?;
9714                let _removed_policies =
9715                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
9716                total_removed += removed;
9717            }
9718        }
9719
9720        self.invalidate_result_cache();
9721        Ok(RuntimeQueryResult::ok_message(
9722            query.to_string(),
9723            &format!("REVOKE removed {} grant(s)", total_removed),
9724            "revoke",
9725        ))
9726    }
9727
9728    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
9729    fn execute_alter_user_statement(
9730        &self,
9731        query: &str,
9732        stmt: &crate::storage::query::ast::AlterUserStmt,
9733    ) -> RedDBResult<RuntimeQueryResult> {
9734        use crate::auth::privileges::UserAttributes;
9735        use crate::auth::UserId;
9736        use crate::storage::query::ast::AlterUserAttribute;
9737
9738        let auth_store = self
9739            .inner
9740            .auth_store
9741            .read()
9742            .clone()
9743            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9744
9745        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
9746            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
9747        })?;
9748        if grole != crate::auth::Role::Admin {
9749            return Err(RedDBError::Query(
9750                "ALTER USER requires Admin role".to_string(),
9751            ));
9752        }
9753
9754        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
9755
9756        // Apply attributes incrementally — each one reads the current
9757        // record, mutates the relevant field, writes back.
9758        let mut attrs = auth_store.user_attributes(&target);
9759        let mut enable_change: Option<bool> = None;
9760
9761        for a in &stmt.attributes {
9762            match a {
9763                AlterUserAttribute::ValidUntil(ts) => {
9764                    // Parse ISO-ish timestamp → ms since epoch. Fall
9765                    // back to integer-ms parsing for callers that pass
9766                    // `'1234567890123'`.
9767                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
9768                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
9769                    })?;
9770                    attrs.valid_until = Some(ms);
9771                }
9772                AlterUserAttribute::ConnectionLimit(n) => {
9773                    if *n < 0 {
9774                        return Err(RedDBError::Query(
9775                            "CONNECTION LIMIT must be non-negative".to_string(),
9776                        ));
9777                    }
9778                    attrs.connection_limit = Some(*n as u32);
9779                }
9780                AlterUserAttribute::SetSearchPath(p) => {
9781                    attrs.search_path = Some(p.clone());
9782                }
9783                AlterUserAttribute::AddGroup(g) => {
9784                    if !attrs.groups.iter().any(|existing| existing == g) {
9785                        attrs.groups.push(g.clone());
9786                        attrs.groups.sort();
9787                    }
9788                }
9789                AlterUserAttribute::DropGroup(g) => {
9790                    attrs.groups.retain(|existing| existing != g);
9791                }
9792                AlterUserAttribute::Enable => enable_change = Some(true),
9793                AlterUserAttribute::Disable => enable_change = Some(false),
9794                AlterUserAttribute::Password(_) => {
9795                    // Out of scope — accept the AST but no-op so the
9796                    // parser stays compatible with future password
9797                    // rotation work.
9798                }
9799            }
9800        }
9801
9802        auth_store
9803            .set_user_attributes(&target, attrs)
9804            .map_err(|e| RedDBError::Query(e.to_string()))?;
9805        if let Some(en) = enable_change {
9806            auth_store
9807                .set_user_enabled(&target, en)
9808                .map_err(|e| RedDBError::Query(e.to_string()))?;
9809        }
9810        self.invalidate_result_cache();
9811        tracing::info!(
9812            target: "audit",
9813            principal = %target,
9814            action = "alter_user",
9815            "ALTER USER applied"
9816        );
9817
9818        Ok(RuntimeQueryResult::ok_message(
9819            query.to_string(),
9820            &format!("ALTER USER {} applied", target),
9821            "alter_user",
9822        ))
9823    }
9824
9825    // -----------------------------------------------------------------
9826    // IAM policy executors
9827    // -----------------------------------------------------------------
9828
9829    fn execute_create_iam_policy(
9830        &self,
9831        query: &str,
9832        id: &str,
9833        json: &str,
9834    ) -> RedDBResult<RuntimeQueryResult> {
9835        use crate::auth::policies::Policy;
9836
9837        let auth_store = self
9838            .inner
9839            .auth_store
9840            .read()
9841            .clone()
9842            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9843
9844        // Parse + validate. The kernel rejects oversize / bad shape /
9845        // bad action keywords. If the supplied id differs from the JSON
9846        // id, override it with the SQL-provided id (the JSON id is
9847        // optional context — the SQL DDL form is authoritative).
9848        let mut policy = Policy::from_json_str(json)
9849            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
9850        if policy.id != id {
9851            policy.id = id.to_string();
9852        }
9853        let pid = policy.id.clone();
9854        auth_store
9855            .put_policy(policy)
9856            .map_err(|e| RedDBError::Query(e.to_string()))?;
9857
9858        let principal = current_auth_identity()
9859            .map(|(u, _)| u)
9860            .unwrap_or_else(|| "anonymous".into());
9861        tracing::info!(
9862            target: "audit",
9863            principal = %principal,
9864            action = "iam:policy.put",
9865            matched_policy_id = %pid,
9866            "CREATE POLICY applied"
9867        );
9868        self.inner.audit_log.record(
9869            "iam/policy.put",
9870            &principal,
9871            &pid,
9872            "ok",
9873            crate::json::Value::Null,
9874        );
9875
9876        self.invalidate_result_cache();
9877        Ok(RuntimeQueryResult::ok_message(
9878            query.to_string(),
9879            &format!("policy `{pid}` stored"),
9880            "create_iam_policy",
9881        ))
9882    }
9883
9884    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
9885        let auth_store = self
9886            .inner
9887            .auth_store
9888            .read()
9889            .clone()
9890            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9891        auth_store
9892            .delete_policy(id)
9893            .map_err(|e| RedDBError::Query(e.to_string()))?;
9894
9895        let principal = current_auth_identity()
9896            .map(|(u, _)| u)
9897            .unwrap_or_else(|| "anonymous".into());
9898        tracing::info!(
9899            target: "audit",
9900            principal = %principal,
9901            action = "iam:policy.drop",
9902            matched_policy_id = %id,
9903            "DROP POLICY applied"
9904        );
9905        self.inner.audit_log.record(
9906            "iam/policy.drop",
9907            &principal,
9908            id,
9909            "ok",
9910            crate::json::Value::Null,
9911        );
9912
9913        self.invalidate_result_cache();
9914        Ok(RuntimeQueryResult::ok_message(
9915            query.to_string(),
9916            &format!("policy `{id}` dropped"),
9917            "drop_iam_policy",
9918        ))
9919    }
9920
9921    fn execute_attach_policy(
9922        &self,
9923        query: &str,
9924        policy_id: &str,
9925        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9926    ) -> RedDBResult<RuntimeQueryResult> {
9927        use crate::auth::store::PrincipalRef;
9928        use crate::auth::UserId;
9929        use crate::storage::query::ast::PolicyPrincipalRef;
9930
9931        let auth_store = self
9932            .inner
9933            .auth_store
9934            .read()
9935            .clone()
9936            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9937        let p = match principal {
9938            PolicyPrincipalRef::User(u) => {
9939                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9940            }
9941            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9942        };
9943        let pretty_target = principal_label(principal);
9944        auth_store
9945            .attach_policy(p, policy_id)
9946            .map_err(|e| RedDBError::Query(e.to_string()))?;
9947
9948        let principal_str = current_auth_identity()
9949            .map(|(u, _)| u)
9950            .unwrap_or_else(|| "anonymous".into());
9951        tracing::info!(
9952            target: "audit",
9953            principal = %principal_str,
9954            action = "iam:policy.attach",
9955            matched_policy_id = %policy_id,
9956            target = %pretty_target,
9957            "ATTACH POLICY applied"
9958        );
9959        self.inner.audit_log.record(
9960            "iam/policy.attach",
9961            &principal_str,
9962            &pretty_target,
9963            "ok",
9964            crate::json::Value::Null,
9965        );
9966
9967        self.invalidate_result_cache();
9968        Ok(RuntimeQueryResult::ok_message(
9969            query.to_string(),
9970            &format!("policy `{policy_id}` attached to {pretty_target}"),
9971            "attach_policy",
9972        ))
9973    }
9974
9975    fn execute_detach_policy(
9976        &self,
9977        query: &str,
9978        policy_id: &str,
9979        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9980    ) -> RedDBResult<RuntimeQueryResult> {
9981        use crate::auth::store::PrincipalRef;
9982        use crate::auth::UserId;
9983        use crate::storage::query::ast::PolicyPrincipalRef;
9984
9985        let auth_store = self
9986            .inner
9987            .auth_store
9988            .read()
9989            .clone()
9990            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9991        let p = match principal {
9992            PolicyPrincipalRef::User(u) => {
9993                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9994            }
9995            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9996        };
9997        let pretty_target = principal_label(principal);
9998        auth_store
9999            .detach_policy(p, policy_id)
10000            .map_err(|e| RedDBError::Query(e.to_string()))?;
10001
10002        let principal_str = current_auth_identity()
10003            .map(|(u, _)| u)
10004            .unwrap_or_else(|| "anonymous".into());
10005        tracing::info!(
10006            target: "audit",
10007            principal = %principal_str,
10008            action = "iam:policy.detach",
10009            matched_policy_id = %policy_id,
10010            target = %pretty_target,
10011            "DETACH POLICY applied"
10012        );
10013        self.inner.audit_log.record(
10014            "iam/policy.detach",
10015            &principal_str,
10016            &pretty_target,
10017            "ok",
10018            crate::json::Value::Null,
10019        );
10020
10021        self.invalidate_result_cache();
10022        Ok(RuntimeQueryResult::ok_message(
10023            query.to_string(),
10024            &format!("policy `{policy_id}` detached from {pretty_target}"),
10025            "detach_policy",
10026        ))
10027    }
10028
10029    fn execute_show_policies(
10030        &self,
10031        query: &str,
10032        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
10033    ) -> RedDBResult<RuntimeQueryResult> {
10034        use crate::auth::UserId;
10035        use crate::storage::query::ast::PolicyPrincipalRef;
10036        use crate::storage::query::unified::UnifiedRecord;
10037        use crate::storage::schema::Value as SchemaValue;
10038        use std::sync::Arc;
10039
10040        let auth_store = self
10041            .inner
10042            .auth_store
10043            .read()
10044            .clone()
10045            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10046
10047        let pols = match filter {
10048            None => auth_store.list_policies(),
10049            Some(PolicyPrincipalRef::User(u)) => {
10050                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
10051                auth_store.effective_policies(&id)
10052            }
10053            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
10054        };
10055
10056        let mut records = Vec::with_capacity(pols.len());
10057        for p in pols.iter() {
10058            let mut rec = UnifiedRecord::default();
10059            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
10060            rec.set_arc(
10061                Arc::from("statements"),
10062                SchemaValue::Integer(p.statements.len() as i64),
10063            );
10064            rec.set_arc(
10065                Arc::from("tenant"),
10066                p.tenant
10067                    .as_deref()
10068                    .map(|t| SchemaValue::text(t.to_string()))
10069                    .unwrap_or(SchemaValue::Null),
10070            );
10071            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
10072            records.push(rec);
10073        }
10074        let mut result = crate::storage::query::unified::UnifiedResult::empty();
10075        result.records = records;
10076        Ok(RuntimeQueryResult {
10077            query: query.to_string(),
10078            mode: crate::storage::query::modes::QueryMode::Sql,
10079            statement: "show_policies",
10080            engine: "iam-policies",
10081            result,
10082            affected_rows: 0,
10083            statement_type: "select",
10084        })
10085    }
10086
10087    fn execute_show_effective_permissions(
10088        &self,
10089        query: &str,
10090        user: &crate::storage::query::ast::PolicyUserRef,
10091        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
10092    ) -> RedDBResult<RuntimeQueryResult> {
10093        use crate::auth::UserId;
10094        use crate::storage::query::unified::UnifiedRecord;
10095        use crate::storage::schema::Value as SchemaValue;
10096        use std::sync::Arc;
10097
10098        let auth_store = self
10099            .inner
10100            .auth_store
10101            .read()
10102            .clone()
10103            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10104        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
10105        let pols = auth_store.effective_policies(&id);
10106
10107        // Show one row per (policy, statement) tuple, plus any
10108        // resource-level filter passed by the caller.
10109        let mut records = Vec::new();
10110        for p in pols.iter() {
10111            for (idx, st) in p.statements.iter().enumerate() {
10112                if let Some(_r) = resource {
10113                    // Naive filter: render statement targets to strings
10114                    // and skip if no match. Conservative default = include
10115                    // (the simulator handles fine-grained matching).
10116                }
10117                let mut rec = UnifiedRecord::default();
10118                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
10119                rec.set_arc(
10120                    Arc::from("statement_index"),
10121                    SchemaValue::Integer(idx as i64),
10122                );
10123                rec.set_arc(
10124                    Arc::from("sid"),
10125                    st.sid
10126                        .as_deref()
10127                        .map(|s| SchemaValue::text(s.to_string()))
10128                        .unwrap_or(SchemaValue::Null),
10129                );
10130                rec.set_arc(
10131                    Arc::from("effect"),
10132                    SchemaValue::text(match st.effect {
10133                        crate::auth::policies::Effect::Allow => "allow",
10134                        crate::auth::policies::Effect::Deny => "deny",
10135                    }),
10136                );
10137                rec.set_arc(
10138                    Arc::from("actions"),
10139                    SchemaValue::Integer(st.actions.len() as i64),
10140                );
10141                rec.set_arc(
10142                    Arc::from("resources"),
10143                    SchemaValue::Integer(st.resources.len() as i64),
10144                );
10145                records.push(rec);
10146            }
10147        }
10148        let mut result = crate::storage::query::unified::UnifiedResult::empty();
10149        result.records = records;
10150        Ok(RuntimeQueryResult {
10151            query: query.to_string(),
10152            mode: crate::storage::query::modes::QueryMode::Sql,
10153            statement: "show_effective_permissions",
10154            engine: "iam-policies",
10155            result,
10156            affected_rows: 0,
10157            statement_type: "select",
10158        })
10159    }
10160
10161    fn execute_simulate_policy(
10162        &self,
10163        query: &str,
10164        user: &crate::storage::query::ast::PolicyUserRef,
10165        action: &str,
10166        resource: &crate::storage::query::ast::PolicyResourceRef,
10167    ) -> RedDBResult<RuntimeQueryResult> {
10168        use crate::auth::policies::ResourceRef;
10169        use crate::auth::store::SimCtx;
10170        use crate::auth::UserId;
10171        use crate::storage::query::unified::UnifiedRecord;
10172        use crate::storage::schema::Value as SchemaValue;
10173        use std::sync::Arc;
10174
10175        let auth_store = self
10176            .inner
10177            .auth_store
10178            .read()
10179            .clone()
10180            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10181        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
10182        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
10183        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
10184
10185        let principal_str = current_auth_identity()
10186            .map(|(u, _)| u)
10187            .unwrap_or_else(|| "anonymous".into());
10188        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
10189        tracing::info!(
10190            target: "audit",
10191            principal = %principal_str,
10192            action = "iam:policy.simulate",
10193            decision = %decision_str,
10194            matched_policy_id = ?matched_pid,
10195            matched_sid = ?matched_sid,
10196            "SIMULATE issued"
10197        );
10198        self.inner.audit_log.record(
10199            "iam/policy.simulate",
10200            &principal_str,
10201            &id.to_string(),
10202            "ok",
10203            crate::json::Value::Null,
10204        );
10205
10206        let mut rec = UnifiedRecord::default();
10207        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
10208        rec.set_arc(
10209            Arc::from("matched_policy_id"),
10210            matched_pid
10211                .map(SchemaValue::text)
10212                .unwrap_or(SchemaValue::Null),
10213        );
10214        rec.set_arc(
10215            Arc::from("matched_sid"),
10216            matched_sid
10217                .map(SchemaValue::text)
10218                .unwrap_or(SchemaValue::Null),
10219        );
10220        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
10221        rec.set_arc(
10222            Arc::from("trail_len"),
10223            SchemaValue::Integer(outcome.trail.len() as i64),
10224        );
10225        let mut result = crate::storage::query::unified::UnifiedResult::empty();
10226        result.records = vec![rec];
10227        Ok(RuntimeQueryResult {
10228            query: query.to_string(),
10229            mode: crate::storage::query::modes::QueryMode::Sql,
10230            statement: "simulate_policy",
10231            engine: "iam-policies",
10232            result,
10233            affected_rows: 0,
10234            statement_type: "select",
10235        })
10236    }
10237}
10238
10239/// Translate a parsed GRANT into a synthetic IAM policy whose id
10240/// starts with `_grant_<unique>`. PUBLIC is represented as an
10241/// implicit IAM group; legacy GROUP grants are still rejected by the
10242/// grant store and are not translated here.
10243fn grant_to_iam_policy(
10244    principal: &crate::auth::privileges::GrantPrincipal,
10245    resource: &crate::auth::privileges::Resource,
10246    actions: &[crate::auth::privileges::Action],
10247    tenant: Option<&str>,
10248) -> Option<crate::auth::policies::Policy> {
10249    use crate::auth::policies::{
10250        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
10251    };
10252    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10253
10254    if matches!(principal, GrantPrincipal::Group(_)) {
10255        return None;
10256    }
10257
10258    let now = crate::auth::now_ms();
10259    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
10260
10261    let resource_str = match resource {
10262        Resource::Database => "table:*".to_string(),
10263        Resource::Schema(s) => format!("table:{s}.*"),
10264        Resource::Table { schema, table } => match schema {
10265            Some(s) => format!("table:{s}.{table}"),
10266            None => format!("table:{table}"),
10267        },
10268        Resource::Function { schema, name } => match schema {
10269            Some(s) => format!("function:{s}.{name}"),
10270            None => format!("function:{name}"),
10271        },
10272    };
10273
10274    // Compile actions — fall back to `*` only when the grant included
10275    // `Action::All`. Map every other action keyword to its lowercase
10276    // form so it lines up with the kernel's allowlist.
10277    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
10278        vec![ActionPattern::Wildcard]
10279    } else {
10280        actions
10281            .iter()
10282            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
10283            .collect()
10284    };
10285    if action_patterns.is_empty() {
10286        return None;
10287    }
10288
10289    // Inline resource compilation matching the kernel's `compile_resource`:
10290    //   * `*` → wildcard
10291    //   * contains `*` → glob
10292    //   * `kind:name` → exact
10293    let resource_patterns = if resource_str == "*" {
10294        vec![ResourcePattern::Wildcard]
10295    } else if resource_str.contains('*') {
10296        vec![ResourcePattern::Glob(resource_str.clone())]
10297    } else if let Some((kind, name)) = resource_str.split_once(':') {
10298        vec![ResourcePattern::Exact {
10299            kind: kind.to_string(),
10300            name: name.to_string(),
10301        }]
10302    } else {
10303        vec![ResourcePattern::Wildcard]
10304    };
10305
10306    let policy = Policy {
10307        id,
10308        version: 1,
10309        tenant: tenant.map(|t| t.to_string()),
10310        created_at: now,
10311        updated_at: now,
10312        statements: vec![Statement {
10313            sid: None,
10314            effect: Effect::Allow,
10315            actions: action_patterns,
10316            resources: resource_patterns,
10317            condition: None,
10318        }],
10319    };
10320    if policy.validate().is_err() {
10321        return None;
10322    }
10323    Some(policy)
10324}
10325
10326fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
10327    use crate::auth::privileges::Action;
10328    match action {
10329        Action::Select => "select",
10330        Action::Insert => "insert",
10331        Action::Update => "update",
10332        Action::Delete => "delete",
10333        Action::Truncate => "truncate",
10334        Action::References => "references",
10335        Action::Execute => "execute",
10336        Action::Usage => "usage",
10337        Action::All => "*",
10338    }
10339}
10340
10341fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
10342    let mut columns = Vec::new();
10343    for (column, _) in &query.assignment_exprs {
10344        if !columns.iter().any(|seen| seen == column) {
10345            columns.push(column.clone());
10346        }
10347    }
10348    columns
10349}
10350
10351fn column_access_request_for_table_update(
10352    table_name: &str,
10353    columns: Vec<String>,
10354) -> crate::auth::ColumnAccessRequest {
10355    match table_name.split_once('.') {
10356        Some((schema, table)) => {
10357            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
10358                .with_schema(schema.to_string())
10359        }
10360        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
10361    }
10362}
10363
10364fn column_access_request_for_table_select(
10365    table_name: &str,
10366    columns: Vec<String>,
10367) -> crate::auth::ColumnAccessRequest {
10368    match table_name.split_once('.') {
10369        Some((schema, table)) => {
10370            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
10371                .with_schema(schema.to_string())
10372        }
10373        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
10374    }
10375}
10376
10377fn update_returning_columns_for_policy(
10378    runtime: &RedDBRuntime,
10379    query: &crate::storage::query::ast::UpdateQuery,
10380) -> Option<Vec<String>> {
10381    let items = query.returning.as_ref()?;
10382    let mut columns = Vec::new();
10383    let project_all = items
10384        .iter()
10385        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
10386    if project_all {
10387        collect_returning_star_columns(runtime, query, &mut columns);
10388    } else {
10389        for item in items {
10390            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
10391                continue;
10392            };
10393            push_returning_policy_column(&mut columns, column);
10394        }
10395    }
10396    (!columns.is_empty()).then_some(columns)
10397}
10398
10399fn collect_returning_star_columns(
10400    runtime: &RedDBRuntime,
10401    query: &crate::storage::query::ast::UpdateQuery,
10402    columns: &mut Vec<String>,
10403) {
10404    let store = runtime.db().store();
10405    let Some(manager) = store.get_collection(&query.table) else {
10406        return;
10407    };
10408    if let Some(schema) = manager.column_schema() {
10409        for column in schema.iter() {
10410            push_returning_policy_column(columns, column);
10411        }
10412    }
10413    for entity in manager.query_all(|_| true) {
10414        if !returning_entity_matches_update_target(&entity, query.target) {
10415            continue;
10416        }
10417        match &entity.data {
10418            crate::storage::EntityData::Row(row) => {
10419                for (column, _) in row.iter_fields() {
10420                    push_returning_policy_column(columns, column);
10421                }
10422            }
10423            crate::storage::EntityData::Node(node) => {
10424                push_returning_policy_column(columns, "label");
10425                push_returning_policy_column(columns, "node_type");
10426                for column in node.properties.keys() {
10427                    push_returning_policy_column(columns, column);
10428                }
10429            }
10430            crate::storage::EntityData::Edge(edge) => {
10431                push_returning_policy_column(columns, "label");
10432                push_returning_policy_column(columns, "from_rid");
10433                push_returning_policy_column(columns, "to_rid");
10434                push_returning_policy_column(columns, "weight");
10435                for column in edge.properties.keys() {
10436                    push_returning_policy_column(columns, column);
10437                }
10438            }
10439            _ => {}
10440        }
10441    }
10442}
10443
10444fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
10445    if returning_public_envelope_column(column) {
10446        return;
10447    }
10448    if !columns.iter().any(|seen| seen == column) {
10449        columns.push(column.to_string());
10450    }
10451}
10452
/// Reports whether `column` is one of the envelope column names that
/// `push_returning_policy_column` leaves out of RETURNING policy checks.
///
/// The comparison is ASCII case-insensitive and performed without allocating
/// a lowercased copy, since this is called once per field while scanning
/// entities.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 7] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
        "red_entity_id",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|name| column.eq_ignore_ascii_case(name))
}
10459
10460fn returning_entity_matches_update_target(
10461    entity: &crate::storage::UnifiedEntity,
10462    target: crate::storage::query::ast::UpdateTarget,
10463) -> bool {
10464    use crate::storage::query::ast::UpdateTarget;
10465    match target {
10466        UpdateTarget::Rows => {
10467            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
10468        }
10469        UpdateTarget::Documents => {
10470            matches!(
10471                returning_row_item_kind(entity),
10472                Some(ReturningRowKind::Document)
10473            )
10474        }
10475        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
10476        UpdateTarget::Nodes => matches!(
10477            (&entity.kind, &entity.data),
10478            (
10479                crate::storage::EntityKind::GraphNode(_),
10480                crate::storage::EntityData::Node(_)
10481            )
10482        ),
10483        UpdateTarget::Edges => matches!(
10484            (&entity.kind, &entity.data),
10485            (
10486                crate::storage::EntityKind::GraphEdge(_),
10487                crate::storage::EntityData::Edge(_)
10488            )
10489        ),
10490    }
10491}
10492
/// Classification of a row-shaped entity, as produced by
/// `returning_row_item_kind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReturningRowKind {
    /// A row that is neither key/value-shaped nor carries a JSON value.
    Row,
    /// A row with at least one JSON-valued field.
    Document,
    /// A row whose fields are all named `key` or `value` (ASCII
    /// case-insensitive).
    Kv,
}
10499
10500fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
10501    let row = entity.data.as_row()?;
10502    let is_kv = row.iter_fields().all(|(column, _)| {
10503        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
10504    });
10505    if is_kv {
10506        return Some(ReturningRowKind::Kv);
10507    }
10508    let is_document = row
10509        .iter_fields()
10510        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
10511    if is_document {
10512        Some(ReturningRowKind::Document)
10513    } else {
10514        Some(ReturningRowKind::Row)
10515    }
10516}
10517
10518fn requested_table_columns_for_policy(
10519    table: &crate::storage::query::ast::TableQuery,
10520) -> Vec<String> {
10521    use crate::storage::query::sql_lowering::{
10522        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
10523        effective_table_projections,
10524    };
10525
10526    let table_name = table.table.as_str();
10527    let table_alias = table.alias.as_deref();
10528    let mut columns = std::collections::BTreeSet::new();
10529
10530    for projection in effective_table_projections(table) {
10531        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
10532    }
10533    if let Some(filter) = effective_table_filter(table) {
10534        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
10535    }
10536    for expr in effective_table_group_by_exprs(table) {
10537        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
10538    }
10539    if let Some(filter) = effective_table_having_filter(table) {
10540        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
10541    }
10542    for order in &table.order_by {
10543        if let Some(expr) = order.expr.as_ref() {
10544            collect_expr_columns(expr, table_name, table_alias, &mut columns);
10545        } else {
10546            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
10547        }
10548    }
10549
10550    columns.into_iter().collect()
10551}
10552
10553fn collect_projection_columns(
10554    projection: &crate::storage::query::ast::Projection,
10555    table_name: &str,
10556    table_alias: Option<&str>,
10557    columns: &mut std::collections::BTreeSet<String>,
10558) {
10559    use crate::storage::query::ast::Projection;
10560    match projection {
10561        Projection::All => {
10562            columns.insert("*".to_string());
10563        }
10564        Projection::Column(column) | Projection::Alias(column, _) => {
10565            if column != "*" {
10566                columns.insert(column.clone());
10567            }
10568        }
10569        Projection::Function(_, args) => {
10570            for arg in args {
10571                collect_projection_columns(arg, table_name, table_alias, columns);
10572            }
10573        }
10574        Projection::Expression(filter, _) => {
10575            collect_filter_columns(filter, table_name, table_alias, columns);
10576        }
10577        Projection::Field(field, _) => {
10578            collect_field_ref_column(field, table_name, table_alias, columns);
10579        }
10580        // Slice 7a (#589): no runtime support yet; recurse into args so
10581        // any column references are still tracked in case a future
10582        // executor needs the column set.
10583        Projection::Window { args, .. } => {
10584            for arg in args {
10585                collect_projection_columns(arg, table_name, table_alias, columns);
10586            }
10587        }
10588    }
10589}
10590
10591fn collect_filter_columns(
10592    filter: &crate::storage::query::ast::Filter,
10593    table_name: &str,
10594    table_alias: Option<&str>,
10595    columns: &mut std::collections::BTreeSet<String>,
10596) {
10597    use crate::storage::query::ast::Filter;
10598    match filter {
10599        Filter::Compare { field, .. }
10600        | Filter::IsNull(field)
10601        | Filter::IsNotNull(field)
10602        | Filter::In { field, .. }
10603        | Filter::Between { field, .. }
10604        | Filter::Like { field, .. }
10605        | Filter::StartsWith { field, .. }
10606        | Filter::EndsWith { field, .. }
10607        | Filter::Contains { field, .. } => {
10608            collect_field_ref_column(field, table_name, table_alias, columns);
10609        }
10610        Filter::CompareFields { left, right, .. } => {
10611            collect_field_ref_column(left, table_name, table_alias, columns);
10612            collect_field_ref_column(right, table_name, table_alias, columns);
10613        }
10614        Filter::CompareExpr { lhs, rhs, .. } => {
10615            collect_expr_columns(lhs, table_name, table_alias, columns);
10616            collect_expr_columns(rhs, table_name, table_alias, columns);
10617        }
10618        Filter::And(left, right) | Filter::Or(left, right) => {
10619            collect_filter_columns(left, table_name, table_alias, columns);
10620            collect_filter_columns(right, table_name, table_alias, columns);
10621        }
10622        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
10623    }
10624}
10625
10626fn collect_expr_columns(
10627    expr: &crate::storage::query::ast::Expr,
10628    table_name: &str,
10629    table_alias: Option<&str>,
10630    columns: &mut std::collections::BTreeSet<String>,
10631) {
10632    use crate::storage::query::ast::Expr;
10633    match expr {
10634        Expr::Column { field, .. } => {
10635            collect_field_ref_column(field, table_name, table_alias, columns);
10636        }
10637        Expr::Literal { .. } | Expr::Parameter { .. } => {}
10638        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
10639            collect_expr_columns(operand, table_name, table_alias, columns);
10640        }
10641        Expr::BinaryOp { lhs, rhs, .. } => {
10642            collect_expr_columns(lhs, table_name, table_alias, columns);
10643            collect_expr_columns(rhs, table_name, table_alias, columns);
10644        }
10645        Expr::FunctionCall { args, .. } => {
10646            for arg in args {
10647                collect_expr_columns(arg, table_name, table_alias, columns);
10648            }
10649        }
10650        Expr::Case {
10651            branches, else_, ..
10652        } => {
10653            for (condition, value) in branches {
10654                collect_expr_columns(condition, table_name, table_alias, columns);
10655                collect_expr_columns(value, table_name, table_alias, columns);
10656            }
10657            if let Some(value) = else_ {
10658                collect_expr_columns(value, table_name, table_alias, columns);
10659            }
10660        }
10661        Expr::IsNull { operand, .. } => {
10662            collect_expr_columns(operand, table_name, table_alias, columns);
10663        }
10664        Expr::InList { target, values, .. } => {
10665            collect_expr_columns(target, table_name, table_alias, columns);
10666            for value in values {
10667                collect_expr_columns(value, table_name, table_alias, columns);
10668            }
10669        }
10670        Expr::Between {
10671            target, low, high, ..
10672        } => {
10673            collect_expr_columns(target, table_name, table_alias, columns);
10674            collect_expr_columns(low, table_name, table_alias, columns);
10675            collect_expr_columns(high, table_name, table_alias, columns);
10676        }
10677        Expr::Subquery { .. } => {}
10678        Expr::WindowFunctionCall { args, window, .. } => {
10679            for arg in args {
10680                collect_expr_columns(arg, table_name, table_alias, columns);
10681            }
10682            for e in &window.partition_by {
10683                collect_expr_columns(e, table_name, table_alias, columns);
10684            }
10685            for o in &window.order_by {
10686                collect_expr_columns(&o.expr, table_name, table_alias, columns);
10687            }
10688        }
10689    }
10690}
10691
10692fn collect_field_ref_column(
10693    field: &crate::storage::query::ast::FieldRef,
10694    table_name: &str,
10695    table_alias: Option<&str>,
10696    columns: &mut std::collections::BTreeSet<String>,
10697) {
10698    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
10699        if column != "*" {
10700            columns.insert(column);
10701        }
10702    }
10703}
10704
10705fn policy_column_name_from_field_ref(
10706    field: &crate::storage::query::ast::FieldRef,
10707    table_name: &str,
10708    table_alias: Option<&str>,
10709) -> Option<String> {
10710    match field {
10711        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
10712            if column == "*" {
10713                return Some("*".to_string());
10714            }
10715            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
10716                Some(column.clone())
10717            } else {
10718                Some(format!("{table}.{column}"))
10719            }
10720        }
10721        _ => None,
10722    }
10723}
10724
10725fn legacy_resource_to_iam(
10726    resource: &crate::auth::privileges::Resource,
10727    tenant: Option<&str>,
10728) -> crate::auth::policies::ResourceRef {
10729    use crate::auth::privileges::Resource;
10730
10731    let (kind, name) = match resource {
10732        Resource::Database => ("database".to_string(), "*".to_string()),
10733        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
10734        Resource::Table { schema, table } => (
10735            "table".to_string(),
10736            match schema {
10737                Some(s) => format!("{s}.{table}"),
10738                None => table.clone(),
10739            },
10740        ),
10741        Resource::Function { schema, name } => (
10742            "function".to_string(),
10743            match schema {
10744                Some(s) => format!("{s}.{name}"),
10745                None => name.clone(),
10746            },
10747        ),
10748    };
10749
10750    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
10751    if let Some(t) = tenant {
10752        out = out.with_tenant(t.to_string());
10753    }
10754    out
10755}
10756
/// One table operand of a join, as seen by the join column collectors.
#[derive(Debug)]
struct JoinTableSide {
    /// Table name; used as the key under which the side's columns are
    /// recorded.
    table: String,
    /// Name the side can also be referred to by. `table_side_context` fills
    /// this with the table name when the query declares no alias.
    alias: String,
}
10762
10763fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
10764    match expr {
10765        QueryExpr::Table(table) => Some(JoinTableSide {
10766            table: table.table.clone(),
10767            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
10768        }),
10769        _ => None,
10770    }
10771}
10772
10773fn collect_projection_columns_for_table(
10774    projection: &Projection,
10775    table: &str,
10776    alias: Option<&str>,
10777    out: &mut BTreeSet<String>,
10778) {
10779    match projection {
10780        Projection::Column(column) | Projection::Alias(column, _) => {
10781            match split_qualified_column(column) {
10782                Some((qualifier, column))
10783                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
10784                {
10785                    push_policy_column(column, out);
10786                }
10787                Some(_) => {}
10788                None => push_policy_column(column, out),
10789            }
10790        }
10791        Projection::Field(
10792            FieldRef::TableColumn {
10793                table: qualifier,
10794                column,
10795            },
10796            _,
10797        ) => {
10798            if qualifier.is_empty()
10799                || qualifier == table
10800                || alias.is_some_and(|alias| qualifier == alias)
10801            {
10802                push_policy_column(column, out);
10803            }
10804        }
10805        Projection::Field(
10806            FieldRef::NodeProperty {
10807                alias: qualifier,
10808                property,
10809            },
10810            _,
10811        )
10812        | Projection::Field(
10813            FieldRef::EdgeProperty {
10814                alias: qualifier,
10815                property,
10816            },
10817            _,
10818        ) => {
10819            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
10820                push_policy_column(property, out);
10821            }
10822        }
10823        Projection::Function(_, args) => {
10824            for arg in args {
10825                collect_projection_columns_for_table(arg, table, alias, out);
10826            }
10827        }
10828        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
10829        Projection::Window { args, .. } => {
10830            for arg in args {
10831                collect_projection_columns_for_table(arg, table, alias, out);
10832            }
10833        }
10834    }
10835}
10836
10837fn collect_projection_columns_for_join_side(
10838    projection: &Projection,
10839    left: Option<&JoinTableSide>,
10840    right: Option<&JoinTableSide>,
10841    out: &mut HashMap<String, BTreeSet<String>>,
10842) -> RedDBResult<()> {
10843    match projection {
10844        Projection::Column(column) | Projection::Alias(column, _) => {
10845            if let Some((qualifier, column)) = split_qualified_column(column) {
10846                push_qualified_join_column(qualifier, column, left, right, out);
10847            } else {
10848                push_unqualified_join_column(column, left, right, out);
10849            }
10850        }
10851        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
10852            if table.is_empty() {
10853                push_unqualified_join_column(column, left, right, out);
10854            } else if let Some(side) = [left, right]
10855                .into_iter()
10856                .flatten()
10857                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
10858            {
10859                push_join_column(&side.table, column, out);
10860            }
10861        }
10862        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
10863        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
10864            push_qualified_join_column(alias, property, left, right, out);
10865        }
10866        Projection::Function(_, args) => {
10867            for arg in args {
10868                collect_projection_columns_for_join_side(arg, left, right, out)?;
10869            }
10870        }
10871        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
10872        Projection::Window { args, .. } => {
10873            for arg in args {
10874                collect_projection_columns_for_join_side(arg, left, right, out)?;
10875            }
10876        }
10877    }
10878    Ok(())
10879}
10880
/// Splits `qualifier.column` into its two parts.
///
/// Returns `None` when there is no dot, when either side of the first dot is
/// empty, or when the part after the first dot contains another dot.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    column.split_once('.').filter(|(qualifier, name)| {
        !qualifier.is_empty() && !name.is_empty() && !name.contains('.')
    })
}
10888
10889fn push_qualified_join_column(
10890    qualifier: &str,
10891    column: &str,
10892    left: Option<&JoinTableSide>,
10893    right: Option<&JoinTableSide>,
10894    out: &mut HashMap<String, BTreeSet<String>>,
10895) {
10896    if let Some(side) = [left, right]
10897        .into_iter()
10898        .flatten()
10899        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
10900    {
10901        push_join_column(&side.table, column, out);
10902    }
10903}
10904
10905fn push_unqualified_join_column(
10906    column: &str,
10907    left: Option<&JoinTableSide>,
10908    right: Option<&JoinTableSide>,
10909    out: &mut HashMap<String, BTreeSet<String>>,
10910) {
10911    for side in [left, right].into_iter().flatten() {
10912        push_join_column(&side.table, column, out);
10913    }
10914}
10915
10916fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
10917    if is_policy_column_name(column) {
10918        out.entry(table.to_string())
10919            .or_default()
10920            .insert(column.to_string());
10921    }
10922}
10923
10924fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
10925    if is_policy_column_name(column) {
10926        out.insert(column.to_string());
10927    }
10928}
10929
/// Reports whether `column` should be tracked as a policy column.
///
/// Rejects the empty string, the wildcard `*`, and any name starting with
/// the (case-sensitive) prefixes `LIT:` or `TYPE:`.
fn is_policy_column_name(column: &str) -> bool {
    const EXCLUDED_PREFIXES: [&str; 2] = ["LIT:", "TYPE:"];

    if column.is_empty() || column == "*" {
        return false;
    }
    !EXCLUDED_PREFIXES
        .iter()
        .any(|prefix| column.starts_with(*prefix))
}
10936
10937fn runtime_iam_context(
10938    role: crate::auth::Role,
10939    tenant: Option<&str>,
10940    principal_is_system_owned: bool,
10941) -> crate::auth::policies::EvalContext {
10942    crate::auth::policies::EvalContext {
10943        principal_tenant: tenant.map(|t| t.to_string()),
10944        current_tenant: tenant.map(|t| t.to_string()),
10945        peer_ip: None,
10946        mfa_present: false,
10947        now_ms: crate::auth::now_ms(),
10948        principal_is_admin_role: role == crate::auth::Role::Admin,
10949        principal_is_system_owned,
10950        principal_is_platform_scoped: tenant.is_none(),
10951    }
10952}
10953
10954fn explicit_table_projection_columns(
10955    query: &crate::storage::query::ast::TableQuery,
10956) -> Vec<String> {
10957    use crate::storage::query::ast::{FieldRef, Projection};
10958
10959    let mut columns = Vec::new();
10960    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
10961        match projection {
10962            Projection::Column(column) | Projection::Alias(column, _) => {
10963                push_unique(&mut columns, column)
10964            }
10965            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
10966                push_unique(&mut columns, column)
10967            }
10968            // SELECT * and expression/function projections need the
10969            // executor-wide column-policy context mapped in
10970            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
10971            _ => {}
10972        }
10973    }
10974    columns
10975}
10976
10977fn explicit_graph_projection_properties(
10978    query: &crate::storage::query::ast::GraphQuery,
10979) -> Vec<String> {
10980    use crate::storage::query::ast::{FieldRef, Projection};
10981
10982    let mut columns = Vec::new();
10983    for projection in &query.return_ {
10984        match projection {
10985            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
10986            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
10987                push_unique(&mut columns, property.clone())
10988            }
10989            _ => {}
10990        }
10991    }
10992    columns
10993}
10994
/// Appends `column` to `columns` unless an equal entry is already present.
/// The comparison is exact (case-sensitive).
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
11000
11001fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
11002    use crate::storage::query::ast::PolicyPrincipalRef;
11003    match p {
11004        PolicyPrincipalRef::User(u) => match &u.tenant {
11005            Some(t) => format!("user:{t}/{}", u.username),
11006            None => format!("user:{}", u.username),
11007        },
11008        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
11009    }
11010}
11011
11012/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
11013/// shape used by every audit emit + the simulator response.
11014pub(crate) fn decision_to_strings(
11015    d: &crate::auth::policies::Decision,
11016) -> (String, Option<String>, Option<String>) {
11017    use crate::auth::policies::Decision;
11018    match d {
11019        Decision::Allow {
11020            matched_policy_id,
11021            matched_sid,
11022        } => (
11023            "allow".into(),
11024            Some(matched_policy_id.clone()),
11025            matched_sid.clone(),
11026        ),
11027        Decision::Deny {
11028            matched_policy_id,
11029            matched_sid,
11030        } => (
11031            "deny".into(),
11032            Some(matched_policy_id.clone()),
11033            matched_sid.clone(),
11034        ),
11035        Decision::DefaultDeny => ("default_deny".into(), None, None),
11036        Decision::AdminBypass => ("admin_bypass".into(), None, None),
11037    }
11038}
11039
11040fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
11041    let mut scopes = Vec::new();
11042    collect_relation_scopes(query, &mut scopes);
11043    scopes.sort();
11044    scopes.dedup();
11045    scopes
11046}
11047
11048fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
11049    match query {
11050        QueryExpr::Table(table) => {
11051            if !table.table.is_empty() {
11052                scopes.push(table.table.clone());
11053            }
11054            if let Some(alias) = &table.alias {
11055                scopes.push(alias.clone());
11056            }
11057        }
11058        QueryExpr::Join(join) => {
11059            collect_relation_scopes(&join.left, scopes);
11060            collect_relation_scopes(&join.right, scopes);
11061        }
11062        _ => {}
11063    }
11064}
11065
11066fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
11067    let inner_scopes = relation_scopes_for_query(query);
11068    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
11069}
11070
11071fn query_expr_references_outer_scope(
11072    query: &QueryExpr,
11073    outer_scopes: &[String],
11074    inner_scopes: &[String],
11075) -> bool {
11076    match query {
11077        QueryExpr::Table(table) => {
11078            table.select_items.iter().any(|item| match item {
11079                crate::storage::query::ast::SelectItem::Wildcard => false,
11080                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
11081                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11082                }
11083            }) || table
11084                .where_expr
11085                .as_ref()
11086                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
11087                || table.filter.as_ref().is_some_and(|filter| {
11088                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
11089                })
11090                || table.having_expr.as_ref().is_some_and(|expr| {
11091                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11092                })
11093                || table.having.as_ref().is_some_and(|filter| {
11094                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
11095                })
11096                || table
11097                    .group_by_exprs
11098                    .iter()
11099                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
11100                || table.order_by.iter().any(|clause| {
11101                    clause.expr.as_ref().is_some_and(|expr| {
11102                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11103                    })
11104                })
11105        }
11106        QueryExpr::Join(join) => {
11107            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
11108                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
11109                || join.filter.as_ref().is_some_and(|filter| {
11110                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
11111                })
11112                || join.return_items.iter().any(|item| match item {
11113                    crate::storage::query::ast::SelectItem::Wildcard => false,
11114                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
11115                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11116                    }
11117                })
11118        }
11119        _ => false,
11120    }
11121}
11122
11123fn filter_references_outer_scope(
11124    filter: &crate::storage::query::ast::Filter,
11125    outer_scopes: &[String],
11126    inner_scopes: &[String],
11127) -> bool {
11128    use crate::storage::query::ast::Filter;
11129    match filter {
11130        Filter::Compare { field, .. }
11131        | Filter::IsNull(field)
11132        | Filter::IsNotNull(field)
11133        | Filter::In { field, .. }
11134        | Filter::Between { field, .. }
11135        | Filter::Like { field, .. }
11136        | Filter::StartsWith { field, .. }
11137        | Filter::EndsWith { field, .. }
11138        | Filter::Contains { field, .. } => {
11139            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
11140        }
11141        Filter::CompareFields { left, right, .. } => {
11142            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
11143                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
11144        }
11145        Filter::CompareExpr { lhs, rhs, .. } => {
11146            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
11147                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
11148        }
11149        Filter::And(left, right) | Filter::Or(left, right) => {
11150            filter_references_outer_scope(left, outer_scopes, inner_scopes)
11151                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
11152        }
11153        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
11154    }
11155}
11156
11157fn expr_references_outer_scope(
11158    expr: &crate::storage::query::ast::Expr,
11159    outer_scopes: &[String],
11160    inner_scopes: &[String],
11161) -> bool {
11162    use crate::storage::query::ast::Expr;
11163    match expr {
11164        Expr::Column { field, .. } => {
11165            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
11166        }
11167        Expr::BinaryOp { lhs, rhs, .. } => {
11168            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
11169                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
11170        }
11171        Expr::UnaryOp { operand, .. }
11172        | Expr::Cast { inner: operand, .. }
11173        | Expr::IsNull { operand, .. } => {
11174            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
11175        }
11176        Expr::FunctionCall { args, .. } => args
11177            .iter()
11178            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
11179        Expr::Case {
11180            branches, else_, ..
11181        } => {
11182            branches.iter().any(|(cond, value)| {
11183                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
11184                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
11185            }) || else_
11186                .as_ref()
11187                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
11188        }
11189        Expr::InList { target, values, .. } => {
11190            expr_references_outer_scope(target, outer_scopes, inner_scopes)
11191                || values
11192                    .iter()
11193                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
11194        }
11195        Expr::Between {
11196            target, low, high, ..
11197        } => {
11198            expr_references_outer_scope(target, outer_scopes, inner_scopes)
11199                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
11200                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
11201        }
11202        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
11203        Expr::Literal { .. } | Expr::Parameter { .. } => false,
11204        Expr::WindowFunctionCall { args, window, .. } => {
11205            args.iter()
11206                .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes))
11207                || window
11208                    .partition_by
11209                    .iter()
11210                    .any(|e| expr_references_outer_scope(e, outer_scopes, inner_scopes))
11211                || window
11212                    .order_by
11213                    .iter()
11214                    .any(|o| expr_references_outer_scope(&o.expr, outer_scopes, inner_scopes))
11215        }
11216    }
11217}
11218
11219fn field_ref_references_outer_scope(
11220    field: &crate::storage::query::ast::FieldRef,
11221    outer_scopes: &[String],
11222    inner_scopes: &[String],
11223) -> bool {
11224    match field {
11225        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
11226            outer_scopes.iter().any(|scope| scope == table)
11227                && !inner_scopes.iter().any(|scope| scope == table)
11228        }
11229        _ => false,
11230    }
11231}
11232
11233fn first_column_values(
11234    result: crate::storage::query::unified::UnifiedResult,
11235) -> RedDBResult<Vec<Value>> {
11236    if result.columns.len() > 1 {
11237        return Err(RedDBError::Query(
11238            "expression subquery must return exactly one column".to_string(),
11239        ));
11240    }
11241    let fallback_column = result
11242        .records
11243        .first()
11244        .and_then(|record| record.column_names().into_iter().next())
11245        .map(|name| name.to_string());
11246    let column = result.columns.first().cloned().or(fallback_column);
11247    let Some(column) = column else {
11248        return Ok(Vec::new());
11249    };
11250    Ok(result
11251        .records
11252        .iter()
11253        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
11254        .collect())
11255}
11256
11257fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
11258    // Bare integer ms.
11259    if let Ok(n) = s.parse::<u128>() {
11260        return Some(n);
11261    }
11262    // Fallback: ISO-8601 like 2030-01-02 03:04:05 — accept the date
11263    // portion only (midnight UTC). Full RFC3339 parsing is a stretch
11264    // goal; the common case is `'2030-01-01'`.
11265    if let Some(date) = s.split_whitespace().next() {
11266        let parts: Vec<&str> = date.split('-').collect();
11267        if parts.len() == 3 {
11268            let (y, m, d) = (parts[0], parts[1], parts[2]);
11269            if let (Ok(y), Ok(m), Ok(d)) = (y.parse::<i64>(), m.parse::<u32>(), d.parse::<u32>()) {
11270                // Days since 1970-01-01 — simple Julian arithmetic
11271                // suitable for years 1970-2100. Good enough for test
11272                // fixtures; precise parsing lands when we wire chrono.
11273                let days_in = days_from_civil(y, m, d);
11274                return Some((days_in as u128) * 86_400_000u128);
11275            }
11276        }
11277    }
11278    None
11279}
11280
/// Number of days between 1970-01-01 and the proleptic Gregorian date
/// `y`-`m`-`d`, computed with H. Hinnant's `days_from_civil` algorithm.
/// Dates before the epoch yield negative counts.
///
/// The year is treated as starting on March 1 so that the leap day falls
/// at the end of the (shifted) year. Inputs are expected to form a valid
/// calendar date (`m` in `1..=12`, `d >= 1`); they are not validated here.
/// Used by `parse_timestamp_to_ms`.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    const DAYS_PER_ERA: i64 = 146097; // one 400-year Gregorian cycle
    const EPOCH_SHIFT: i64 = 719468; // days from 0000-03-01 to 1970-01-01

    // January and February belong to the previous March-based year.
    let shifted_year = y - i64::from(m <= 2);
    // Floor division by 400, written so negative years round downwards.
    let era = (if shifted_year >= 0 {
        shifted_year
    } else {
        shifted_year - 399
    }) / 400;
    let year_of_era = (shifted_year - era * 400) as u64; // [0, 399]

    // Month index counted from March (March = 0, ..., February = 11).
    let month_from_march = u64::from(match m {
        0..=2 => m + 9,
        _ => m - 3,
    });
    let day_of_year = (153 * month_from_march + 2) / 5 + u64::from(d) - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;

    era * DAYS_PER_ERA + day_of_era as i64 - EPOCH_SHIFT
}
11291
11292fn walk_plan_node(
11293    node: &crate::storage::query::planner::CanonicalLogicalNode,
11294    depth: usize,
11295    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
11296) {
11297    use std::sync::Arc;
11298    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
11299    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
11300    rec.set_arc(
11301        Arc::from("source"),
11302        node.source.clone().map(Value::text).unwrap_or(Value::Null),
11303    );
11304    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
11305    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
11306    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
11307    out.push(rec);
11308    for child in &node.children {
11309        walk_plan_node(child, depth + 1, out);
11310    }
11311}