Skip to main content

reddb_server/runtime/
impl_core.rs

1use super::*;
2use crate::application::entity::metadata_to_json;
3use crate::auth::column_policy_gate::ColumnAccessRequest;
4use crate::auth::UserId;
5use crate::replication::cdc::ChangeRecord;
6use crate::replication::logical::{ApplyMode, LogicalChangeApplier};
7use crate::storage::query::ast::TableSource;
8
thread_local! {
    /// Current connection id for the executing statement. Set by the
    /// per-connection wrapper (stdio/gRPC handlers) before dispatching
    /// into `execute_query`; falls back to `0` for embedded callers.
    static CURRENT_CONN_ID: std::cell::Cell<u64> = const { std::cell::Cell::new(0) };

    /// Authenticated user + role for the executing statement (Phase 2.5.2
    /// RLS enforcement). Set by the transport middleware after validating
    /// credentials (password / cert / oauth); unset means "anonymous" /
    /// "embedded" — RLS policies degrade to the role-agnostic subset.
    ///
    /// `None` skips RLS injection entirely; `Some((username, role))`
    /// passes `role` to `matching_rls_policies(table, Some(role), action)`.
    static CURRENT_AUTH_IDENTITY: std::cell::RefCell<Option<(String, crate::auth::Role)>> =
        const { std::cell::RefCell::new(None) };

    /// MVCC snapshot scoped to the currently-executing statement (Phase
    /// 2.3.2d PG parity). `execute_query` captures it on entry and drops
    /// it on exit; every scan consults it via
    /// `entity_visible_under_current_snapshot` to hide tuples whose xmin
    /// hasn't committed or whose xmax already has.
    ///
    /// `None` means no statement snapshot is installed. All embedded
    /// callers that bypass `execute_query` see this default.
    static CURRENT_SNAPSHOT: std::cell::RefCell<Option<SnapshotContext>> =
        const { std::cell::RefCell::new(None) };

    /// Cheap presence flag for `CURRENT_SNAPSHOT`. Scan hot paths
    /// poll this instead of `borrow()`-ing the RefCell on every
    /// row — the common case (autocommit / no MVCC session) reads
    /// one plain thread-local `Cell<bool>` (not an atomic) and
    /// short-circuits, saving ~10ns × N rows on aggregate_group /
    /// select_range scans.
    ///
    /// Must be kept in sync with `CURRENT_SNAPSHOT`: every writer of the
    /// snapshot cell in this file updates the flag right after.
    static HAS_SNAPSHOT: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };

    /// Session-scoped tenant id for the current connection (Phase 2.5.3
    /// multi-tenancy). Populated by `SET TENANT 'id'` or by transport
    /// middleware after resolving tenant from auth claims. Read by the
    /// `CURRENT_TENANT()` scalar function — RLS policies typically
    /// combine it as `USING (tenant_id = CURRENT_TENANT())` to scope
    /// every query to one tenant.
    ///
    /// `None` means "no tenant bound" — `CURRENT_TENANT()` returns
    /// NULL, and RLS policies that gate on it hide every row.
    static CURRENT_TENANT_ID: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local config resolver. SQL expressions materialize the
    /// `red_config` snapshot lazily on the first `$config.*`/`CONFIG()`
    /// access, keeping ordinary statements on the zero-scan path.
    static CURRENT_CONFIG_RESOLVER: std::cell::RefCell<Option<ConfigResolver>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local secret resolver. SQL expressions materialize the
    /// vault KV snapshot lazily on first `$secret.*` access, then serve
    /// the rest of the statement from that thread-local map.
    static CURRENT_SECRET_RESOLVER: std::cell::RefCell<Option<SecretResolver>> =
        const { std::cell::RefCell::new(None) };
}
68
69fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
70    match value {
71        Value::Text(s) => Ok(s.to_string()),
72        Value::Integer(n) => Ok(n.to_string()),
73        Value::UnsignedInteger(n) => Ok(n.to_string()),
74        Value::Float(n) => Ok(n.to_string()),
75        Value::Boolean(b) => Ok(b.to_string()),
76        Value::Null => Err(RedDBError::Query(
77            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
78                .to_string(),
79        )),
80        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
81            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
82                .to_string(),
83        )),
84        _ => Err(RedDBError::Query(format!(
85            "SET SECRET does not support value type {:?} yet",
86            value.data_type()
87        ))),
88    }
89}
90
91/// Convert the rows produced by a materialized-view body into
92/// `UnifiedEntity` table rows targeting the backing collection.
93/// Issue #595 slice 9c — feeds `UnifiedStore::refresh_collection`.
94///
95/// Graph fragments and vector hits are ignored: a materialized view
96/// is a relational result set (SELECT-shaped); slices 11+ may extend
97/// this once we have a richer view body shape. Each row materialises
98/// the union of its schema-bound columns + overflow.
99fn view_records_to_entities(
100    table: &str,
101    records: &[crate::storage::query::unified::UnifiedRecord],
102) -> Vec<crate::storage::UnifiedEntity> {
103    use std::collections::HashMap;
104    let table_arc: std::sync::Arc<str> = std::sync::Arc::from(table);
105    let mut out = Vec::with_capacity(records.len());
106    for record in records {
107        let mut named: HashMap<String, crate::storage::schema::Value> = HashMap::new();
108        for (name, value) in record.iter_fields() {
109            named.insert(name.to_string(), value.clone());
110        }
111        let entity = crate::storage::UnifiedEntity::new(
112            crate::storage::EntityId::new(0),
113            crate::storage::EntityKind::TableRow {
114                table: std::sync::Arc::clone(&table_arc),
115                row_id: 0,
116            },
117            crate::storage::EntityData::Row(crate::storage::RowData {
118                columns: Vec::new(),
119                named: Some(named),
120                schema: None,
121            }),
122        );
123        out.push(entity);
124    }
125    out
126}
127
/// Build a `CollectionContract` for the collection `name` with the
/// declared model `model`.
///
/// The contract is dynamic-schema, implicit-origin, version 1, with both
/// `created_at_unix_ms` and `updated_at_unix_ms` set to the current
/// wall-clock time. Every optional feature (TTL, vector settings,
/// context index, metrics, subscriptions, session and retention
/// settings) is left unset, empty or `false`.
fn system_keyed_collection_contract(
    name: &str,
    model: crate::catalog::CollectionModel,
) -> crate::physical::CollectionContract {
    // Single timestamp so created/updated are identical on creation.
    let now = crate::utils::now_unix_millis() as u128;
    crate::physical::CollectionContract {
        name: name.to_string(),
        declared_model: model,
        schema_mode: crate::catalog::SchemaMode::Dynamic,
        origin: crate::physical::ContractOrigin::Implicit,
        version: 1,
        created_at_unix_ms: now,
        updated_at_unix_ms: now,
        default_ttl_ms: None,
        vector_dimension: None,
        vector_metric: None,
        context_index_fields: Vec::new(),
        declared_columns: Vec::new(),
        table_def: None,
        timestamps_enabled: false,
        context_index_enabled: false,
        metrics_raw_retention_ms: None,
        metrics_rollup_policies: Vec::new(),
        metrics_tenant_identity: None,
        metrics_namespace: None,
        append_only: false,
        subscriptions: Vec::new(),
        session_key: None,
        session_gap_ms: None,
        retention_duration_ms: None,
    }
}
160
/// Snapshot + manager pair used for read-path visibility checks.
///
/// The manager is needed in addition to the snapshot because `aborted`
/// state mutates after the snapshot is captured — a ROLLBACK by a
/// committed-at-capture-time writer must still hide its tuples. Keeping
/// the Arc around is O(pointer) and the RwLock reads on `is_aborted`
/// are cheap (HashSet lookup under a parking_lot read guard).
///
/// `own_xids` (Phase 2.3.2e) lists the xids belonging to the current
/// connection's transaction — the parent xid plus open and released
/// savepoint sub-xids. The visibility rule promotes rows stamped with
/// these xids to "always visible (unless aborted)" so the writer sees
/// its own nested-savepoint writes even though their xids exceed
/// `snapshot.xid`.
///
/// Note that `Clone` copies `own_xids` by value (it is a plain
/// `HashSet`, not behind an `Arc`).
#[derive(Clone)]
pub struct SnapshotContext {
    /// The snapshot the visibility rule is evaluated against.
    pub snapshot: crate::storage::transaction::snapshot::Snapshot,
    /// Shared snapshot manager, consulted for aborted-transaction state.
    pub manager: Arc<crate::storage::transaction::snapshot::SnapshotManager>,
    /// Xids owned by the current connection's transaction (see above).
    pub own_xids: std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
    /// Surfaced by `current_snapshot_requires_index_fallback()`: set when
    /// this snapshot may need historical tuple versions that the current
    /// secondary indexes cannot prove.
    pub requires_index_fallback: bool,
}
182
183/// Install a connection id on the current thread for the duration of a
184/// statement. Transaction state (`RuntimeInner::tx_contexts`) is keyed
185/// by this id so different connections can hold independent BEGINs.
186///
187/// Pub so transports (PG wire, gRPC, HTTP per-request spawners) and
188/// tests can emulate per-connection isolation. Call it once when
189/// binding the connection's worker thread; pair with
190/// `clear_current_connection_id` on teardown.
191pub fn set_current_connection_id(id: u64) {
192    CURRENT_CONN_ID.with(|c| c.set(id));
193}
194
195/// Reset the thread's connection id back to `0` (autocommit).
196pub fn clear_current_connection_id() {
197    CURRENT_CONN_ID.with(|c| c.set(0));
198}
199
200/// Read the connection id set by `set_current_connection_id`. Returns
201/// `0` when no wrapper installed one — auto-commit path.
202pub fn current_connection_id() -> u64 {
203    CURRENT_CONN_ID.with(|c| c.get())
204}
205
206/// Install the authenticated identity for the current thread (Phase 2.5.2
207/// RLS enforcement). Transport layers call this right after resolving
208/// auth so the query dispatch can fold RLS policies into the filter.
209pub fn set_current_auth_identity(username: String, role: crate::auth::Role) {
210    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = Some((username, role)));
211}
212
213/// Clear the thread-local auth identity. Transports call this after the
214/// statement completes so pooled threads don't leak identities across
215/// requests.
216pub fn clear_current_auth_identity() {
217    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = None);
218}
219
220/// Read the current-thread auth identity. `None` when no transport
221/// installed one (embedded mode / anonymous access).
222pub(crate) fn current_auth_identity() -> Option<(String, crate::auth::Role)> {
223    CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone())
224}
225
226/// Install the session tenant id for the current thread (Phase 2.5.3
227/// multi-tenancy). Called by `SET TENANT 'id'` dispatch and by
228/// transport middleware that resolves tenant from auth claims (e.g.
229/// JWT `tenant` claim, HTTP header, subdomain).
230pub fn set_current_tenant(tenant_id: String) {
231    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = Some(tenant_id));
232}
233
234/// Clear the current-thread tenant — `CURRENT_TENANT()` will then
235/// return NULL and any RLS policy gated on it will hide every row.
236pub fn clear_current_tenant() {
237    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = None);
238}
239
240/// Read the current-thread tenant id, applying overrides in priority order:
241///   1. `WITHIN TENANT '<id>' …` per-statement override (highest)
242///   2. `SET LOCAL TENANT '<id>'` transaction-local override (consulted
243///      only when the current connection has an open transaction)
244///   3. `SET TENANT '<id>'` session-level thread-local
245///   4. `None` (deny-default for RLS).
246///
247/// The transaction-local layer is read through the runtime; an embedded
248/// helper crate that has no `RedDBRuntime` access still gets correct
249/// behaviour for layers 1, 3, and 4.
250pub fn current_tenant() -> Option<String> {
251    let inherited = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
252    if let Some(over) = current_scope_override() {
253        if over.tenant.is_active() {
254            return over.tenant.resolve(inherited);
255        }
256    }
257    if let Some(tx_local) = current_tx_local_tenant() {
258        return tx_local;
259    }
260    inherited
261}
262
thread_local! {
    /// Snapshot of the active connection's `tx_local_tenants` entry for
    /// the current `execute_query` call. Outer `Some(_)` means "a
    /// transaction-local tenant override is active for this call";
    /// inner is the override's value (`Some(s)` overrides to `s`,
    /// `None` overrides to NULL/cleared). Refreshed at the top of every
    /// `execute_query` invocation and cleared by the RAII guard on
    /// return so pooled connections cannot leak the override past the
    /// statement that owns it.
    ///
    /// Within this file it is written through `TxLocalTenantGuard` and
    /// read through `current_tx_local_tenant`.
    static TX_LOCAL_TENANT: std::cell::RefCell<Option<Option<String>>> =
        const { std::cell::RefCell::new(None) };
}
275
276fn current_tx_local_tenant() -> Option<Option<String>> {
277    TX_LOCAL_TENANT.with(|cell| cell.borrow().clone())
278}
279
280/// Recognise `SET LOCAL TENANT '<id>'` / `SET LOCAL TENANT NULL` —
281/// returns `Ok(Some(Some(id)))` for an explicit value, `Ok(Some(None))`
282/// for an explicit NULL clear, `Ok(None)` when the input is not a
283/// `SET LOCAL TENANT` statement at all, and `Err` when the prefix
284/// matches but the value is malformed.
285fn parse_set_local_tenant(query: &str) -> RedDBResult<Option<Option<String>>> {
286    let mut tokens = query.split_ascii_whitespace();
287    let Some(w1) = tokens.next() else {
288        return Ok(None);
289    };
290    if !w1.eq_ignore_ascii_case("SET") {
291        return Ok(None);
292    }
293    let Some(w2) = tokens.next() else {
294        return Ok(None);
295    };
296    if !w2.eq_ignore_ascii_case("LOCAL") {
297        return Ok(None);
298    }
299    let Some(w3) = tokens.next() else {
300        return Ok(None);
301    };
302    if !w3.eq_ignore_ascii_case("TENANT") {
303        return Ok(None);
304    }
305    let rest: String = tokens.collect::<Vec<_>>().join(" ");
306    let rest = rest.trim().trim_end_matches(';').trim();
307    let value_str = rest.strip_prefix('=').map(|s| s.trim()).unwrap_or(rest);
308    if value_str.is_empty() {
309        return Err(RedDBError::Query(
310            "SET LOCAL TENANT expects a string literal or NULL".to_string(),
311        ));
312    }
313    if value_str.eq_ignore_ascii_case("NULL") {
314        return Ok(Some(None));
315    }
316    if value_str.starts_with('\'') && value_str.ends_with('\'') && value_str.len() >= 2 {
317        let inner = &value_str[1..value_str.len() - 1];
318        return Ok(Some(Some(inner.to_string())));
319    }
320    Err(RedDBError::Query(format!(
321        "SET LOCAL TENANT expects a string literal or NULL, got `{value_str}`"
322    )))
323}
324
325pub(crate) struct TxLocalTenantGuard;
326
327impl TxLocalTenantGuard {
328    pub fn install(value: Option<Option<String>>) -> Self {
329        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = value);
330        Self
331    }
332}
333
334impl Drop for TxLocalTenantGuard {
335    fn drop(&mut self) {
336        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = None);
337    }
338}
339
thread_local! {
    /// Stack of `WITHIN ... <stmt>` overrides active on the current
    /// thread. Every entry corresponds to one in-flight `execute_query`
    /// call that started with a `WITHIN` prefix; the entry is pushed
    /// before dispatch and popped before the call returns. The stack
    /// shape supports nested invocations (e.g. a view body that itself
    /// re-enters execute_query).
    ///
    /// Only the top of the stack is consulted by
    /// `current_scope_override`; lower entries become visible again once
    /// the entries above them are popped.
    static SCOPE_OVERRIDES: std::cell::RefCell<Vec<crate::runtime::within_clause::ScopeOverride>> =
        const { std::cell::RefCell::new(Vec::new()) };
}
350
351pub(crate) fn push_scope_override(over: crate::runtime::within_clause::ScopeOverride) {
352    SCOPE_OVERRIDES.with(|cell| cell.borrow_mut().push(over));
353}
354
355pub(crate) fn pop_scope_override() {
356    SCOPE_OVERRIDES.with(|cell| {
357        cell.borrow_mut().pop();
358    });
359}
360
361pub(crate) fn current_scope_override() -> Option<crate::runtime::within_clause::ScopeOverride> {
362    SCOPE_OVERRIDES.with(|cell| cell.borrow().last().cloned())
363}
364
365/// Cheap probe: is any `WITHIN …` scope override active on this
366/// thread? The fast-path needs to know without paying for the full
367/// `.last().cloned()` allocation — just peek at stack length.
368pub(crate) fn has_scope_override_active() -> bool {
369    SCOPE_OVERRIDES.with(|cell| !cell.borrow().is_empty())
370}
371
372/// RAII guard pairing `push_scope_override` with the matching pop, so
373/// the stack stays balanced even when the inner `execute_query` returns
374/// early via `?`.
375pub(crate) struct ScopeOverrideGuard;
376
377impl ScopeOverrideGuard {
378    pub fn install(over: crate::runtime::within_clause::ScopeOverride) -> Self {
379        push_scope_override(over);
380        Self
381    }
382}
383
384impl Drop for ScopeOverrideGuard {
385    fn drop(&mut self) {
386        pop_scope_override();
387    }
388}
389
390/// Read the current-thread auth identity, honouring per-statement
391/// `WITHIN ... USER '<u>' AS ROLE '<r>'` overrides. The override only
392/// supplies projected strings — it never grants additional privilege —
393/// so callers that need to make authorisation decisions must read from
394/// the underlying `current_auth_identity()` directly.
395pub(crate) fn current_user_projected() -> Option<String> {
396    let inherited = current_auth_identity().map(|(u, _)| u);
397    if let Some(over) = current_scope_override() {
398        if over.user.is_active() {
399            return over.user.resolve(inherited);
400        }
401    }
402    inherited
403}
404
405pub(crate) fn current_role_projected() -> Option<String> {
406    let inherited = current_auth_identity().map(|(_, r)| format!("{r:?}").to_lowercase());
407    if let Some(over) = current_scope_override() {
408        if over.role.is_active() {
409            return over.role.resolve(inherited);
410        }
411    }
412    inherited
413}
414
415pub(crate) fn current_secret_value(path: &str) -> Option<String> {
416    let key = path.to_ascii_lowercase();
417    CURRENT_SECRET_RESOLVER.with(|cell| {
418        let mut resolver = cell.borrow_mut();
419        let resolver = resolver.as_mut()?;
420        if resolver.values.is_none() {
421            resolver.values = resolver
422                .store
423                .as_ref()
424                .map(|store| store.vault_kv_snapshot());
425        }
426        let values = resolver.values.as_ref()?;
427        values.get(&key).cloned().or_else(|| {
428            key.strip_prefix("red.vault/").and_then(|rest| {
429                values
430                    .get(rest)
431                    .cloned()
432                    .or_else(|| values.get(&format!("red.secret.{rest}")).cloned())
433            })
434        })
435    })
436}
437
438struct SecretResolver {
439    store: Option<Arc<crate::auth::store::AuthStore>>,
440    values: Option<HashMap<String, String>>,
441}
442
443pub(super) struct SecretStoreGuard {
444    previous: Option<SecretResolver>,
445}
446
447impl SecretStoreGuard {
448    pub(super) fn install(store: Option<Arc<crate::auth::store::AuthStore>>) -> Self {
449        let previous = CURRENT_SECRET_RESOLVER.with(|cell| {
450            cell.replace(Some(SecretResolver {
451                store,
452                values: None,
453            }))
454        });
455        Self { previous }
456    }
457}
458
459impl Drop for SecretStoreGuard {
460    fn drop(&mut self) {
461        let previous = self.previous.take();
462        CURRENT_SECRET_RESOLVER.with(|cell| {
463            cell.replace(previous);
464        });
465    }
466}
467
468pub(crate) fn current_config_value(path: &str) -> Option<Value> {
469    let key = path.to_ascii_lowercase();
470    CURRENT_CONFIG_RESOLVER.with(|cell| {
471        let mut resolver = cell.borrow_mut();
472        let resolver = resolver.as_mut()?;
473        if resolver.values.is_none() {
474            resolver.values = Some(latest_config_snapshot(&resolver.db));
475        }
476        let values = resolver.values.as_ref()?;
477        values.get(&key).cloned().or_else(|| {
478            key.strip_prefix("red.config/")
479                .and_then(|rest| values.get(&format!("red.config.{rest}")).cloned())
480        })
481    })
482}
483
484fn update_current_config_value(path: &str, value: Value) {
485    let key = path.to_ascii_lowercase();
486    CURRENT_CONFIG_RESOLVER.with(|cell| {
487        if let Some(resolver) = cell.borrow_mut().as_mut() {
488            if let Some(values) = resolver.values.as_mut() {
489                values.insert(key, value);
490            }
491        }
492    });
493}
494
495fn update_current_secret_value(path: &str, value: Option<String>) {
496    let key = path.to_ascii_lowercase();
497    CURRENT_SECRET_RESOLVER.with(|cell| {
498        if let Some(resolver) = cell.borrow_mut().as_mut() {
499            let Some(values) = resolver.values.as_mut() else {
500                return;
501            };
502            match value {
503                Some(value) => {
504                    values.insert(key, value);
505                }
506                None => {
507                    values.remove(&key);
508                }
509            }
510        }
511    });
512}
513
514fn latest_config_snapshot(db: &RedDB) -> HashMap<String, Value> {
515    let mut latest: HashMap<String, (u64, Value)> = HashMap::new();
516
517    if let Some(manager) = db.store().get_collection("red_config") {
518        manager.for_each_entity(|entity| {
519            let Some(row) = entity.data.as_row() else {
520                return true;
521            };
522            let Some(Value::Text(key)) = row.get_field("key") else {
523                return true;
524            };
525            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
526            let id = entity.id.raw();
527            let key = key.to_ascii_lowercase();
528            insert_latest_config_value(&mut latest, key.clone(), id, value.clone());
529            if let Some(rest) = key.strip_prefix("red.config.") {
530                insert_latest_config_value(&mut latest, format!("red.config/{rest}"), id, value);
531            }
532            true
533        });
534    }
535
536    if let Some(manager) = db.store().get_collection("red.config") {
537        manager.for_each_entity(|entity| {
538            let Some(row) = entity.data.as_row() else {
539                return true;
540            };
541            if matches!(row.get_field("tombstone"), Some(Value::Boolean(true))) {
542                return true;
543            }
544            let Some(Value::Text(key)) = row.get_field("key") else {
545                return true;
546            };
547            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
548            insert_latest_config_value(
549                &mut latest,
550                format!("red.config/{}", key.to_ascii_lowercase()),
551                entity.id.raw(),
552                value,
553            );
554            true
555        });
556    }
557
558    latest
559        .into_iter()
560        .map(|(key, (_, value))| (key, value))
561        .collect()
562}
563
564fn insert_latest_config_value(
565    latest: &mut HashMap<String, (u64, Value)>,
566    key: String,
567    id: u64,
568    value: Value,
569) {
570    match latest.get(&key) {
571        Some((prev_id, _)) if *prev_id > id => {}
572        _ => {
573            latest.insert(key, (id, value));
574        }
575    }
576}
577
578struct ConfigResolver {
579    db: Arc<RedDB>,
580    values: Option<HashMap<String, Value>>,
581}
582
583pub(super) struct ConfigSnapshotGuard {
584    previous: Option<ConfigResolver>,
585}
586
587impl ConfigSnapshotGuard {
588    pub(super) fn install(db: Arc<RedDB>) -> Self {
589        let previous = CURRENT_CONFIG_RESOLVER
590            .with(|cell| cell.replace(Some(ConfigResolver { db, values: None })));
591        Self { previous }
592    }
593}
594
595impl Drop for ConfigSnapshotGuard {
596    fn drop(&mut self) {
597        let previous = self.previous.take();
598        CURRENT_CONFIG_RESOLVER.with(|cell| {
599            cell.replace(previous);
600        });
601    }
602}
603
604/// Install the MVCC snapshot used by the current thread for the duration
605/// of one statement. Paired with `clear_current_snapshot()` — callers
606/// should prefer the `CurrentSnapshotGuard` RAII wrapper so early returns
607/// still clean up.
608pub fn set_current_snapshot(ctx: SnapshotContext) {
609    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = Some(ctx));
610    HAS_SNAPSHOT.with(|c| c.set(true));
611}
612
613pub fn clear_current_snapshot() {
614    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = None);
615    HAS_SNAPSHOT.with(|c| c.set(false));
616}
617
618/// Drop-guard that restores the previous snapshot on scope exit. Safe to
619/// nest — each statement saves the caller's snapshot and puts it back
620/// instead of blindly clearing, so a top-level `execute_query` called
621/// from inside another statement dispatch (e.g. vector source subqueries)
622/// doesn't strip visibility from the outer scan.
623pub(crate) struct CurrentSnapshotGuard {
624    previous: Option<SnapshotContext>,
625}
626
627impl CurrentSnapshotGuard {
628    pub(crate) fn install(ctx: SnapshotContext) -> Self {
629        let previous = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
630        set_current_snapshot(ctx);
631        Self { previous }
632    }
633}
634
635impl Drop for CurrentSnapshotGuard {
636    fn drop(&mut self) {
637        let prev = self.previous.take();
638        let has = prev.is_some();
639        CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = prev);
640        HAS_SNAPSHOT.with(|c| c.set(has));
641    }
642}
643
644/// Is this entity visible under the current thread's MVCC snapshot?
645///
646/// Returns `true` (no filtering) when no snapshot is installed — that
647/// path is used by embedded callers and by operations that intentionally
648/// bypass MVCC (VACUUM, snapshot export, admin introspection).
649///
650/// When a snapshot is installed the result is
651///   `snapshot.sees(xmin, xmax) && !mgr.is_aborted(xmin) && !xmax_half_abort`
652/// where `xmax_half_abort` re-grants visibility for tuples whose
653/// deleting transaction rolled back.
654#[inline]
655pub fn entity_visible_under_current_snapshot(
656    entity: &crate::storage::unified::entity::UnifiedEntity,
657) -> bool {
658    // Fast path — one `Cell<bool>` read, no RefCell borrow. Autocommit
659    // reads (no active MVCC transaction) still hide superseded physical
660    // versions while avoiding a full snapshot-context lookup.
661    // This runs on every row of every scan; the slow path only fires
662    // inside an explicit transaction.
663    if !HAS_SNAPSHOT.with(|c| c.get()) {
664        return entity.xmax == 0;
665    }
666    CURRENT_SNAPSHOT.with(|cell| {
667        let guard = cell.borrow();
668        let Some(ctx) = guard.as_ref() else {
669            return true;
670        };
671        visibility_check(ctx, entity.xmin, entity.xmax)
672    })
673}
674
675/// Direct visibility check from raw `(xmin, xmax)` — bypasses the
676/// entity borrow for callers that already decomposed the tuple (e.g.
677/// pre-materialized scan caches). Same semantics as
678/// `entity_visible_under_current_snapshot`.
679#[inline]
680pub(crate) fn xids_visible_under_current_snapshot(xmin: u64, xmax: u64) -> bool {
681    if !HAS_SNAPSHOT.with(|c| c.get()) {
682        return true;
683    }
684    CURRENT_SNAPSHOT.with(|cell| {
685        let guard = cell.borrow();
686        let Some(ctx) = guard.as_ref() else {
687            return true;
688        };
689        visibility_check(ctx, xmin, xmax)
690    })
691}
692
693/// Clone the current thread's snapshot context. Parallel scan paths
694/// (`query_all_zoned` with `std::thread::scope`) call this on the main
695/// thread *before* spawning workers so the captured `SnapshotContext`
696/// can be moved into every worker closure. Worker threads do not
697/// inherit thread-locals, so calling `entity_visible_under_current_snapshot`
698/// from inside a spawned closure would silently skip the filter.
699pub fn capture_current_snapshot() -> Option<SnapshotContext> {
700    CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone())
701}
702
703/// Whether the active read snapshot may need historical tuple versions
704/// that the current secondary indexes cannot prove. Index paths can still
705/// recheck visible candidates, but only a heap scan can discover versions
706/// whose indexed value was changed or deleted after this snapshot.
707pub(crate) fn current_snapshot_requires_index_fallback() -> bool {
708    if !HAS_SNAPSHOT.with(|c| c.get()) {
709        return false;
710    }
711    CURRENT_SNAPSHOT.with(|cell| {
712        cell.borrow()
713            .as_ref()
714            .is_some_and(|ctx| ctx.requires_index_fallback)
715    })
716}
717
/// Frozen MVCC + identity context for callers that need to reinstall
/// the same view across thread-local boundaries — long-lived cursors,
/// background batchers, anything that detaches from the dispatch path
/// and re-enters later.
///
/// The bundle bakes in the three thread-locals every read path
/// consults: `SnapshotContext` (MVCC visibility), the auth identity
/// (RLS policy gate), and the session tenant id (RLS scope). A FETCH
/// that reinstalls the bundle sees exactly the same rows as the DECLARE
/// would have, regardless of writes that landed in between.
///
/// Cloning copies the `SnapshotContext` (its snapshot, a manager `Arc`
/// handle, the `own_xids` set by value and a flag), the
/// `(String, Role)` identity and the tenant `String`. None of these
/// contend with the read path.
///
/// `Default` yields a bundle with no snapshot, no identity and no
/// tenant.
#[derive(Clone, Default)]
pub struct SnapshotBundle {
    /// Captured MVCC snapshot context, if one was installed.
    pub snapshot: Option<SnapshotContext>,
    /// Captured `(username, role)` identity, if one was installed.
    pub auth: Option<(String, crate::auth::Role)>,
    /// Captured session-level tenant id, if one was bound.
    pub tenant: Option<String>,
}
738
739/// Capture the three read-path thread-locals into a `SnapshotBundle`.
740/// Pairs with `with_snapshot_bundle` for re-entry.
741pub fn snapshot_bundle() -> SnapshotBundle {
742    SnapshotBundle {
743        snapshot: capture_current_snapshot(),
744        auth: current_auth_identity(),
745        tenant: CURRENT_TENANT_ID.with(|cell| cell.borrow().clone()),
746    }
747}
748
749/// Reinstall a captured `SnapshotBundle` for the duration of `f`.
750/// Restores the caller's previous thread-locals on exit (panic-safe via
751/// the explicit guard struct so a panic in `f` cannot leak the
752/// installed identity into the worker's next request).
753pub fn with_snapshot_bundle<R>(bundle: &SnapshotBundle, f: impl FnOnce() -> R) -> R {
754    struct Guard {
755        prev_snapshot: Option<SnapshotContext>,
756        prev_auth: Option<(String, crate::auth::Role)>,
757        prev_tenant: Option<String>,
758    }
759    impl Drop for Guard {
760        fn drop(&mut self) {
761            let snap = self.prev_snapshot.take();
762            let has = snap.is_some();
763            CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = snap);
764            HAS_SNAPSHOT.with(|c| c.set(has));
765            CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = self.prev_auth.take());
766            CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = self.prev_tenant.take());
767        }
768    }
769
770    let _guard = {
771        let prev_snapshot = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
772        let prev_auth = CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone());
773        let prev_tenant = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
774
775        match bundle.snapshot.clone() {
776            Some(ctx) => set_current_snapshot(ctx),
777            None => clear_current_snapshot(),
778        }
779        CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = bundle.auth.clone());
780        CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = bundle.tenant.clone());
781
782        Guard {
783            prev_snapshot,
784            prev_auth,
785            prev_tenant,
786        }
787    };
788    f()
789}
790
791/// Apply the same visibility rules used by the thread-local helpers
792/// against a caller-provided context. Intended for parallel workers
793/// that captured the snapshot with `capture_current_snapshot()`.
794#[inline]
795pub fn entity_visible_with_context(
796    ctx: Option<&SnapshotContext>,
797    entity: &crate::storage::unified::entity::UnifiedEntity,
798) -> bool {
799    match ctx {
800        Some(ctx) => visibility_check(ctx, entity.xmin, entity.xmax),
801        None => true,
802    }
803}
804
805fn table_row_index_fields(
806    entity: &crate::storage::unified::entity::UnifiedEntity,
807) -> Vec<(String, crate::storage::schema::Value)> {
808    let crate::storage::EntityData::Row(row) = &entity.data else {
809        return Vec::new();
810    };
811    if let Some(named) = &row.named {
812        return named
813            .iter()
814            .map(|(name, value)| (name.clone(), value.clone()))
815            .collect();
816    }
817    if let Some(schema) = &row.schema {
818        return schema
819            .iter()
820            .zip(row.columns.iter())
821            .map(|(name, value)| (name.clone(), value.clone()))
822            .collect();
823    }
824    Vec::new()
825}
826
827#[inline]
828fn visibility_check(ctx: &SnapshotContext, xmin: u64, xmax: u64) -> bool {
829    // Writer aborted → tuple never existed from any future reader's view.
830    // Checked *before* the own-xids fast path so an aborted own-sub-xid
831    // (rolled-back savepoint) stays hidden from the parent.
832    if xmin != 0 && ctx.manager.is_aborted(xmin) {
833        return false;
834    }
835    // Deleter aborted → treat xmax as unset; fall back to xmin-only check.
836    let effective_xmax = if xmax != 0 && ctx.manager.is_aborted(xmax) {
837        0
838    } else {
839        xmax
840    };
841    // Phase 2.3.2e: own-tx writes are always visible to the connection
842    // that stamped them, even when xmin/xmax exceed `snapshot.xid` (as
843    // happens for sub-xids allocated by SAVEPOINT after BEGIN).
844    let own_xmin = xmin != 0 && ctx.own_xids.contains(&xmin);
845    let own_xmax = effective_xmax != 0 && ctx.own_xids.contains(&effective_xmax);
846    if own_xmax {
847        // This connection deleted the row via this xid — hide it from self.
848        return false;
849    }
850    if own_xmin {
851        return true;
852    }
853    ctx.snapshot.sees(xmin, effective_xmax)
854}
855
856fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
857    runtime
858        .inner
859        .pool
860        .lock()
861        .unwrap_or_else(|poisoned| poisoned.into_inner())
862}
863
864fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
865    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
866        return;
867    }
868    scopes.insert(name.to_string());
869}
870
871fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
872    match query.source.as_ref() {
873        Some(crate::storage::query::ast::TableSource::Name(name)) => {
874            cache_scope_insert(scopes, name)
875        }
876        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
877            collect_query_expr_result_cache_scopes(scopes, subquery);
878        }
879        None => cache_scope_insert(scopes, &query.table),
880    }
881}
882
883fn collect_vector_source_scopes(
884    scopes: &mut HashSet<String>,
885    source: &crate::storage::query::ast::VectorSource,
886) {
887    match source {
888        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
889            cache_scope_insert(scopes, collection);
890        }
891        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
892            collect_query_expr_result_cache_scopes(scopes, subquery);
893        }
894        crate::storage::query::ast::VectorSource::Literal(_)
895        | crate::storage::query::ast::VectorSource::Text(_) => {}
896    }
897}
898
899fn collect_path_selector_scopes(
900    scopes: &mut HashSet<String>,
901    selector: &crate::storage::query::ast::NodeSelector,
902) {
903    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
904        cache_scope_insert(scopes, table);
905    }
906}
907
908fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
909    match expr {
910        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
911        QueryExpr::Join(query) => {
912            collect_query_expr_result_cache_scopes(scopes, &query.left);
913            collect_query_expr_result_cache_scopes(scopes, &query.right);
914        }
915        QueryExpr::Path(query) => {
916            collect_path_selector_scopes(scopes, &query.from);
917            collect_path_selector_scopes(scopes, &query.to);
918        }
919        QueryExpr::Vector(query) => {
920            cache_scope_insert(scopes, &query.collection);
921            collect_vector_source_scopes(scopes, &query.query_vector);
922        }
923        QueryExpr::Hybrid(query) => {
924            collect_query_expr_result_cache_scopes(scopes, &query.structured);
925            cache_scope_insert(scopes, &query.vector.collection);
926            collect_vector_source_scopes(scopes, &query.vector.query_vector);
927        }
928        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
929        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
930        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
931        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
932        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
933        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
934        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
935        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
936        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
937        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
938        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
939        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
940        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
941        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
942        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
943        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
944        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
945        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
946        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
947        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
948        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
949        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
950        QueryExpr::QueueCommand(query) => match query {
951            QueueCommand::Push { queue, .. }
952            | QueueCommand::Pop { queue, .. }
953            | QueueCommand::Peek { queue, .. }
954            | QueueCommand::Len { queue }
955            | QueueCommand::Purge { queue }
956            | QueueCommand::GroupCreate { queue, .. }
957            | QueueCommand::GroupRead { queue, .. }
958            | QueueCommand::Pending { queue, .. }
959            | QueueCommand::Claim { queue, .. }
960            | QueueCommand::Ack { queue, .. }
961            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
962            QueueCommand::Move {
963                source,
964                destination,
965                ..
966            } => {
967                cache_scope_insert(scopes, source);
968                cache_scope_insert(scopes, destination);
969            }
970        },
971        QueryExpr::EventsBackfill(query) => {
972            cache_scope_insert(scopes, &query.collection);
973            cache_scope_insert(scopes, &query.target_queue);
974        }
975        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
976        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
977        QueryExpr::TreeCommand(query) => match query {
978            TreeCommand::Insert { collection, .. }
979            | TreeCommand::Move { collection, .. }
980            | TreeCommand::Delete { collection, .. }
981            | TreeCommand::Validate { collection, .. }
982            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
983        },
984        QueryExpr::SearchCommand(query) => match query {
985            SearchCommand::Similar { collection, .. }
986            | SearchCommand::Hybrid { collection, .. }
987            | SearchCommand::SpatialRadius { collection, .. }
988            | SearchCommand::SpatialBbox { collection, .. }
989            | SearchCommand::SpatialNearest { collection, .. } => {
990                cache_scope_insert(scopes, collection);
991            }
992            SearchCommand::Text { collection, .. }
993            | SearchCommand::Multimodal { collection, .. }
994            | SearchCommand::Index { collection, .. }
995            | SearchCommand::Context { collection, .. } => {
996                if let Some(collection) = collection.as_deref() {
997                    cache_scope_insert(scopes, collection);
998                }
999            }
1000        },
1001        QueryExpr::Ask(query) => {
1002            if let Some(collection) = query.collection.as_deref() {
1003                cache_scope_insert(scopes, collection);
1004            }
1005        }
1006        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
1007        QueryExpr::MaintenanceCommand(cmd) => match cmd {
1008            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
1009            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
1010                if let Some(t) = target {
1011                    cache_scope_insert(scopes, t);
1012                }
1013            }
1014        },
1015        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
1016        QueryExpr::CreateView(cmd) => {
1017            cache_scope_insert(scopes, &cmd.name);
1018            // Invalidating the view should also invalidate its dependencies.
1019            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
1020        }
1021        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
1022        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
1023        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1024        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1025        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
1026        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1027        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1028        QueryExpr::Graph(_)
1029        | QueryExpr::GraphCommand(_)
1030        | QueryExpr::ProbabilisticCommand(_)
1031        | QueryExpr::SetConfig { .. }
1032        | QueryExpr::ShowConfig { .. }
1033        | QueryExpr::SetSecret { .. }
1034        | QueryExpr::DeleteSecret { .. }
1035        | QueryExpr::ShowSecrets { .. }
1036        | QueryExpr::SetTenant(_)
1037        | QueryExpr::ShowTenant
1038        | QueryExpr::TransactionControl(_)
1039        | QueryExpr::CreateSchema(_)
1040        | QueryExpr::DropSchema(_)
1041        | QueryExpr::CreateSequence(_)
1042        | QueryExpr::DropSequence(_)
1043        | QueryExpr::Grant(_)
1044        | QueryExpr::Revoke(_)
1045        | QueryExpr::AlterUser(_)
1046        | QueryExpr::CreateIamPolicy { .. }
1047        | QueryExpr::DropIamPolicy { .. }
1048        | QueryExpr::AttachPolicy { .. }
1049        | QueryExpr::DetachPolicy { .. }
1050        | QueryExpr::ShowPolicies { .. }
1051        | QueryExpr::ShowEffectivePermissions { .. }
1052        | QueryExpr::SimulatePolicy { .. }
1053        | QueryExpr::CreateMigration(_)
1054        | QueryExpr::ApplyMigration(_)
1055        | QueryExpr::RollbackMigration(_)
1056        | QueryExpr::ExplainMigration(_)
1057        | QueryExpr::EventsBackfillStatus { .. } => {}
1058        QueryExpr::KvCommand(cmd) => {
1059            use crate::storage::query::ast::KvCommand;
1060            match cmd {
1061                KvCommand::Put { collection, .. }
1062                | KvCommand::InvalidateTags { collection, .. }
1063                | KvCommand::Get { collection, .. }
1064                | KvCommand::Unseal { collection, .. }
1065                | KvCommand::Rotate { collection, .. }
1066                | KvCommand::History { collection, .. }
1067                | KvCommand::List { collection, .. }
1068                | KvCommand::Purge { collection, .. }
1069                | KvCommand::Watch { collection, .. }
1070                | KvCommand::Delete { collection, .. }
1071                | KvCommand::Incr { collection, .. }
1072                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
1073            }
1074        }
1075        QueryExpr::ConfigCommand(cmd) => {
1076            use crate::storage::query::ast::ConfigCommand;
1077            match cmd {
1078                ConfigCommand::Put { collection, .. }
1079                | ConfigCommand::Get { collection, .. }
1080                | ConfigCommand::Resolve { collection, .. }
1081                | ConfigCommand::Rotate { collection, .. }
1082                | ConfigCommand::Delete { collection, .. }
1083                | ConfigCommand::History { collection, .. }
1084                | ConfigCommand::List { collection, .. }
1085                | ConfigCommand::Watch { collection, .. }
1086                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
1087                    cache_scope_insert(scopes, collection)
1088                }
1089            }
1090        }
1091    }
1092}
1093
1094/// Combine matching RLS policies for a table + action into a single
1095/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1096///
1097/// Returns `None` when RLS is disabled or no policy admits the caller's
1098/// role — callers use that to short-circuit the mutation (for DELETE /
1099/// UPDATE we simply skip the operation, which PG expresses as "no rows
1100/// match the policy + predicate combination").
1101pub(crate) fn rls_policy_filter(
1102    runtime: &RedDBRuntime,
1103    table: &str,
1104    action: crate::storage::query::ast::PolicyAction,
1105) -> Option<crate::storage::query::ast::Filter> {
1106    rls_policy_filter_for_kind(
1107        runtime,
1108        table,
1109        action,
1110        crate::storage::query::ast::PolicyTargetKind::Table,
1111    )
1112}
1113
1114/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1115/// Graph / vector / queue / timeseries scans pass the concrete kind;
1116/// policies targeting other kinds are ignored. Legacy Table-scoped
1117/// policies still apply cross-kind — callers register auto-tenancy
1118/// policies as Table today.
1119pub(crate) fn rls_policy_filter_for_kind(
1120    runtime: &RedDBRuntime,
1121    table: &str,
1122    action: crate::storage::query::ast::PolicyAction,
1123    kind: crate::storage::query::ast::PolicyTargetKind,
1124) -> Option<crate::storage::query::ast::Filter> {
1125    use crate::storage::query::ast::Filter;
1126
1127    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1128        return None;
1129    }
1130    let role = current_auth_identity().map(|(_, role)| role);
1131    let role_str = role.map(|r| r.as_str().to_string());
1132    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1133    if policies.is_empty() {
1134        return None;
1135    }
1136    policies
1137        .into_iter()
1138        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1139}
1140
1141/// Returns true when the table has RLS enforcement enabled. Convenience
1142/// shortcut so DML paths can gate the AND-combine work without reaching
1143/// into `runtime.inner.rls_enabled_tables` directly.
1144pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1145    runtime.inner.rls_enabled_tables.read().contains(table)
1146}
1147
1148/// Per-entity gate used by the graph materialiser for `GraphNode`
1149/// entities. RLS is checked against the source collection with
1150/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1151/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1152/// (for back-compat with auto-tenancy declarations). Cached per
1153/// collection so big graphs only resolve the policy chain once.
1154fn node_passes_rls(
1155    runtime: &RedDBRuntime,
1156    collection: &str,
1157    role: Option<&str>,
1158    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1159    entity: &crate::storage::unified::entity::UnifiedEntity,
1160) -> bool {
1161    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1162
1163    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1164        return true;
1165    }
1166    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1167        let policies = runtime.matching_rls_policies_for_kind(
1168            collection,
1169            role,
1170            PolicyAction::Select,
1171            PolicyTargetKind::Nodes,
1172        );
1173        if policies.is_empty() {
1174            None
1175        } else {
1176            policies
1177                .into_iter()
1178                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1179        }
1180    });
1181    let Some(filter) = filter else {
1182        return false;
1183    };
1184    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1185        Some(&runtime.inner.db),
1186        entity,
1187        filter,
1188        collection,
1189        collection,
1190    )
1191}
1192
1193/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1194/// `kind = Edges`.
1195fn edge_passes_rls(
1196    runtime: &RedDBRuntime,
1197    collection: &str,
1198    role: Option<&str>,
1199    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1200    entity: &crate::storage::unified::entity::UnifiedEntity,
1201) -> bool {
1202    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1203
1204    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1205        return true;
1206    }
1207    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1208        let policies = runtime.matching_rls_policies_for_kind(
1209            collection,
1210            role,
1211            PolicyAction::Select,
1212            PolicyTargetKind::Edges,
1213        );
1214        if policies.is_empty() {
1215            None
1216        } else {
1217            policies
1218                .into_iter()
1219                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1220        }
1221    });
1222    let Some(filter) = filter else {
1223        return false;
1224    };
1225    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1226        Some(&runtime.inner.db),
1227        entity,
1228        filter,
1229        collection,
1230        collection,
1231    )
1232}
1233
1234/// RLS policy injection (Phase 2.5.2 PG parity).
1235///
1236/// Fetch every matching policy for the current thread-local role and
1237/// fold them into the query's filter. Semantics mirror PostgreSQL:
1238///
1239/// * Multiple policies on the same table combine with **OR** — a row is
1240///   visible if *any* policy admits it.
1241/// * The combined policy predicate is **AND**-ed into the caller's
1242///   existing `WHERE` clause so explicit predicates continue to trim
1243///   the policy-allowed set.
1244/// * No matching policies + RLS enabled = zero rows (PG's
1245///   restrictive-default). Callers get `None` and return an empty
1246///   `UnifiedResult` without ever dispatching the scan.
1247///
1248/// This runs only when `RuntimeInner::rls_enabled_tables` already
1249/// contains the table name — callers gate the hot path upfront to
1250/// avoid the lock acquisition on tables without RLS.
1251///
1252/// Returns `None` when no policy admits the current role; returns
1253/// `Some(mutated_table)` with policy filters folded in otherwise.
1254fn inject_rls_filters(
1255    runtime: &RedDBRuntime,
1256    frame: &dyn super::statement_frame::ReadFrame,
1257    mut table: crate::storage::query::ast::TableQuery,
1258) -> Option<crate::storage::query::ast::TableQuery> {
1259    use crate::storage::query::ast::{Filter, PolicyAction};
1260
1261    // `None` role falls through to policies with no `TO role` clause.
1262    let role = frame.identity().map(|(_, role)| role);
1263    let role_str = role.map(|r| r.as_str().to_string());
1264    let policies =
1265        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1266
1267    if policies.is_empty() {
1268        // RLS enabled + no policy match = deny everything. Signal the
1269        // caller to short-circuit with an empty result set.
1270        return None;
1271    }
1272
1273    // Combine policy predicates with OR (PG's permissive default).
1274    let combined = policies
1275        .into_iter()
1276        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1277        .expect("policies non-empty");
1278
1279    // AND into the caller's existing filter.
1280    table.filter = Some(match table.filter.take() {
1281        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1282        None => combined,
1283    });
1284    Some(table)
1285}
1286
1287/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1288/// predicate into the join's outer filter. Walking the merged record
1289/// at the join layer (rather than mutating the per-side scan filter)
1290/// keeps the planner's strategy choice and per-side index selection
1291/// undisturbed — the policy predicate uses the qualified `t.col` form
1292/// that resolves cleanly against the merged record's keys.
1293///
1294/// Returns `None` when any leaf has RLS enabled and no policy admits
1295/// the caller — the join short-circuits to an empty result.
1296fn inject_rls_into_join(
1297    runtime: &RedDBRuntime,
1298    frame: &dyn super::statement_frame::ReadFrame,
1299    mut join: crate::storage::query::ast::JoinQuery,
1300) -> Option<crate::storage::query::ast::JoinQuery> {
1301    use crate::storage::query::ast::Filter;
1302
1303    let mut policy_filters: Vec<Filter> = Vec::new();
1304    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1305        return None;
1306    }
1307    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1308        return None;
1309    }
1310
1311    if policy_filters.is_empty() {
1312        return Some(join);
1313    }
1314
1315    let combined = policy_filters
1316        .into_iter()
1317        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1318        .expect("policy_filters non-empty");
1319
1320    join.filter = Some(match join.filter.take() {
1321        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1322        None => combined,
1323    });
1324
1325    Some(join)
1326}
1327
1328/// For each `Table` leaf reachable through nested joins, append the
1329/// RLS-policy filter (combined with OR across that side's matching
1330/// policies) into `out`. Returns `false` when a side has RLS enabled
1331/// but no policy admits the caller — the join must short-circuit.
1332fn collect_join_side_policy(
1333    runtime: &RedDBRuntime,
1334    frame: &dyn super::statement_frame::ReadFrame,
1335    expr: &crate::storage::query::ast::QueryExpr,
1336    out: &mut Vec<crate::storage::query::ast::Filter>,
1337) -> bool {
1338    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1339    match expr {
1340        QueryExpr::Table(t) => {
1341            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1342                return true;
1343            }
1344            let role = frame.identity().map(|(_, role)| role);
1345            let role_str = role.map(|r| r.as_str().to_string());
1346            let policies =
1347                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1348            if policies.is_empty() {
1349                return false;
1350            }
1351            let combined = policies
1352                .into_iter()
1353                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1354                .expect("policies non-empty");
1355            out.push(combined);
1356            true
1357        }
1358        QueryExpr::Join(inner) => {
1359            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1360                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1361        }
1362        _ => true,
1363    }
1364}
1365
1366/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
1367///
1368/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
1369/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
1370/// materialises all rows. Projections are best-effort — when the query
1371/// lists explicit columns we keep only those; a `SELECT *` keeps every
1372/// wrapper-emitted field verbatim.
1373///
1374/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
1375/// the runtime will pass the compiled filter down instead of post-filtering.
1376fn apply_foreign_table_filters(
1377    records: Vec<crate::storage::query::unified::UnifiedRecord>,
1378    query: &crate::storage::query::ast::TableQuery,
1379) -> crate::storage::query::unified::UnifiedResult {
1380    use crate::storage::query::sql_lowering::{
1381        effective_table_filter, effective_table_projections,
1382    };
1383    use crate::storage::query::unified::UnifiedResult;
1384
1385    let filter = effective_table_filter(query);
1386    let projections = effective_table_projections(query);
1387
1388    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
1389    // match native-collection queries (same operators, same NULL handling).
1390    let mut filtered: Vec<_> = records
1391        .into_iter()
1392        .filter(|record| match &filter {
1393            Some(f) => {
1394                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
1395            }
1396            None => true,
1397        })
1398        .collect();
1399
1400    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
1401    if let Some(offset) = query.offset {
1402        let offset = offset as usize;
1403        if offset >= filtered.len() {
1404            filtered.clear();
1405        } else {
1406            filtered.drain(0..offset);
1407        }
1408    }
1409    if let Some(limit) = query.limit {
1410        filtered.truncate(limit as usize);
1411    }
1412
1413    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
1414    // the wrapper's column set; an explicit list trims to those names.
1415    let columns: Vec<String> = if projections.is_empty() {
1416        filtered
1417            .first()
1418            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
1419            .unwrap_or_default()
1420    } else {
1421        projections
1422            .iter()
1423            .map(super::join_filter::projection_name)
1424            .collect()
1425    };
1426
1427    let mut result = UnifiedResult::empty();
1428    result.columns = columns;
1429    result.records = filtered;
1430    result
1431}
1432
1433/// Collect every concrete table reference inside a `QueryExpr`.
1434///
1435/// Used by view bookkeeping (dependency tracking for materialised
1436/// invalidation) and any other rewriter that needs to know the base
1437/// tables a query pulls from. Does not descend into projections/filters;
1438/// only the `FROM` side.
1439pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1440    let mut scopes: HashSet<String> = HashSet::new();
1441    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1442    scopes.into_iter().collect()
1443}
1444
1445fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1446    let mut scopes = HashSet::new();
1447    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1448    scopes
1449}
1450
// Result-cache tuning constants.
// NOTE(review): apart from `RESULT_CACHE_MAX_ENTRIES`, the use sites are
// outside this excerpt; the descriptions below are inferred from names
// and values and should be confirmed there.

/// Key naming the result-cache backend setting (presumably a config key).
const RESULT_CACHE_BACKEND_KEY: &str = "runtime.result_cache.backend";
/// Backend name presumably used when the setting above is absent.
const RESULT_CACHE_DEFAULT_BACKEND: &str = "legacy";
/// Namespace string, presumably for entries stored via the blob cache.
const RESULT_CACHE_BLOB_NAMESPACE: &str = "runtime.result_cache";
/// Presumably the lifetime of a cached result, in seconds.
const RESULT_CACHE_TTL_SECS: u64 = 30;
/// Entry count above which `trim_result_cache` starts evicting.
const RESULT_CACHE_MAX_ENTRIES: usize = 1000;
/// 8-byte magic tag, presumably prefixed to serialized cache payloads.
const RESULT_CACHE_PAYLOAD_MAGIC: &[u8; 8] = b"RDRC0001";
1457
/// Selector for which result-cache implementation the runtime uses.
// NOTE(review): variant meanings are inferred from their names; the code
// that interprets them is outside this excerpt — confirm there.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RuntimeResultCacheBackend {
    /// Presumably the original in-process map-based cache.
    Legacy,
    /// Presumably the blob-cache-backed implementation.
    BlobCache,
    /// Presumably runs both backends side by side for comparison.
    Shadow,
}
1464
1465fn trim_result_cache(
1466    map: &mut HashMap<String, RuntimeResultCacheEntry>,
1467    order: &mut std::collections::VecDeque<String>,
1468) {
1469    while map.len() > RESULT_CACHE_MAX_ENTRIES {
1470        if let Some(oldest) = order.pop_front() {
1471            map.remove(&oldest);
1472        } else {
1473            break;
1474        }
1475    }
1476}
1477
1478fn result_cache_fingerprint(result: &RuntimeQueryResult) -> String {
1479    format!(
1480        "{:?}|{}|{}|{}|{}|{:?}",
1481        result.result,
1482        result.query,
1483        result.statement,
1484        result.engine,
1485        result.affected_rows,
1486        result.statement_type
1487    )
1488}
1489
1490fn mode_to_byte(mode: crate::storage::query::modes::QueryMode) -> u8 {
1491    match mode {
1492        crate::storage::query::modes::QueryMode::Sql => 0,
1493        crate::storage::query::modes::QueryMode::Gremlin => 1,
1494        crate::storage::query::modes::QueryMode::Cypher => 2,
1495        crate::storage::query::modes::QueryMode::Sparql => 3,
1496        crate::storage::query::modes::QueryMode::Path => 4,
1497        crate::storage::query::modes::QueryMode::Natural => 5,
1498        crate::storage::query::modes::QueryMode::Unknown => 255,
1499    }
1500}
1501
1502fn mode_from_byte(byte: u8) -> Option<crate::storage::query::modes::QueryMode> {
1503    match byte {
1504        0 => Some(crate::storage::query::modes::QueryMode::Sql),
1505        1 => Some(crate::storage::query::modes::QueryMode::Gremlin),
1506        2 => Some(crate::storage::query::modes::QueryMode::Cypher),
1507        3 => Some(crate::storage::query::modes::QueryMode::Sparql),
1508        4 => Some(crate::storage::query::modes::QueryMode::Path),
1509        5 => Some(crate::storage::query::modes::QueryMode::Natural),
1510        255 => Some(crate::storage::query::modes::QueryMode::Unknown),
1511        _ => None,
1512    }
1513}
1514
/// Maps a label to the matching `'static` string from the fixed set of
/// labels the result-cache payload codec accepts.
///
/// The match is exact (case-sensitive). Returns `None` for any label
/// outside the set.
fn result_cache_static_str(value: &str) -> Option<&'static str> {
    const KNOWN_LABELS: &[&str] = &[
        "select",
        "materialized-graph",
        "runtime-red-schema",
        "runtime-fdw",
        "runtime-table-rls",
        "runtime-table",
        "runtime-join-rls",
        "runtime-join",
        "runtime-vector",
        "runtime-hybrid",
        "runtime-secret",
        "runtime-config",
        "runtime-tenant",
        "runtime-explain",
        "runtime-tree",
        "runtime-kv",
        "runtime-queue",
    ];

    KNOWN_LABELS.iter().copied().find(|label| *label == value)
}
1537
/// Appends `value` to `out` as a little-endian `u32`.
///
/// Returns `None`, leaving `out` untouched, when `value` does not fit in
/// 32 bits.
fn write_u32(out: &mut Vec<u8>, value: usize) -> Option<()> {
    match u32::try_from(value) {
        Ok(narrowed) => {
            out.extend_from_slice(&narrowed.to_le_bytes());
            Some(())
        }
        Err(_) => None,
    }
}
1543
1544fn write_string(out: &mut Vec<u8>, value: &str) -> Option<()> {
1545    write_u32(out, value.len())?;
1546    out.extend_from_slice(value.as_bytes());
1547    Some(())
1548}
1549
1550fn write_bytes(out: &mut Vec<u8>, value: &[u8]) -> Option<()> {
1551    write_u32(out, value.len())?;
1552    out.extend_from_slice(value);
1553    Some(())
1554}
1555
/// Pops one byte off the front of the cursor.
///
/// Returns `None`, leaving the cursor unchanged, when it is empty.
fn read_u8(input: &mut &[u8]) -> Option<u8> {
    let byte = *input.first()?;
    *input = &input[1..];
    Some(byte)
}
1561
/// Reads a little-endian `u32` from the front of the cursor and returns
/// it widened to `usize`.
///
/// Returns `None`, leaving the cursor unchanged, when fewer than four
/// bytes remain.
fn read_u32(input: &mut &[u8]) -> Option<usize> {
    let head: [u8; 4] = input.get(..4)?.try_into().ok()?;
    *input = &input[4..];
    Some(u32::from_le_bytes(head) as usize)
}
1570
/// Reads a little-endian `u64` from the front of the cursor.
///
/// Returns `None`, leaving the cursor unchanged, when fewer than eight
/// bytes remain.
fn read_u64(input: &mut &[u8]) -> Option<u64> {
    let head: [u8; 8] = input.get(..8)?.try_into().ok()?;
    *input = &input[8..];
    Some(u64::from_le_bytes(head))
}
1579
1580fn read_string(input: &mut &[u8]) -> Option<String> {
1581    let len = read_u32(input)?;
1582    if input.len() < len {
1583        return None;
1584    }
1585    let value = String::from_utf8(input[..len].to_vec()).ok()?;
1586    *input = &input[len..];
1587    Some(value)
1588}
1589
1590fn read_bytes<'a>(input: &mut &'a [u8]) -> Option<&'a [u8]> {
1591    let len = read_u32(input)?;
1592    if input.len() < len {
1593        return None;
1594    }
1595    let value = &input[..len];
1596    *input = &input[len..];
1597    Some(value)
1598}
1599
1600fn encode_result_cache_payload(entry: &RuntimeResultCacheEntry) -> Option<Vec<u8>> {
1601    let result = &entry.result;
1602    if result.result.pre_serialized_json.is_some()
1603        || result_cache_static_str(result.statement).is_none()
1604        || result_cache_static_str(result.engine).is_none()
1605        || result_cache_static_str(result.statement_type).is_none()
1606        || result.result.records.iter().any(|record| {
1607            !record.nodes.is_empty()
1608                || !record.edges.is_empty()
1609                || !record.paths.is_empty()
1610                || !record.vector_results.is_empty()
1611        })
1612    {
1613        return None;
1614    }
1615
1616    let mut out = Vec::new();
1617    out.extend_from_slice(RESULT_CACHE_PAYLOAD_MAGIC);
1618    write_string(&mut out, &result.query)?;
1619    out.push(mode_to_byte(result.mode));
1620    write_string(&mut out, result.statement)?;
1621    write_string(&mut out, result.engine)?;
1622    out.extend_from_slice(&result.affected_rows.to_le_bytes());
1623    write_string(&mut out, result.statement_type)?;
1624
1625    write_u32(&mut out, result.result.columns.len())?;
1626    for column in &result.result.columns {
1627        write_string(&mut out, column)?;
1628    }
1629    out.extend_from_slice(&result.result.stats.nodes_scanned.to_le_bytes());
1630    out.extend_from_slice(&result.result.stats.edges_scanned.to_le_bytes());
1631    out.extend_from_slice(&result.result.stats.rows_scanned.to_le_bytes());
1632    out.extend_from_slice(&result.result.stats.exec_time_us.to_le_bytes());
1633
1634    write_u32(&mut out, result.result.records.len())?;
1635    for record in &result.result.records {
1636        let fields = record.iter_fields().collect::<Vec<_>>();
1637        write_u32(&mut out, fields.len())?;
1638        for (name, value) in fields {
1639            write_string(&mut out, name)?;
1640            let mut encoded = Vec::new();
1641            crate::storage::schema::value_codec::encode(value, &mut encoded);
1642            write_bytes(&mut out, &encoded)?;
1643        }
1644    }
1645
1646    write_u32(&mut out, entry.scopes.len())?;
1647    for scope in &entry.scopes {
1648        write_string(&mut out, scope)?;
1649    }
1650    Some(out)
1651}
1652
1653fn decode_result_cache_payload(mut input: &[u8]) -> Option<(RuntimeQueryResult, HashSet<String>)> {
1654    if input.len() < RESULT_CACHE_PAYLOAD_MAGIC.len()
1655        || &input[..RESULT_CACHE_PAYLOAD_MAGIC.len()] != RESULT_CACHE_PAYLOAD_MAGIC
1656    {
1657        return None;
1658    }
1659    input = &input[RESULT_CACHE_PAYLOAD_MAGIC.len()..];
1660
1661    let query = read_string(&mut input)?;
1662    let mode = mode_from_byte(read_u8(&mut input)?)?;
1663    let statement = result_cache_static_str(&read_string(&mut input)?)?;
1664    let engine = result_cache_static_str(&read_string(&mut input)?)?;
1665    let affected_rows = read_u64(&mut input)?;
1666    let statement_type = result_cache_static_str(&read_string(&mut input)?)?;
1667
1668    let mut columns = Vec::new();
1669    for _ in 0..read_u32(&mut input)? {
1670        columns.push(read_string(&mut input)?);
1671    }
1672    let stats = crate::storage::query::unified::QueryStats {
1673        nodes_scanned: read_u64(&mut input)?,
1674        edges_scanned: read_u64(&mut input)?,
1675        rows_scanned: read_u64(&mut input)?,
1676        exec_time_us: read_u64(&mut input)?,
1677    };
1678
1679    let mut records = Vec::new();
1680    for _ in 0..read_u32(&mut input)? {
1681        let mut record = crate::storage::query::unified::UnifiedRecord::new();
1682        for _ in 0..read_u32(&mut input)? {
1683            let name = read_string(&mut input)?;
1684            let bytes = read_bytes(&mut input)?;
1685            let (value, used) = crate::storage::schema::value_codec::decode(bytes).ok()?;
1686            if used != bytes.len() {
1687                return None;
1688            }
1689            record.set_owned(name, value);
1690        }
1691        records.push(record);
1692    }
1693
1694    let mut scopes = HashSet::new();
1695    for _ in 0..read_u32(&mut input)? {
1696        scopes.insert(read_string(&mut input)?);
1697    }
1698    if !input.is_empty() {
1699        return None;
1700    }
1701
1702    Some((
1703        RuntimeQueryResult {
1704            query,
1705            mode,
1706            statement,
1707            engine,
1708            result: crate::storage::query::unified::UnifiedResult {
1709                columns,
1710                records,
1711                stats,
1712                pre_serialized_json: None,
1713            },
1714            affected_rows,
1715            statement_type,
1716        },
1717        scopes,
1718    ))
1719}
1720
/// Splits a leading `EXPLAIN` keyword off `sql`.
///
/// Returns the rest of the statement, with leading whitespace removed,
/// when the first whitespace-delimited token is `EXPLAIN` (ASCII
/// case-insensitive) and the token after it is neither `ALTER` nor
/// `ASK`. Returns `None` when the statement does not start with
/// `EXPLAIN`, when nothing follows the keyword, or when the next token
/// is `ALTER` / `ASK`.
///
/// `EXPLAIN ALTER FOR CREATE TABLE ...` is a separate schema-diff
/// command handled inside the normal SQL parser. `EXPLAIN ASK` is an
/// executable read path: it runs retrieval and provider selection, then
/// short-circuits before the LLM call. Both are left to the normal
/// parser.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    let statement = sql.trim_start();
    let (keyword, tail) = statement
        .split_once(char::is_whitespace)
        .unwrap_or((statement, ""));
    if !keyword.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    let inner = tail.trim_start();
    let deferred_to_parser = match inner.split_whitespace().next() {
        // Nothing after the keyword: there is no inner statement.
        None => return None,
        Some(token) => {
            token.eq_ignore_ascii_case("ALTER") || token.eq_ignore_ascii_case("ASK")
        }
    };
    if deferred_to_parser {
        None
    } else {
        Some(inner)
    }
}
1759
1760/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1761/// CTE-aware parse in `execute_query` without paying for a full
1762/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1763/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1764pub(super) fn has_with_prefix(sql: &str) -> bool {
1765    let trimmed = sql.trim_start();
1766    let head_end = trimmed
1767        .find(|c: char| c.is_whitespace() || c == '(')
1768        .unwrap_or(trimmed.len());
1769    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1770}
1771
1772/// If the query is a plain SELECT whose top-level `TableQuery`
1773/// carries an `AS OF` clause, return a typed spec that the runtime
1774/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1775/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1776/// back to the connection's regular MVCC snapshot. A cheap textual
1777/// prefilter skips the parse entirely when the source doesn't
1778/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
1779fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1780    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1781}
1782
1783/// Same as `peek_top_level_as_of` but also returns the table name
1784/// targeted by the AS OF clause (when the FROM clause names a
1785/// concrete table). `None` for the table slot means scalar SELECT
1786/// or a subquery source — callers treat those as "no enforcement".
1787pub(super) fn peek_top_level_as_of_with_table(
1788    sql: &str,
1789) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
1790    if !sql
1791        .as_bytes()
1792        .windows(5)
1793        .any(|w| w.eq_ignore_ascii_case(b"as of"))
1794    {
1795        return None;
1796    }
1797    let parsed = crate::storage::query::parser::parse(sql).ok()?;
1798    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
1799        return None;
1800    };
1801    let clause = table.as_of?;
1802    let table_name = if table.table.is_empty() || table.table == "any" {
1803        None
1804    } else {
1805        Some(table.table.clone())
1806    };
1807    let spec = match clause {
1808        crate::storage::query::ast::AsOfClause::Commit(h) => {
1809            crate::application::vcs::AsOfSpec::Commit(h)
1810        }
1811        crate::storage::query::ast::AsOfClause::Branch(b) => {
1812            crate::application::vcs::AsOfSpec::Branch(b)
1813        }
1814        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
1815        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
1816            crate::application::vcs::AsOfSpec::TimestampMs(ts)
1817        }
1818        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
1819            crate::application::vcs::AsOfSpec::Snapshot(x)
1820        }
1821    };
1822    Some((spec, table_name))
1823}
1824
1825pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
1826    // Lowercase the bytes up to the first null/newline into a small
1827    // stack buffer for cheap contains() checks. Most SQL fits in the
1828    // buffer; longer queries fall back to owned lowercase.
1829    const VOLATILE_TOKENS: &[&str] = &[
1830        "pg_advisory_lock",
1831        "pg_try_advisory_lock",
1832        "pg_advisory_unlock",
1833        "random()",
1834        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
1835        // omitted for now — they ARE volatile but today's tests rely
1836        // on caching them. Revisit once a tighter volatility story
1837        // lands.
1838    ];
1839    let lowered = sql.to_ascii_lowercase();
1840    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
1841}
1842
1843pub(super) fn query_is_ask_statement(sql: &str) -> bool {
1844    let trimmed = sql.trim_start();
1845    let head_end = trimmed
1846        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
1847        .unwrap_or(trimmed.len());
1848    trimmed[..head_end].eq_ignore_ascii_case("ASK")
1849}
1850
1851/// Pick the `(global_mode, collection_mode)` pair for an expression,
1852/// or `None` for variants that opt out of intent-locking entirely
1853/// (admin statements like `SHOW CONFIG`, transaction control, tenant
1854/// toggles).
1855///
1856/// Phase-1 contract:
1857/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
1858/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
1859/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
1860pub(super) fn intent_lock_modes_for(
1861    expr: &QueryExpr,
1862) -> Option<(
1863    crate::storage::transaction::lock::LockMode,
1864    crate::storage::transaction::lock::LockMode,
1865)> {
1866    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
1867
1868    match expr {
1869        // Reads — IS / IS.
1870        QueryExpr::Table(_)
1871        | QueryExpr::Join(_)
1872        | QueryExpr::Vector(_)
1873        | QueryExpr::Hybrid(_)
1874        | QueryExpr::Graph(_)
1875        | QueryExpr::Path(_)
1876        | QueryExpr::Ask(_)
1877        | QueryExpr::SearchCommand(_)
1878        | QueryExpr::GraphCommand(_)
1879        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
1880
1881        // Writes — IX / IX. Non-tabular mutations (vector insert,
1882        // graph node insert, queue push, timeseries point insert)
1883        // don't carry their own dispatch arm here; they ride through
1884        // the Insert variant or a command variant covered by the
1885        // read-side arm above. P1.T4 expands only the TableQuery-ish
1886        // writes; non-tabular kinds inherit when their DML variants
1887        // land in later phases.
1888        QueryExpr::Insert(_)
1889        | QueryExpr::Update(_)
1890        | QueryExpr::Delete(_)
1891        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
1892            Some((IntentExclusive, IntentExclusive))
1893        }
1894        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
1895
1896        // DDL — IX / X. A DDL against collection `c` blocks all
1897        // other writers + readers on `c` but leaves other collections
1898        // running (because Global stays IX, not X).
1899        QueryExpr::CreateTable(_)
1900        | QueryExpr::CreateCollection(_)
1901        | QueryExpr::CreateVector(_)
1902        | QueryExpr::DropTable(_)
1903        | QueryExpr::DropGraph(_)
1904        | QueryExpr::DropVector(_)
1905        | QueryExpr::DropDocument(_)
1906        | QueryExpr::DropKv(_)
1907        | QueryExpr::DropCollection(_)
1908        | QueryExpr::Truncate(_)
1909        | QueryExpr::AlterTable(_)
1910        | QueryExpr::CreateIndex(_)
1911        | QueryExpr::DropIndex(_)
1912        | QueryExpr::CreateTimeSeries(_)
1913        | QueryExpr::DropTimeSeries(_)
1914        | QueryExpr::CreateQueue(_)
1915        | QueryExpr::AlterQueue(_)
1916        | QueryExpr::DropQueue(_)
1917        | QueryExpr::CreateTree(_)
1918        | QueryExpr::DropTree(_)
1919        | QueryExpr::CreatePolicy(_)
1920        | QueryExpr::DropPolicy(_)
1921        | QueryExpr::CreateView(_)
1922        | QueryExpr::DropView(_)
1923        | QueryExpr::RefreshMaterializedView(_)
1924        | QueryExpr::CreateSchema(_)
1925        | QueryExpr::DropSchema(_)
1926        | QueryExpr::CreateSequence(_)
1927        | QueryExpr::DropSequence(_)
1928        | QueryExpr::CreateServer(_)
1929        | QueryExpr::DropServer(_)
1930        | QueryExpr::CreateForeignTable(_)
1931        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
1932
1933        // Admin / control — skip intent locks. `SET TENANT`,
1934        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
1935        // `VACUUM`, etc. don't touch collection data the same way
1936        // and the existing transaction layer already serialises the
1937        // pieces that matter.
1938        _ => None,
1939    }
1940}
1941
1942/// Best-effort collection inventory for an expression. Used to pick
1943/// `Collection(...)` resources for the intent-lock guard. Overshoots
1944/// are fine (take an extra IS, benign); undershoots leak writes past
1945/// DDL X locks, so err on the side of listing more names.
1946pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
1947    let mut out = Vec::new();
1948    walk_collections(expr, &mut out);
1949    out.sort();
1950    out.dedup();
1951    out
1952}
1953
1954fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
1955    match expr {
1956        QueryExpr::Table(t) => out.push(t.table.clone()),
1957        QueryExpr::Join(j) => {
1958            walk_collections(&j.left, out);
1959            walk_collections(&j.right, out);
1960        }
1961        QueryExpr::Insert(i) => out.push(i.table.clone()),
1962        QueryExpr::Update(u) => out.push(u.table.clone()),
1963        QueryExpr::Delete(d) => out.push(d.table.clone()),
1964        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
1965
1966        // DDL — include the target collection so DDL takes
1967        // `(Collection, X)` and blocks concurrent readers / writers
1968        // on the same collection. Other collections stay live
1969        // because Global is still IX.
1970        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
1971        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
1972        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
1973        QueryExpr::DropTable(q) => out.push(q.name.clone()),
1974        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
1975        QueryExpr::DropVector(q) => out.push(q.name.clone()),
1976        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
1977        QueryExpr::DropKv(q) => out.push(q.name.clone()),
1978        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
1979        QueryExpr::Truncate(q) => out.push(q.name.clone()),
1980        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
1981        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
1982        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
1983        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
1984        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
1985        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
1986        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
1987        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
1988        QueryExpr::QueueCommand(QueueCommand::Move {
1989            source,
1990            destination,
1991            ..
1992        }) => {
1993            out.push(source.clone());
1994            out.push(destination.clone());
1995        }
1996        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
1997        QueryExpr::CreateView(q) => out.push(q.name.clone()),
1998        QueryExpr::DropView(q) => out.push(q.name.clone()),
1999        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
2000
2001        // Vector / Hybrid / Graph / Path / commands reference
2002        // collections through fields whose shape varies; without a
2003        // uniform accessor we fall back to the global lock only —
2004        // benign because every runtime path still holds the global
2005        // mode.
2006        _ => {}
2007    }
2008}
2009
2010impl RedDBRuntime {
2011    pub fn in_memory() -> RedDBResult<Self> {
2012        Self::with_options(RedDBOptions::in_memory())
2013    }
2014
2015    /// Handle to the intent-lock manager for tests + introspection.
2016    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
2017    /// rather than touching the manager directly.
2018    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
2019        self.inner.lock_manager.clone()
2020    }
2021
2022    #[inline(never)]
2023    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
2024        Self::with_pool(options, ConnectionPoolConfig::default())
2025    }
2026
2027    pub fn with_pool(
2028        options: RedDBOptions,
2029        pool_config: ConnectionPoolConfig,
2030    ) -> RedDBResult<Self> {
2031        // PLAN.md Phase 9.1 — capture wall-clock before storage
2032        // open so the cold-start phase markers can be backfilled
2033        // once Lifecycle is constructed below. Storage open
2034        // encapsulates auto-restore + WAL replay; we treat the
2035        // whole window as one combined "restore" + "wal_replay"
2036        // phase split at the same boundary because the storage
2037        // layer doesn't yet emit a finer signal.
2038        let boot_open_start_ms = std::time::SystemTime::now()
2039            .duration_since(std::time::UNIX_EPOCH)
2040            .map(|d| d.as_millis() as u64)
2041            .unwrap_or(0);
2042        let db = Arc::new(
2043            RedDB::open_with_options(&options)
2044                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2045        );
2046        let result_blob_cache = crate::storage::cache::BlobCache::open_with_l2(
2047            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2048                options
2049                    .resolved_path("data.rdb")
2050                    .with_extension("result-cache.l2"),
2051            ),
2052        )
2053        .map_err(|err| {
2054            RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}"))
2055        })?;
2056        let storage_ready_ms = std::time::SystemTime::now()
2057            .duration_since(std::time::UNIX_EPOCH)
2058            .map(|d| d.as_millis() as u64)
2059            .unwrap_or(0);
2060
2061        let runtime = Self {
2062            inner: Arc::new(RuntimeInner {
2063                db,
2064                layout: PhysicalLayout::from_options(&options),
2065                indices: IndexCatalog::register_default_vector_graph(
2066                    options.has_capability(crate::api::Capability::Table),
2067                    options.has_capability(crate::api::Capability::Graph),
2068                ),
2069                pool_config,
2070                pool: Mutex::new(PoolState::default()),
2071                started_at_unix_ms: SystemTime::now()
2072                    .duration_since(UNIX_EPOCH)
2073                    .unwrap_or_default()
2074                    .as_millis(),
2075                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2076                index_store: super::index_store::IndexStore::new(),
2077                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2078                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2079                query_cache: parking_lot::RwLock::new(
2080                    crate::storage::query::planner::cache::PlanCache::new(1000),
2081                ),
2082                result_cache: parking_lot::RwLock::new((
2083                    HashMap::new(),
2084                    std::collections::VecDeque::new(),
2085                )),
2086                result_blob_cache,
2087                result_blob_entries: parking_lot::RwLock::new((
2088                    HashMap::new(),
2089                    std::collections::VecDeque::new(),
2090                )),
2091                ask_answer_cache_entries: parking_lot::RwLock::new((
2092                    HashSet::new(),
2093                    std::collections::VecDeque::new(),
2094                )),
2095                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2096                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2097                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2098                rmw_locks: RmwLockTable::new(),
2099                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2100                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2101                ec_worker: crate::ec::worker::EcWorker::new(),
2102                auth_store: parking_lot::RwLock::new(None),
2103                oauth_validator: parking_lot::RwLock::new(None),
2104                views: parking_lot::RwLock::new(HashMap::new()),
2105                materialized_views: parking_lot::RwLock::new(
2106                    crate::storage::cache::result::MaterializedViewCache::new(),
2107                ),
2108                retention_sweeper: parking_lot::RwLock::new(
2109                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
2110                ),
2111                snapshot_manager: Arc::new(
2112                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2113                ),
2114                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2115                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2116                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2117                lock_manager: Arc::new({
2118                    // Sourced from the matrix: Tier B key
2119                    // `concurrency.locking.deadlock_timeout_ms`
2120                    // (default 5000). Env var wins at boot so
2121                    // operators can tune without touching red_config.
2122                    let env = crate::runtime::config_overlay::collect_env_overrides();
2123                    let timeout_ms = env
2124                        .get("concurrency.locking.deadlock_timeout_ms")
2125                        .and_then(|raw| raw.parse::<u64>().ok())
2126                        .unwrap_or_else(|| {
2127                            match crate::runtime::config_matrix::default_for(
2128                                "concurrency.locking.deadlock_timeout_ms",
2129                            ) {
2130                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2131                                _ => 5000,
2132                            }
2133                        });
2134                    let cfg = crate::storage::transaction::lock::LockConfig {
2135                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2136                        ..Default::default()
2137                    };
2138                    crate::storage::transaction::lock::LockManager::new(cfg)
2139                }),
2140                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2141                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2142                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2143                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2144                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2145                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2146                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2147                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2148                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2149                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2150                    &options,
2151                )),
2152                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2153                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2154                audit_log: {
2155                    // Default audit-log path for the in-memory case
2156                    // sits in the system temp dir; persistent runs
2157                    // place it next to data.rdb.
2158                    //
2159                    // gh-471 iter 2: route through the resolved
2160                    // `LogDestination`. Performance/Max tiers emit a
2161                    // `File(...)` under `<dbname>.rdb.red/logs/`;
2162                    // lower tiers / ephemeral runs report `Stderr`
2163                    // and we keep the legacy file-next-to-data sink.
2164                    let data_path = options
2165                        .data_path
2166                        .clone()
2167                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2168                    let (audit_dest, _) = crate::api::tier_wiring::current_log_destinations();
2169                    Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
2170                        &audit_dest,
2171                        &data_path,
2172                    ))
2173                },
2174                lease_lifecycle: std::sync::OnceLock::new(),
2175                replica_apply_metrics: crate::replication::logical::ReplicaApplyMetrics::default(),
2176                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2177                schema_vocabulary: parking_lot::RwLock::new(
2178                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2179                ),
2180                slow_query_logger: {
2181                    // Issue #205 — slow-query sink lives in the same
2182                    // directory the audit log uses, so backup/restore
2183                    // ships them together. Threshold + sample-pct
2184                    // default conservatively (1 s, 100% sampling) so
2185                    // emitted lines are rare and complete. Operators
2186                    // tune via env / config matrix in a follow-up.
2187                    //
2188                    // gh-471 iter 2: same routing as the audit log —
2189                    // `LogDestination::File(...)` for Performance/Max
2190                    // lands under `<dbname>.rdb.red/logs/slow.log`;
2191                    // lower tiers fall back to `red-slow.log` in the
2192                    // data directory.
2193                    let fallback_dir = options
2194                        .data_path
2195                        .as_ref()
2196                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2197                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2198                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2199                        .ok()
2200                        .and_then(|s| s.parse::<u64>().ok())
2201                        .unwrap_or(1000);
2202                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2203                        .ok()
2204                        .and_then(|s| s.parse::<u8>().ok())
2205                        .unwrap_or(100);
2206                    let (_, slow_dest) = crate::api::tier_wiring::current_log_destinations();
2207                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
2208                        &slow_dest,
2209                        &fallback_dir,
2210                        threshold_ms,
2211                        sample_pct,
2212                    )
2213                },
2214                kv_stats: crate::runtime::KvStatsCounters::default(),
2215                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
2216                metrics_tenant_activity_stats:
2217                    crate::runtime::MetricsTenantActivityCounters::default(),
2218                queue_telemetry: Arc::new(
2219                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
2220                ),
2221                kv_tag_index: crate::runtime::KvTagIndex::default(),
2222                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
2223                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
2224            }),
2225        };
2226
2227        // Issue #205 — install the process-wide OperatorEvent sink so
2228        // emit sites buried in storage / replication / signal handlers
2229        // can record without threading an `&AuditLogger` through every
2230        // call stack. First registration wins; subsequent in-memory
2231        // runtimes (test harnesses) fall through to tracing+eprintln.
2232        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2233            &runtime.inner.audit_log,
2234        ));
2235
2236        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2237        // from the wall-clock captured before storage open. The
2238        // entire `RedDB::open_with_options` call covers both
2239        // auto-restore (when configured) and WAL replay. We
2240        // record both phases against the same boundary today;
2241        // a follow-up will split them once the storage layer
2242        // surfaces a finer-grained event.
2243        runtime
2244            .inner
2245            .lifecycle
2246            .set_restore_started_at_ms(boot_open_start_ms);
2247        runtime
2248            .inner
2249            .lifecycle
2250            .set_restore_ready_at_ms(storage_ready_ms);
2251        runtime
2252            .inner
2253            .lifecycle
2254            .set_wal_replay_started_at_ms(boot_open_start_ms);
2255        runtime
2256            .inner
2257            .lifecycle
2258            .set_wal_replay_ready_at_ms(storage_ready_ms);
2259
2260        let restored_cdc_lsn = runtime
2261            .inner
2262            .db
2263            .replication
2264            .as_ref()
2265            .map(|repl| {
2266                repl.logical_wal_spool
2267                    .as_ref()
2268                    .map(|spool| spool.current_lsn())
2269                    .unwrap_or(0)
2270            })
2271            .unwrap_or(0)
2272            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2273        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2274        runtime.rehydrate_snapshot_xid_floor();
2275        runtime.bootstrap_system_keyed_collections()?;
2276        runtime.rehydrate_declared_column_schemas();
2277        runtime.load_probabilistic_state()?;
2278
2279        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2280        // tables declared via `TENANT BY (col)` survive restart. Each
2281        // entry re-registers the auto-policy and flips RLS on again.
2282        runtime.rehydrate_tenant_tables();
2283        // Issue #593 slice 9a — replay persisted materialized-view
2284        // descriptors so `CREATE MATERIALIZED VIEW v AS …` survives a
2285        // restart. Runs after the system-keyed collections bootstrap
2286        // and before the API opens.
2287        runtime.rehydrate_materialized_view_descriptors();
2288        if let Some(repl) = &runtime.inner.db.replication {
2289            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2290        }
2291
2292        // Save system info to red_config on boot
2293        {
2294            let sys = SystemInfo::collect();
2295            runtime.inner.db.store().set_config_tree(
2296                "red.system",
2297                &crate::serde_json::json!({
2298                    "pid": sys.pid,
2299                    "cpu_cores": sys.cpu_cores,
2300                    "total_memory_bytes": sys.total_memory_bytes,
2301                    "available_memory_bytes": sys.available_memory_bytes,
2302                    "os": sys.os,
2303                    "arch": sys.arch,
2304                    "hostname": sys.hostname,
2305                    "started_at": SystemTime::now()
2306                        .duration_since(UNIX_EPOCH)
2307                        .unwrap_or_default()
2308                        .as_millis() as u64
2309                }),
2310            );
2311
2312            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2313            let store = runtime.inner.db.store();
2314            if store
2315                .get_collection("red_config")
2316                .map(|m| m.query_all(|_| true).len())
2317                .unwrap_or(0)
2318                <= 10
2319            {
2320                store.set_config_tree("red.ai", &crate::json!({
2321                    "default": crate::json!({
2322                        "provider": "openai",
2323                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2324                    }),
2325                    "max_embedding_inputs": 256,
2326                    "max_prompt_batch": 256,
2327                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2328                }));
2329                store.set_config_tree(
2330                    "red.server",
2331                    &crate::json!({
2332                        "max_scan_limit": 1000,
2333                        "max_body_size": 1048576,
2334                        "read_timeout_ms": 5000,
2335                        "write_timeout_ms": 5000
2336                    }),
2337                );
2338                store.set_config_tree(
2339                    "red.storage",
2340                    &crate::json!({
2341                        "page_size": 4096,
2342                        "page_cache_capacity": 100000,
2343                        "auto_checkpoint_pages": 1000,
2344                        "snapshot_retention": 16,
2345                        "verify_checksums": true,
2346                        "segment": crate::json!({
2347                            "max_entities": 100000,
2348                            "max_bytes": 268435456_u64,
2349                            "compression_level": 6
2350                        }),
2351                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2352                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2353                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2354                    }),
2355                );
2356                store.set_config_tree(
2357                    "red.search",
2358                    &crate::json!({
2359                        "rag": crate::json!({
2360                            "max_chunks_per_source": 10,
2361                            "max_total_chunks": 25,
2362                            "similarity_threshold": 0.8,
2363                            "graph_depth": 2,
2364                            "min_relevance": 0.3
2365                        }),
2366                        "fusion": crate::json!({
2367                            "vector_weight": 0.5,
2368                            "graph_weight": 0.3,
2369                            "table_weight": 0.2,
2370                            "dedup_threshold": 0.85
2371                        })
2372                    }),
2373                );
2374                store.set_config_tree(
2375                    "red.auth",
2376                    &crate::json!({
2377                        "enabled": false,
2378                        "session_ttl_secs": 3600,
2379                        "require_auth": false
2380                    }),
2381                );
2382                store.set_config_tree(
2383                    "red.query",
2384                    &crate::json!({
2385                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2386                        "max_recursion_depth": 1000
2387                    }),
2388                );
2389                store.set_config_tree(
2390                    "red.indexes",
2391                    &crate::json!({
2392                        "auto_select": true,
2393                        "bloom_filter": crate::json!({
2394                            "enabled": true,
2395                            "false_positive_rate": 0.01,
2396                            "prune_on_scan": true
2397                        }),
2398                        "hash": crate::json!({ "enabled": true }),
2399                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2400                        "spatial": crate::json!({ "enabled": true })
2401                    }),
2402                );
2403                store.set_config_tree(
2404                    "red.memtable",
2405                    &crate::json!({
2406                        "enabled": true,
2407                        "max_bytes": 67108864_u64,
2408                        "flush_threshold": 0.75
2409                    }),
2410                );
2411                store.set_config_tree(
2412                    "red.probabilistic",
2413                    &crate::json!({
2414                        "hll_registers": 16384,
2415                        "sketch_default_width": 1000,
2416                        "sketch_default_depth": 5,
2417                        "filter_default_capacity": 100000
2418                    }),
2419                );
2420                store.set_config_tree(
2421                    "red.timeseries",
2422                    &crate::json!({
2423                        "default_chunk_size": 1024,
2424                        "compression": crate::json!({
2425                            "timestamps": "delta_of_delta",
2426                            "values": "gorilla_xor"
2427                        }),
2428                        "default_retention_days": 0
2429                    }),
2430                );
2431                store.set_config_tree(
2432                    "red.queue",
2433                    &crate::json!({
2434                        "default_max_size": 0,
2435                        "default_max_attempts": 3,
2436                        "visibility_timeout_ms": 30000,
2437                        "consumer_idle_timeout_ms": 60000
2438                    }),
2439                );
2440                store.set_config_tree(
2441                    "red.backup",
2442                    &crate::json!({
2443                        "enabled": false,
2444                        "interval_secs": 3600,
2445                        "retention_count": 24,
2446                        "upload": false,
2447                        "backend": "local"
2448                    }),
2449                );
2450                store.set_config_tree(
2451                    "red.wal",
2452                    &crate::json!({
2453                        "archive": crate::json!({
2454                            "enabled": false,
2455                            "retention_hours": 168,
2456                            "prefix": "wal/"
2457                        })
2458                    }),
2459                );
2460                store.set_config_tree(
2461                    "red.cdc",
2462                    &crate::json!({
2463                        "enabled": true,
2464                        "buffer_size": 100000
2465                    }),
2466                );
2467                store.set_config_tree(
2468                    "red.config.secret",
2469                    &crate::json!({
2470                        "auto_encrypt": true,
2471                        "auto_decrypt": true
2472                    }),
2473                );
2474            }
2475
2476            // Perf-parity config matrix: heal the Tier A (critical)
2477            // keys unconditionally on every boot. Idempotent — only
2478            // writes the default when the key is missing. Keeps
2479            // `SHOW CONFIG` showing every guarantee the operator has
2480            // (durability.mode, concurrency.locking.enabled, …) even
2481            // on long-running datadirs that predate the matrix.
2482            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2483
2484            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2485            // `storage.btree.lehman_yao` value from the matrix (env
2486            // > file > red_config > default) and publish it to the
2487            // storage layer's atomic so the B-tree read / split
2488            // paths can branch without re-reading the config on
2489            // every hot-path call.
2490            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2491            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2492            if lehman_yao {
2493                tracing::info!(
2494                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2495                );
2496            }
2497
2498            // Config file overlay — mounted `/etc/reddb/config.json`
2499            // (override path via REDDB_CONFIG_FILE). Writes keys with
2500            // write-if-absent semantics so a later user `SET CONFIG`
2501            // always wins. Missing file = silent no-op.
2502            let overlay_path = crate::runtime::config_overlay::config_file_path();
2503            let _ =
2504                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2505        }
2506
2507        // VCS ("Git for Data") — create the `red_*` metadata
2508        // collections on first boot. Idempotent: `get_or_create_collection`
2509        // is a no-op if the collection already exists.
2510        {
2511            let store = runtime.inner.db.store();
2512            for name in crate::application::vcs_collections::ALL {
2513                let _ = store.get_or_create_collection(*name);
2514            }
2515            // Seed VCS config namespace with sensible defaults on first
2516            // boot, matching the pattern used by red.ai / red.storage.
2517            store.set_config_tree(
2518                crate::application::vcs_collections::CONFIG_NAMESPACE,
2519                &crate::json!({
2520                    "default_branch": "main",
2521                    "author": crate::json!({
2522                        "name": "reddb",
2523                        "email": "reddb@localhost"
2524                    }),
2525                    "protected_branches": crate::json!(["main"]),
2526                    "closure": crate::json!({
2527                        "enabled": true,
2528                        "lazy": true
2529                    }),
2530                    "merge": crate::json!({
2531                        "default_strategy": "auto",
2532                        "fast_forward": true
2533                    })
2534                }),
2535            );
2536        }
2537
2538        // Migrations — create the `red_migrations` / `red_migration_deps`
2539        // system collections on first boot. Idempotent.
2540        {
2541            let store = runtime.inner.db.store();
2542            for name in crate::application::migration_collections::ALL {
2543                let _ = store.get_or_create_collection(*name);
2544            }
2545        }
2546
2547        // Start background maintenance thread (context index refresh +
2548        // session purge). Held by a WEAK reference to `RuntimeInner`
2549        // so dropping the last `RedDBRuntime` handle actually releases
2550        // the underlying Arc<Pager> (and its file lock). Polling at
2551        // 200ms means shutdown latency is bounded; the real 60-second
2552        // work cadence is tracked independently via a `last_work`
2553        // timestamp.
2554        //
2555        // The previous version captured `rt = runtime.clone()` by
2556        // strong reference and ran an unterminated `loop`, which held
2557        // Arc<RuntimeInner> forever — reopening a persistent database
2558        // in the same process failed with "Database is locked" because
2559        // the pager could never drop. See the regression test
2560        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2561        {
2562            let weak = Arc::downgrade(&runtime.inner);
2563            std::thread::Builder::new()
2564                .name("reddb-maintenance".into())
2565                .spawn(move || {
2566                    let tick = std::time::Duration::from_millis(200);
2567                    let work_interval = std::time::Duration::from_secs(60);
2568                    let mut last_work = std::time::Instant::now();
2569                    loop {
2570                        std::thread::sleep(tick);
2571                        let Some(inner) = weak.upgrade() else {
2572                            // All strong references dropped — the
2573                            // runtime is gone, exit cleanly.
2574                            break;
2575                        };
2576                        if last_work.elapsed() >= work_interval {
2577                            let _stats = inner.db.store().context_index().stats();
2578                            last_work = std::time::Instant::now();
2579                        }
2580                    }
2581                })
2582                .ok();
2583        }
2584
2585        // Start backup scheduler if enabled via red_config
2586        {
2587            let store = runtime.inner.db.store();
2588            let mut backup_enabled = false;
2589            let mut backup_interval = 3600u64;
2590
2591            if let Some(manager) = store.get_collection("red_config") {
2592                manager.for_each_entity(|entity| {
2593                    if let Some(row) = entity.data.as_row() {
2594                        let key = row.get_field("key").and_then(|v| match v {
2595                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2596                            _ => None,
2597                        });
2598                        let val = row.get_field("value");
2599                        if key == Some("red.config.backup.enabled") {
2600                            backup_enabled = match val {
2601                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2602                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2603                                _ => false,
2604                            };
2605                        } else if key == Some("red.config.backup.interval_secs") {
2606                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2607                                backup_interval = *n as u64;
2608                            }
2609                        }
2610                    }
2611                    true
2612                });
2613            }
2614
2615            if backup_enabled {
2616                runtime.inner.backup_scheduler.set_interval(backup_interval);
2617                let rt = runtime.clone();
2618                runtime
2619                    .inner
2620                    .backup_scheduler
2621                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
2622            }
2623        }
2624
2625        // Load EC registry from red_config and start worker
2626        {
2627            runtime
2628                .inner
2629                .ec_registry
2630                .load_from_config_store(runtime.inner.db.store().as_ref());
2631            if !runtime.inner.ec_registry.async_configs().is_empty() {
2632                runtime.inner.ec_worker.start(
2633                    Arc::clone(&runtime.inner.ec_registry),
2634                    Arc::clone(&runtime.inner.db.store()),
2635                );
2636            }
2637        }
2638
2639        if let crate::replication::ReplicationRole::Replica { primary_addr } =
2640            runtime.inner.db.options().replication.role.clone()
2641        {
2642            let rt = runtime.clone();
2643            std::thread::Builder::new()
2644                .name("reddb-replica".into())
2645                .spawn(move || rt.run_replica_loop(primary_addr))
2646                .ok();
2647        }
2648
2649        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
2650        // boot stage above has completed (WAL replay, restore-from-
2651        // remote, replica-loop spawn). Health probes flip from 503 to
2652        // 200 here; shutdown begins from this state.
2653        runtime.inner.lifecycle.mark_ready();
2654
2655        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
2656        // Low-priority background ticker that drains the cache's
2657        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
2658        // so the thread exits cleanly when the runtime drops (≤50ms
2659        // latency between drop and exit). Materialized views without
2660        // a `REFRESH EVERY` clause stay on the manual-refresh path
2661        // and are skipped by `claim_due_at`, so the loop is a no-op
2662        // when no scheduled views exist.
2663        {
2664            let weak_inner = Arc::downgrade(&runtime.inner);
2665            std::thread::Builder::new()
2666                .name("reddb-mv-scheduler".into())
2667                .spawn(move || loop {
2668                    std::thread::sleep(std::time::Duration::from_millis(50));
2669                    let Some(inner) = weak_inner.upgrade() else {
2670                        break;
2671                    };
2672                    let rt = RedDBRuntime { inner };
2673                    rt.refresh_due_materialized_views();
2674                })
2675                .ok();
2676        }
2677
2678        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
2679        // Low-priority ticker that physically reclaims rows whose
2680        // timestamp has fallen beyond the retention window. Holds a
2681        // `Weak<RuntimeInner>` so the thread exits within one tick of
2682        // the runtime drop (graceful shutdown leaves storage consistent
2683        // because each tick goes through the standard DELETE path —
2684        // there is no half-finished mutation state to clean up). The
        // tick interval (500ms) is intentionally longer than the MV
        // scheduler's (50ms) because retention is order-of-seconds at minimum.
2687        {
2688            let weak_inner = Arc::downgrade(&runtime.inner);
2689            std::thread::Builder::new()
2690                .name("reddb-retention-sweeper".into())
2691                .spawn(move || loop {
2692                    std::thread::sleep(std::time::Duration::from_millis(500));
2693                    let Some(inner) = weak_inner.upgrade() else {
2694                        break;
2695                    };
2696                    let rt = RedDBRuntime { inner };
2697                    rt.sweep_retention_tick(
2698                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
2699                    );
2700                })
2701                .ok();
2702        }
2703
2704        Ok(runtime)
2705    }
2706
2707    fn rehydrate_snapshot_xid_floor(&self) {
2708        let store = self.inner.db.store();
2709        for collection in store.list_collections() {
2710            let Some(manager) = store.get_collection(&collection) else {
2711                continue;
2712            };
2713            for entity in manager.query_all(|_| true) {
2714                self.inner
2715                    .snapshot_manager
2716                    .observe_committed_xid(entity.xmin);
2717                self.inner
2718                    .snapshot_manager
2719                    .observe_committed_xid(entity.xmax);
2720            }
2721        }
2722    }
2723
2724    /// Provision an empty Table-shaped collection that backs a
2725    /// `CREATE MATERIALIZED VIEW v` (issue #594 slice 9b of #575).
2726    /// `SELECT FROM v` reads this collection directly; the rewriter is
2727    /// configured to skip materialized views so the body is no longer
2728    /// substituted. REFRESH still writes to the cache slot — wiring it
2729    /// into this backing collection is the job of slice 9c.
2730    ///
2731    /// Idempotent: re-running for the same name leaves the existing
2732    /// collection in place (mirrors `CREATE TABLE IF NOT EXISTS`
2733    /// semantics). This keeps `CREATE OR REPLACE MATERIALIZED VIEW v`
2734    /// cheap — the body change does not invalidate already-buffered
2735    /// rows. Until 9c lands the backing is always empty anyway.
2736    pub(crate) fn ensure_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
2737        let store = self.inner.db.store();
2738        let mut changed = false;
2739        if store.get_collection(name).is_none() {
2740            store.get_or_create_collection(name);
2741            changed = true;
2742        }
2743        if self.inner.db.collection_contract(name).is_none() {
2744            self.inner
2745                .db
2746                .save_collection_contract(system_keyed_collection_contract(
2747                    name,
2748                    crate::catalog::CollectionModel::Table,
2749                ))
2750                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2751            changed = true;
2752        }
2753        if changed {
2754            self.inner
2755                .db
2756                .persist_metadata()
2757                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2758        }
2759        Ok(())
2760    }
2761
2762    /// Inverse of [`ensure_materialized_view_backing`] — drops the
2763    /// backing collection on `DROP MATERIALIZED VIEW v`. No-op when
2764    /// the collection was never created (e.g. a `DROP MATERIALIZED
2765    /// VIEW IF EXISTS v` against an unknown name).
2766    pub(crate) fn drop_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
2767        let store = self.inner.db.store();
2768        if store.get_collection(name).is_none() {
2769            return Ok(());
2770        }
2771        store
2772            .drop_collection(name)
2773            .map_err(|err| RedDBError::Internal(err.to_string()))?;
2774        // The contract may have been dropped already (DROP TABLE path)
2775        // — ignore "not found" errors by checking presence first.
2776        if self.inner.db.collection_contract(name).is_some() {
2777            self.inner
2778                .db
2779                .remove_collection_contract(name)
2780                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2781        }
2782        self.invalidate_result_cache();
2783        self.inner
2784            .db
2785            .persist_metadata()
2786            .map_err(|err| RedDBError::Internal(err.to_string()))?;
2787        Ok(())
2788    }
2789
2790    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
2791        let mut changed = false;
2792        for (name, model) in [
2793            ("red.config", crate::catalog::CollectionModel::Config),
2794            ("red.vault", crate::catalog::CollectionModel::Vault),
2795            // Issue #593 — materialized-view catalog. One row per
2796            // `CREATE MATERIALIZED VIEW`; rehydrated at boot before
2797            // the API opens.
2798            (
2799                crate::runtime::continuous_materialized_view::CATALOG_COLLECTION,
2800                crate::catalog::CollectionModel::Config,
2801            ),
2802        ] {
2803            if self.inner.db.store().get_collection(name).is_none() {
2804                self.inner.db.store().get_or_create_collection(name);
2805                changed = true;
2806            }
2807            if self.inner.db.collection_contract(name).is_none() {
2808                self.inner
2809                    .db
2810                    .save_collection_contract(system_keyed_collection_contract(name, model))
2811                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
2812                changed = true;
2813            }
2814        }
2815        if changed {
2816            self.inner
2817                .db
2818                .persist_metadata()
2819                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2820        }
2821        Ok(())
2822    }
2823
2824    pub fn db(&self) -> Arc<RedDB> {
2825        Arc::clone(&self.inner.db)
2826    }
2827
2828    /// Direct access to the runtime's secondary-index store.
2829    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
2830    /// wire bulk) that need to push new rows through the per-index
2831    /// maintenance hook after `store.bulk_insert` returns.
2832    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
2833        &self.inner.index_store
2834    }
2835
2836    /// Apply a DDL event to the schema-vocabulary reverse index
2837    /// (issue #120). Called by DDL execution paths after the catalog
2838    /// mutation has succeeded so the index never holds entries for
2839    /// half-applied DDL.
2840    pub(crate) fn schema_vocabulary_apply(
2841        &self,
2842        event: crate::runtime::schema_vocabulary::DdlEvent,
2843    ) {
2844        self.inner.schema_vocabulary.write().on_ddl(event);
2845    }
2846
2847    /// Lookup `token` in the schema-vocabulary reverse index. Returns
2848    /// an owned `Vec<VocabHit>` because the underlying read lock
2849    /// cannot be borrowed across the call boundary; the slice from
2850    /// `SchemaVocabulary::lookup` is cloned per hit.
2851    pub fn schema_vocabulary_lookup(
2852        &self,
2853        token: &str,
2854    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
2855        self.inner.schema_vocabulary.read().lookup(token).to_vec()
2856    }
2857
2858    /// Inject an AuthStore into the runtime. Called by server boot
2859    /// after the vault has been bootstrapped, so that `Value::Secret`
2860    /// auto-encrypt/decrypt can reach the vault AES key.
2861    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
2862        *self.inner.auth_store.write() = Some(store);
2863    }
2864
2865    /// Snapshot the current AuthStore (if any). Used by the wire listener
2866    /// to validate bearer tokens issued via HTTP `/auth/login`.
2867    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
2868        self.inner.auth_store.read().clone()
2869    }
2870
2871    /// Read a vault KV secret from the configured AuthStore, if present.
2872    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
2873        self.inner
2874            .auth_store
2875            .read()
2876            .as_ref()
2877            .and_then(|store| store.vault_kv_get(key))
2878    }
2879
2880    /// Write a vault KV secret and fail if the encrypted vault write is
2881    /// unavailable or cannot be made durable.
2882    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
2883        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
2884            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
2885        })?;
2886        store
2887            .vault_kv_try_set(key, value)
2888            .map_err(|err| RedDBError::Query(err.to_string()))
2889    }
2890
2891    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
2892    /// wire transports try OAuth JWT validation before falling back to
2893    /// the local AuthStore lookup. Pass `None` to disable.
2894    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
2895        *self.inner.oauth_validator.write() = validator;
2896    }
2897
2898    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
2899    /// Hot path: called per HTTP request when an Authorization header
2900    /// is present, so we hand back a cheap Arc clone.
2901    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
2902        self.inner.oauth_validator.read().clone()
2903    }
2904
2905    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
2906    /// store is wired and a key has been generated. Used by the
2907    /// `Value::Secret` encrypt/decrypt pipeline.
2908    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
2909        let guard = self.inner.auth_store.read();
2910        guard.as_ref().and_then(|s| s.vault_secret_key())
2911    }
2912
2913    /// Resolve a boolean flag from `red_config`. Defaults to `default`
2914    /// when the key is missing or not coercible. If the same key has
2915    /// been written multiple times (SET CONFIG appends new rows), the
2916    /// most recent entity wins. Env-var overrides
2917    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
2918    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
2919        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2920            if let Some(crate::storage::schema::Value::Boolean(b)) =
2921                crate::runtime::config_overlay::coerce_env_value(key, raw)
2922            {
2923                return b;
2924            }
2925        }
2926        let store = self.inner.db.store();
2927        let Some(manager) = store.get_collection("red_config") else {
2928            return default;
2929        };
2930        let mut result = default;
2931        let mut latest_id: u64 = 0;
2932        manager.for_each_entity(|entity| {
2933            if let Some(row) = entity.data.as_row() {
2934                let entry_key = row.get_field("key").and_then(|v| match v {
2935                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2936                    _ => None,
2937                });
2938                if entry_key == Some(key) {
2939                    let id = entity.id.raw();
2940                    if id >= latest_id {
2941                        latest_id = id;
2942                        result = match row.get_field("value") {
2943                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
2944                            Some(crate::storage::schema::Value::Text(s)) => {
2945                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
2946                            }
2947                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
2948                            _ => default,
2949                        };
2950                    }
2951                }
2952            }
2953            true
2954        });
2955        result
2956    }
2957
2958    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
2959        if let Some(raw) = self.inner.env_config_overrides.get(key) {
2960            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
2961                crate::runtime::config_overlay::coerce_env_value(key, raw)
2962            {
2963                return n;
2964            }
2965        }
2966        let store = self.inner.db.store();
2967        let Some(manager) = store.get_collection("red_config") else {
2968            return default;
2969        };
2970        let mut result = default;
2971        let mut latest_id: u64 = 0;
2972        manager.for_each_entity(|entity| {
2973            if let Some(row) = entity.data.as_row() {
2974                let entry_key = row.get_field("key").and_then(|v| match v {
2975                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2976                    _ => None,
2977                });
2978                if entry_key == Some(key) {
2979                    let id = entity.id.raw();
2980                    if id >= latest_id {
2981                        latest_id = id;
2982                        result = match row.get_field("value") {
2983                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
2984                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
2985                            Some(crate::storage::schema::Value::Text(s)) => {
2986                                s.parse::<u64>().unwrap_or(default)
2987                            }
2988                            _ => default,
2989                        };
2990                    }
2991                }
2992            }
2993            true
2994        });
2995        result
2996    }
2997
2998    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
2999        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3000            if let Ok(n) = raw.parse::<f64>() {
3001                return n;
3002            }
3003        }
3004        let store = self.inner.db.store();
3005        let Some(manager) = store.get_collection("red_config") else {
3006            return default;
3007        };
3008        let mut result = default;
3009        let mut latest_id: u64 = 0;
3010        manager.for_each_entity(|entity| {
3011            if let Some(row) = entity.data.as_row() {
3012                let entry_key = row.get_field("key").and_then(|v| match v {
3013                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3014                    _ => None,
3015                });
3016                if entry_key == Some(key) {
3017                    let id = entity.id.raw();
3018                    if id >= latest_id {
3019                        latest_id = id;
3020                        result = match row.get_field("value") {
3021                            Some(crate::storage::schema::Value::Float(n)) => *n,
3022                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
3023                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
3024                            Some(crate::storage::schema::Value::Text(s)) => {
3025                                s.parse::<f64>().unwrap_or(default)
3026                            }
3027                            _ => default,
3028                        };
3029                    }
3030                }
3031            }
3032            true
3033        });
3034        result
3035    }
3036
3037    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
3038        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3039            return raw.clone();
3040        }
3041        let store = self.inner.db.store();
3042        let Some(manager) = store.get_collection("red_config") else {
3043            return default.to_string();
3044        };
3045        let mut result = default.to_string();
3046        let mut latest_id: u64 = 0;
3047        manager.for_each_entity(|entity| {
3048            if let Some(row) = entity.data.as_row() {
3049                let entry_key = row.get_field("key").and_then(|v| match v {
3050                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3051                    _ => None,
3052                });
3053                if entry_key == Some(key) {
3054                    let id = entity.id.raw();
3055                    if id >= latest_id {
3056                        latest_id = id;
3057                        if let Some(crate::storage::schema::Value::Text(value)) =
3058                            row.get_field("value")
3059                        {
3060                            result = value.to_string();
3061                        }
3062                    }
3063                }
3064            }
3065            true
3066        });
3067        result
3068    }
3069
3070    fn latest_metadata_for(
3071        &self,
3072        collection: &str,
3073        entity_id: u64,
3074    ) -> Option<crate::serde_json::Value> {
3075        self.inner
3076            .db
3077            .store()
3078            .get_metadata(collection, EntityId::new(entity_id))
3079            .map(|metadata| metadata_to_json(&metadata))
3080    }
3081
3082    fn persist_replica_lsn(&self, lsn: u64) {
3083        self.inner.db.store().set_config_tree(
3084            "red.replication",
3085            &crate::json!({
3086                "last_applied_lsn": lsn
3087            }),
3088        );
3089    }
3090
3091    fn persist_replication_health(
3092        &self,
3093        state: &str,
3094        last_error: &str,
3095        primary_lsn: Option<u64>,
3096        oldest_available_lsn: Option<u64>,
3097    ) {
3098        self.inner.db.store().set_config_tree(
3099            "red.replication",
3100            &crate::json!({
3101                "state": state,
3102                "last_error": last_error,
3103                "last_seen_primary_lsn": primary_lsn.unwrap_or(0),
3104                "last_seen_oldest_lsn": oldest_available_lsn.unwrap_or(0),
3105                "updated_at_unix_ms": SystemTime::now()
3106                    .duration_since(UNIX_EPOCH)
3107                    .unwrap_or_default()
3108                    .as_millis() as u64
3109            }),
3110        );
3111    }
3112
3113    /// Whether `SECRET('...')` literals should be encrypted with the
3114    /// vault AES key on INSERT. Default `true`.
3115    pub(crate) fn secret_auto_encrypt(&self) -> bool {
3116        self.config_bool("red.config.secret.auto_encrypt", true)
3117    }
3118
3119    /// Whether `Value::Secret` columns should be decrypted back to
3120    /// plaintext on SELECT when the vault is unsealed. Default `true`.
3121    /// Turning this off keeps secrets masked as `***` even while the
3122    /// vault is open — useful for audit trails or read-only exports.
3123    pub(crate) fn secret_auto_decrypt(&self) -> bool {
3124        self.config_bool("red.config.secret.auto_decrypt", true)
3125    }
3126
3127    /// Walk every record in `result` and swap `Value::Secret(bytes)`
3128    /// for the decrypted plaintext when the runtime has the vault
3129    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
3130    /// key is missing, the vault is sealed, or auto_decrypt is off,
3131    /// secrets are left as `Value::Secret` which every formatter
3132    /// (Display, JSON) already masks as `***`.
3133    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
3134        if !self.secret_auto_decrypt() {
3135            return;
3136        }
3137        let Some(key) = self.secret_aes_key() else {
3138            return;
3139        };
3140        for record in result.result.records.iter_mut() {
3141            for value in record.values_mut() {
3142                if let Value::Secret(ref bytes) = value {
3143                    if let Some(plain) =
3144                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
3145                    {
3146                        if let Ok(text) = String::from_utf8(plain) {
3147                            *value = Value::text(text);
3148                        }
3149                    }
3150                }
3151            }
3152        }
3153    }
3154
3155    /// Emit a CDC change event and replicate to WAL buffer.
3156    /// Create a `MutationEngine` bound to this runtime.
3157    ///
3158    /// The engine is cheap to construct (no allocation) and should be
3159    /// dropped after `apply` returns. Use this from application-layer
3160    /// `create_row` / `create_rows_batch` instead of calling
3161    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
3162    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
3163        crate::runtime::mutation::MutationEngine::new(self)
3164    }
3165
3166    /// Public-mutation gate snapshot (PLAN.md W1).
3167    ///
3168    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
3169    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
3170    /// maintenance, serverless lifecycle) call `check_write` before
3171    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
3172    /// instance running as a replica or with `options.read_only =
3173    /// true`. The replica internal logical-WAL apply path reaches into
3174    /// the store directly and never calls this method, so legitimate
3175    /// replica catch-up still works.
3176    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
3177        self.inner.write_gate.check(kind)
3178    }
3179
3180    /// Read-only handle to the gate, useful for transports that want
3181    /// to surface the policy in health/status output without taking on
3182    /// a dependency on the concrete enum.
3183    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
3184        &self.inner.write_gate
3185    }
3186
3187    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
3188    /// admin/shutdown, and signal handlers consult this single
3189    /// state machine.
3190    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
3191        &self.inner.lifecycle
3192    }
3193
3194    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
3195    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
3196        &self.inner.resource_limits
3197    }
3198
3199    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
3200    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
3201        &self.inner.audit_log
3202    }
3203
3204    /// Shared `Arc` to the audit logger — used by collaborators (the
3205    /// lease lifecycle, future request-context plumbing) that need to
3206    /// keep the logger alive past the runtime's stack frame.
3207    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
3208        Arc::clone(&self.inner.audit_log)
3209    }
3210
3211    /// Slice 10 of issue #527 — shared queue telemetry counters
3212    /// (delivered/acked/nacked). Cloned by `queue_delivery.rs` on
3213    /// each transition.
3214    pub(crate) fn queue_telemetry(
3215        &self,
3216    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
3217        &self.inner.queue_telemetry
3218    }
3219
3220    /// Snapshots of the queue telemetry counters in label-deterministic
3221    /// order for `/metrics` rendering and the integration test.
3222    pub fn queue_telemetry_snapshot(
3223        &self,
3224    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3225        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3226            delivered: self.inner.queue_telemetry.delivered_snapshot(),
3227            acked: self.inner.queue_telemetry.acked_snapshot(),
3228            nacked: self.inner.queue_telemetry.nacked_snapshot(),
3229        }
3230    }
3231
3232    /// Slice 10 of issue #527 — render-time scan of pending entries
3233    /// per (queue, group) for the `queue_pending_gauge` exposition.
3234    /// Walks `red_queue_meta` live so the gauge cannot drift from
3235    /// the source of truth.
3236    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
3237        let store = self.inner.db.store();
3238        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
3239            .into_iter()
3240            .collect()
3241    }
3242
3243    /// Shared `Arc` to the write gate. Same rationale as
3244    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
3245    /// thread) need a clone-cheap handle they can move into a
3246    /// background thread.
3247    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
3248        Arc::clone(&self.inner.write_gate)
3249    }
3250
3251    /// Serverless writer-lease state machine. `None` when the operator
3252    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
3253    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3254        self.inner.lease_lifecycle.get()
3255    }
3256
3257    /// Install the lease lifecycle. Idempotent; subsequent calls
3258    /// return the previously stored value untouched.
3259    pub fn set_lease_lifecycle(
3260        &self,
3261        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
3262    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3263        self.inner.lease_lifecycle.set(lifecycle)
3264    }
3265
3266    /// Reject the call when the requested batch size exceeds
3267    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
3268    /// shaped so the HTTP layer can map it to 413 Payload Too
3269    /// Large (PLAN.md Phase 4.1).
3270    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
3271        if self.inner.resource_limits.batch_size_exceeded(requested) {
3272            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
3273            return Err(RedDBError::QuotaExceeded(format!(
3274                "max_batch_size:{requested}:{max}"
3275            )));
3276        }
3277        Ok(())
3278    }
3279
3280    /// Reject the call when the local DB file exceeds
3281    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
3282    /// the cost is a single `stat()` syscall, negligible against the
3283    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
3284    /// for HTTP 507 Insufficient Storage.
3285    pub fn check_db_size(&self) -> RedDBResult<()> {
3286        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
3287            return Ok(());
3288        };
3289        if limit == 0 {
3290            return Ok(());
3291        }
3292        let Some(path) = self.inner.db.path() else {
3293            return Ok(());
3294        };
3295        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
3296        if current > limit {
3297            return Err(RedDBError::QuotaExceeded(format!(
3298                "max_db_size_bytes:{current}:{limit}"
3299            )));
3300        }
3301        Ok(())
3302    }
3303
3304    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
3305    ///
3306    /// Steps, in order, all idempotent across re-entrant calls:
3307    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
3308    ///      observe `Stopped` after first finishes).
3309    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
3310    ///      every acked write is durable on disk.
3311    ///   3. If `backup_on_shutdown == true` and a remote backend is
3312    ///      configured, run a synchronous `trigger_backup()` so the
3313    ///      remote head reflects the final state.
3314    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
3315    ///      return the cached report without re-running anything.
3316    ///
3317    /// On any error, the runtime is still marked `Stopped` so the
3318    /// process can exit; the caller logs the error context but does
3319    /// not retry the same shutdown — the operator can inspect the
3320    /// report fields to see which step failed.
3321    pub fn graceful_shutdown(
3322        &self,
3323        backup_on_shutdown: bool,
3324    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
3325        if !self.inner.lifecycle.begin_shutdown() {
3326            // Someone else already shut down (or is in flight). Return
3327            // the cached report so the HTTP caller and SIGTERM handler
3328            // get the same idempotent answer.
3329            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
3330        }
3331
3332        let started_ms = std::time::SystemTime::now()
3333            .duration_since(std::time::UNIX_EPOCH)
3334            .map(|d| d.as_millis() as u64)
3335            .unwrap_or(0);
3336        let mut report = crate::runtime::lifecycle::ShutdownReport {
3337            started_at_ms: started_ms,
3338            ..Default::default()
3339        };
3340
3341        // Flush WAL + run any pending checkpoint. Local fsync is
3342        // unconditional — even a lease-lost replica needs its WAL on
3343        // disk before exit so a future restore has the latest tail.
3344        // The remote upload is gated separately so a lost-lease writer
3345        // doesn't clobber the new holder's state on its way out.
3346        let flush_res = self.inner.db.flush_local_only();
3347        report.flushed_wal = flush_res.is_ok();
3348        report.final_checkpoint = flush_res.is_ok();
3349        if let Err(err) = &flush_res {
3350            tracing::error!(
3351                target: "reddb::lifecycle",
3352                error = %err,
3353                "graceful_shutdown: local flush failed"
3354            );
3355        } else if let Err(lease_err) =
3356            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
3357        {
3358            tracing::warn!(
3359                target: "reddb::serverless::lease",
3360                error = %lease_err,
3361                "graceful_shutdown: remote upload skipped — lease not held"
3362            );
3363        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
3364            tracing::error!(
3365                target: "reddb::lifecycle",
3366                error = %err,
3367                "graceful_shutdown: remote upload failed"
3368            );
3369        }
3370
3371        // Optional final backup. Skipped silently when no remote
3372        // backend is configured — `trigger_backup()` returns Err
3373        // anyway in that case, but logging it as a shutdown failure
3374        // would be misleading on a standalone (no-backend) runtime.
3375        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
3376            // The trigger_backup gate now reads `WriteKind::Backup`,
3377            // which a replica/read_only instance refuses. That's
3378            // intentional — replicas don't drive backups; only the
3379            // primary does. We still want shutdown to flush its WAL
3380            // even if the backup branch is gated off.
3381            match self.trigger_backup() {
3382                Ok(result) => {
3383                    report.backup_uploaded = result.uploaded;
3384                }
3385                Err(err) => {
3386                    tracing::warn!(
3387                        target: "reddb::lifecycle",
3388                        error = %err,
3389                        "graceful_shutdown: final backup skipped"
3390                    );
3391                }
3392            }
3393        }
3394
3395        let completed_ms = std::time::SystemTime::now()
3396            .duration_since(std::time::UNIX_EPOCH)
3397            .map(|d| d.as_millis() as u64)
3398            .unwrap_or(started_ms);
3399        report.completed_at_ms = completed_ms;
3400        report.duration_ms = completed_ms.saturating_sub(started_ms);
3401
3402        self.inner.lifecycle.finish_shutdown(report.clone());
3403        Ok(report)
3404    }
3405
3406    /// Emit a CDC record without invalidating the result cache.
3407    ///
3408    /// Used by `MutationEngine::append_batch` which calls
3409    /// `invalidate_result_cache` once for the whole batch before this
3410    /// loop, avoiding N write-lock acquisitions.
3411    pub(crate) fn cdc_emit_no_cache_invalidate(
3412        &self,
3413        operation: crate::replication::cdc::ChangeOperation,
3414        collection: &str,
3415        entity_id: u64,
3416        entity_kind: &str,
3417    ) -> u64 {
3418        let lsn = self
3419            .inner
3420            .cdc
3421            .emit(operation, collection, entity_id, entity_kind);
3422
3423        // Append to logical WAL replication buffer (if primary mode)
3424        if let Some(ref primary) = self.inner.db.replication {
3425            let store = self.inner.db.store();
3426            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3427                None
3428            } else {
3429                store.get(collection, EntityId::new(entity_id))
3430            };
3431            let record = ChangeRecord {
3432                lsn,
3433                timestamp: SystemTime::now()
3434                    .duration_since(UNIX_EPOCH)
3435                    .unwrap_or_default()
3436                    .as_millis() as u64,
3437                operation,
3438                collection: collection.to_string(),
3439                entity_id,
3440                entity_kind: entity_kind.to_string(),
3441                entity_bytes: entity
3442                    .as_ref()
3443                    .map(|e| UnifiedStore::serialize_entity(e, store.format_version())),
3444                metadata: self.latest_metadata_for(collection, entity_id),
3445                refresh_records: None,
3446            };
3447            let encoded = record.encode();
3448            primary.wal_buffer.append(record.lsn, encoded.clone());
3449            if let Some(spool) = &primary.logical_wal_spool {
3450                let _ = spool.append(record.lsn, &encoded);
3451            }
3452        }
3453        lsn
3454    }
3455
3456    pub(crate) fn cdc_emit_insert_batch_no_cache_invalidate(
3457        &self,
3458        collection: &str,
3459        ids: &[EntityId],
3460        entity_kind: &str,
3461    ) -> Vec<u64> {
3462        if ids.is_empty() {
3463            return Vec::new();
3464        }
3465
3466        // Without logical replication, CDC only needs the in-memory event
3467        // ring. Reserve all LSNs and push the batch under one mutex instead
3468        // of taking the ring lock once per inserted row.
3469        if self.inner.db.replication.is_none() {
3470            return self.inner.cdc.emit_batch_same_collection(
3471                crate::replication::cdc::ChangeOperation::Insert,
3472                collection,
3473                entity_kind,
3474                ids.iter().map(|id| id.raw()),
3475            );
3476        }
3477
3478        // Replication needs one logical-WAL record per entity with the
3479        // serialized entity bytes, so keep the existing per-row path.
3480        ids.iter()
3481            .map(|id| {
3482                self.cdc_emit_no_cache_invalidate(
3483                    crate::replication::cdc::ChangeOperation::Insert,
3484                    collection,
3485                    id.raw(),
3486                    entity_kind,
3487                )
3488            })
3489            .collect()
3490    }
3491
3492    pub fn cdc_emit(
3493        &self,
3494        operation: crate::replication::cdc::ChangeOperation,
3495        collection: &str,
3496        entity_id: u64,
3497        entity_kind: &str,
3498    ) -> u64 {
3499        let lsn = self
3500            .inner
3501            .cdc
3502            .emit(operation, collection, entity_id, entity_kind);
3503        // Perf: prior to this we called `invalidate_result_cache()`
3504        // which wipes EVERY cached query, across every table, under
3505        // a write lock — turning each INSERT into a serialisation
3506        // point for all readers. Swap to the per-table variant so
3507        // unrelated query caches survive.
3508        self.invalidate_result_cache_for_table(collection);
3509
3510        // Append to logical WAL replication buffer (if primary mode)
3511        if let Some(ref primary) = self.inner.db.replication {
3512            let store = self.inner.db.store();
3513            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3514                None
3515            } else {
3516                store.get(collection, EntityId::new(entity_id))
3517            };
3518            let record = ChangeRecord {
3519                lsn,
3520                timestamp: SystemTime::now()
3521                    .duration_since(UNIX_EPOCH)
3522                    .unwrap_or_default()
3523                    .as_millis() as u64,
3524                operation,
3525                collection: collection.to_string(),
3526                entity_id,
3527                entity_kind: entity_kind.to_string(),
3528                entity_bytes: entity
3529                    .as_ref()
3530                    .map(|entity| UnifiedStore::serialize_entity(entity, store.format_version())),
3531                metadata: self.latest_metadata_for(collection, entity_id),
3532                refresh_records: None,
3533            };
3534            let encoded = record.encode();
3535            primary.wal_buffer.append(record.lsn, encoded.clone());
3536            if let Some(spool) = &primary.logical_wal_spool {
3537                let _ = spool.append(record.lsn, &encoded);
3538            }
3539        }
3540        lsn
3541    }
3542
3543    pub(crate) fn cdc_emit_kv(
3544        &self,
3545        operation: crate::replication::cdc::ChangeOperation,
3546        collection: &str,
3547        key: &str,
3548        entity_id: u64,
3549        before: Option<crate::json::Value>,
3550        after: Option<crate::json::Value>,
3551    ) -> u64 {
3552        let lsn = self
3553            .inner
3554            .cdc
3555            .emit_kv(operation, collection, key, entity_id, before, after);
3556        self.inner.kv_stats.incr_watch_events_emitted();
3557        self.invalidate_result_cache_for_table(collection);
3558        lsn
3559    }
3560
3561    pub(crate) fn record_kv_watch_event(
3562        &self,
3563        operation: crate::replication::cdc::ChangeOperation,
3564        collection: &str,
3565        key: &str,
3566        entity_id: u64,
3567        before: Option<crate::json::Value>,
3568        after: Option<crate::json::Value>,
3569    ) {
3570        if self.current_xid().is_some() {
3571            let conn_id = current_connection_id();
3572            let event = crate::replication::cdc::KvWatchEvent {
3573                collection: collection.to_string(),
3574                key: key.to_string(),
3575                op: operation,
3576                before,
3577                after,
3578                lsn: 0,
3579                committed_at: 0,
3580                dropped_event_count: 0,
3581            };
3582            self.inner
3583                .pending_kv_watch_events
3584                .write()
3585                .entry(conn_id)
3586                .or_default()
3587                .push(event);
3588            return;
3589        }
3590
3591        self.cdc_emit_kv(operation, collection, key, entity_id, before, after);
3592    }
3593
3594    pub(crate) fn cdc_emit_prebuilt(
3595        &self,
3596        operation: crate::replication::cdc::ChangeOperation,
3597        collection: &str,
3598        entity: &UnifiedEntity,
3599        entity_kind: &str,
3600        metadata: Option<&crate::storage::Metadata>,
3601        invalidate_cache: bool,
3602    ) -> u64 {
3603        self.cdc_emit_prebuilt_with_columns(
3604            operation,
3605            collection,
3606            entity,
3607            entity_kind,
3608            metadata,
3609            invalidate_cache,
3610            None,
3611        )
3612    }
3613
3614    /// `cdc_emit_prebuilt` plus the list of column names whose values
3615    /// changed on this update. Callers that have already computed a
3616    /// `RowDamageVector` pass it here so downstream CDC consumers can
3617    /// filter events by touched column without re-diffing.
3618    /// `changed_columns` is only meaningful for `Update` operations —
3619    /// insert and delete events ignore it.
3620    pub(crate) fn cdc_emit_prebuilt_with_columns(
3621        &self,
3622        operation: crate::replication::cdc::ChangeOperation,
3623        collection: &str,
3624        entity: &UnifiedEntity,
3625        entity_kind: &str,
3626        metadata: Option<&crate::storage::Metadata>,
3627        invalidate_cache: bool,
3628        changed_columns: Option<Vec<String>>,
3629    ) -> u64 {
3630        if invalidate_cache {
3631            self.invalidate_result_cache();
3632        }
3633
3634        let public_id = entity.logical_id().raw();
3635        let lsn = self.inner.cdc.emit_with_columns(
3636            operation,
3637            collection,
3638            public_id,
3639            entity_kind,
3640            changed_columns,
3641        );
3642
3643        if let Some(ref primary) = self.inner.db.replication {
3644            let store = self.inner.db.store();
3645            let record = ChangeRecord {
3646                lsn,
3647                timestamp: SystemTime::now()
3648                    .duration_since(UNIX_EPOCH)
3649                    .unwrap_or_default()
3650                    .as_millis() as u64,
3651                operation,
3652                collection: collection.to_string(),
3653                entity_id: entity.id.raw(),
3654                entity_kind: entity_kind.to_string(),
3655                entity_bytes: Some(UnifiedStore::serialize_entity(
3656                    entity,
3657                    store.format_version(),
3658                )),
3659                metadata: metadata
3660                    .map(metadata_to_json)
3661                    .or_else(|| self.latest_metadata_for(collection, entity.id.raw())),
3662                refresh_records: None,
3663            };
3664            let encoded = record.encode();
3665            primary.wal_buffer.append(record.lsn, encoded.clone());
3666            if let Some(spool) = &primary.logical_wal_spool {
3667                let _ = spool.append(record.lsn, &encoded);
3668            }
3669        }
3670
3671        lsn
3672    }
3673
3674    pub(crate) fn cdc_emit_prebuilt_batch<'a, I>(
3675        &self,
3676        operation: crate::replication::cdc::ChangeOperation,
3677        entity_kind: &str,
3678        items: I,
3679        invalidate_cache: bool,
3680    ) where
3681        I: IntoIterator<
3682            Item = (
3683                &'a str,
3684                &'a UnifiedEntity,
3685                Option<&'a crate::storage::Metadata>,
3686            ),
3687        >,
3688    {
3689        let items: Vec<(&str, &UnifiedEntity, Option<&crate::storage::Metadata>)> =
3690            items.into_iter().collect();
3691        if items.is_empty() {
3692            return;
3693        }
3694
3695        if invalidate_cache {
3696            self.invalidate_result_cache();
3697        }
3698
3699        for (collection, entity, metadata) in items {
3700            self.cdc_emit_prebuilt(operation, collection, entity, entity_kind, metadata, false);
3701        }
3702    }
3703
3704    fn run_replica_loop(&self, primary_addr: String) {
3705        let endpoint = if primary_addr.starts_with("http") {
3706            primary_addr
3707        } else {
3708            format!("http://{primary_addr}")
3709        };
3710        let poll_ms = self.inner.db.options().replication.poll_interval_ms;
3711        let max_count = self.inner.db.options().replication.max_batch_size;
3712        let mut since_lsn = self.config_u64("red.replication.last_applied_lsn", 0);
3713
3714        let runtime = match tokio::runtime::Builder::new_current_thread()
3715            .enable_all()
3716            .build()
3717        {
3718            Ok(runtime) => runtime,
3719            Err(_) => return,
3720        };
3721
3722        runtime.block_on(async move {
3723            use crate::grpc::proto::red_db_client::RedDbClient;
3724            use crate::grpc::proto::JsonPayloadRequest;
3725
3726            let mut client = loop {
3727                match RedDbClient::connect(endpoint.clone()).await {
3728                    Ok(client) => {
3729                        self.persist_replication_health("connecting", "", None, None);
3730                        break client;
3731                    }
3732                    Err(_) => {
3733                        self.persist_replication_health(
3734                            "connecting",
3735                            "waiting for primary connection",
3736                            None,
3737                            None,
3738                        );
3739                        std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)))
3740                    }
3741                }
3742            };
3743
3744            // PLAN.md Phase 11.5 — stateful applier guards LSN
3745            // monotonicity across pulls. Seed with the persisted
3746            // `last_applied_lsn` so reboots don't lose the chain
3747            // pointer.
3748            let applier = crate::replication::logical::LogicalChangeApplier::new(since_lsn);
3749
3750            loop {
3751                let payload = crate::json!({
3752                    "since_lsn": since_lsn,
3753                    "max_count": max_count
3754                });
3755                let request = tonic::Request::new(JsonPayloadRequest {
3756                    payload_json: crate::json::to_string(&payload)
3757                        .unwrap_or_else(|_| "{}".to_string()),
3758                });
3759
3760                if let Ok(response) = client.pull_wal_records(request).await {
3761                    if let Ok(value) =
3762                        crate::json::from_str::<crate::json::Value>(&response.into_inner().payload)
3763                    {
3764                        let current_lsn =
3765                            value.get("current_lsn").and_then(crate::json::Value::as_u64);
3766                        let oldest_available_lsn = value
3767                            .get("oldest_available_lsn")
3768                            .and_then(crate::json::Value::as_u64);
3769                        if since_lsn > 0
3770                            && oldest_available_lsn
3771                                .map(|oldest| oldest > since_lsn.saturating_add(1))
3772                                .unwrap_or(false)
3773                        {
3774                            self.persist_replication_health(
3775                                "stalled_gap",
3776                                "replica is behind the oldest logical WAL available on primary; re-bootstrap required",
3777                                current_lsn,
3778                                oldest_available_lsn,
3779                            );
3780                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
3781                            continue;
3782                        }
3783                        if let Some(records) =
3784                            value.get("records").and_then(crate::json::Value::as_array)
3785                        {
3786                            for record in records {
3787                                let Some(data_hex) =
3788                                    record.get("data").and_then(crate::json::Value::as_str)
3789                                else {
3790                                    continue;
3791                                };
3792                                let Ok(data) = hex::decode(data_hex) else {
3793                                    self.inner.replica_apply_metrics.record(
3794                                        crate::replication::logical::ApplyErrorKind::Decode,
3795                                    );
3796                                    self.persist_replication_health(
3797                                        "apply_error",
3798                                        "failed to decode WAL record hex payload",
3799                                        current_lsn,
3800                                        oldest_available_lsn,
3801                                    );
3802                                    continue;
3803                                };
3804                                let Ok(change) = ChangeRecord::decode(&data) else {
3805                                    self.inner.replica_apply_metrics.record(
3806                                        crate::replication::logical::ApplyErrorKind::Decode,
3807                                    );
3808                                    self.persist_replication_health(
3809                                        "apply_error",
3810                                        "failed to decode logical WAL record",
3811                                        current_lsn,
3812                                        oldest_available_lsn,
3813                                    );
3814                                    continue;
3815                                };
3816                                match applier.apply(
3817                                    self.inner.db.as_ref(),
3818                                    &change,
3819                                    ApplyMode::Replica,
3820                                ) {
3821                                    Ok(crate::replication::logical::ApplyOutcome::Applied) => {
3822                                        self.invalidate_result_cache_for_table(&change.collection);
3823                                        since_lsn = since_lsn.max(change.lsn);
3824                                        self.persist_replica_lsn(since_lsn);
3825                                    }
3826                                    Ok(_) => {
3827                                        // Idempotent / Skipped: no advance, no error.
3828                                    }
3829                                    Err(err) => {
3830                                        self.inner.replica_apply_metrics.record(err.kind());
3831                                        // Issue #205 — emit operator-grade event
3832                                        // for the two replication-fatal kinds. `Gap`
3833                                        // / `Apply` / `Decode` already persist via
3834                                        // `persist_replication_health`; the
3835                                        // OperatorEvent variants only cover the
3836                                        // two "stream is broken" / "follower
3837                                        // diverged" conditions an operator must act
3838                                        // on out-of-band.
3839                                        match &err {
3840                                            crate::replication::logical::LogicalApplyError::Divergence { lsn, expected: _, got: _ } => {
3841                                                crate::telemetry::operator_event::OperatorEvent::Divergence {
3842                                                    peer: "primary".to_string(),
3843                                                    leader_lsn: *lsn,
3844                                                    follower_lsn: since_lsn,
3845                                                }
3846                                                .emit_global();
3847                                            }
3848                                            crate::replication::logical::LogicalApplyError::Gap { last, next } => {
3849                                                crate::telemetry::operator_event::OperatorEvent::ReplicationBroken {
3850                                                    peer: "primary".to_string(),
3851                                                    reason: format!("stalled gap last={last} next={next}"),
3852                                                }
3853                                                .emit_global();
3854                                            }
3855                                            _ => {}
3856                                        }
3857                                        let kind = match &err {
3858                                            crate::replication::logical::LogicalApplyError::Gap { .. } => "stalled_gap",
3859                                            crate::replication::logical::LogicalApplyError::Divergence { .. } => "divergence",
3860                                            _ => "apply_error",
3861                                        };
3862                                        self.persist_replication_health(
3863                                            kind,
3864                                            &format!("replica apply rejected: {err}"),
3865                                            current_lsn,
3866                                            oldest_available_lsn,
3867                                        );
3868                                        // Stop applying this batch. The
3869                                        // outer loop will retry on next
3870                                        // pull, which on a real Gap will
3871                                        // not magically heal — operator
3872                                        // must rebootstrap. For
3873                                        // Divergence, we explicitly do
3874                                        // not advance; this keeps the
3875                                        // replica visibly unhealthy
3876                                        // instead of silently swallowing
3877                                        // corruption.
3878                                        break;
3879                                    }
3880                                }
3881                            }
3882                        }
3883                        self.persist_replication_health(
3884                            "healthy",
3885                            "",
3886                            current_lsn,
3887                            oldest_available_lsn,
3888                        );
3889                    } else {
3890                        self.persist_replication_health(
3891                            "apply_error",
3892                            "failed to parse pull_wal_records response",
3893                            None,
3894                            None,
3895                        );
3896                    }
3897                } else {
3898                    self.persist_replication_health(
3899                        "connecting",
3900                        "primary pull_wal_records request failed",
3901                        None,
3902                        None,
3903                    );
3904                }
3905
3906                std::thread::sleep(std::time::Duration::from_millis(poll_ms));
3907            }
3908        });
3909    }
3910
3911    /// Poll CDC events since a given LSN.
3912    pub fn cdc_poll(
3913        &self,
3914        since_lsn: u64,
3915        max_count: usize,
3916    ) -> Vec<crate::replication::cdc::ChangeEvent> {
3917        self.inner.cdc.poll(since_lsn, max_count)
3918    }
3919
3920    /// PLAN.md Phase 11.4 — current CDC LSN. Public mutation
3921    /// surfaces (HTTP query, gRPC entity ops) call this immediately
3922    /// after a successful write to feed `enforce_commit_policy`.
3923    pub fn cdc_current_lsn(&self) -> u64 {
3924        self.inner.cdc.current_lsn()
3925    }
3926
3927    pub fn kv_watch_events_since(
3928        &self,
3929        collection: &str,
3930        key: &str,
3931        since_lsn: u64,
3932        max_count: usize,
3933    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3934        self.inner
3935            .cdc
3936            .poll(since_lsn, max_count)
3937            .into_iter()
3938            .filter_map(|event| event.kv)
3939            .filter(|event| event.collection == collection && event.key == key)
3940            .collect()
3941    }
3942
3943    pub fn kv_watch_events_since_prefix(
3944        &self,
3945        collection: &str,
3946        prefix: &str,
3947        since_lsn: u64,
3948        max_count: usize,
3949    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
3950        self.inner
3951            .cdc
3952            .poll(since_lsn, max_count)
3953            .into_iter()
3954            .filter_map(|event| event.kv)
3955            .filter(|event| event.collection == collection && event.key.starts_with(prefix))
3956            .collect()
3957    }
3958
3959    pub(crate) fn kv_watch_subscribe<'a>(
3960        &'a self,
3961        collection: impl Into<String>,
3962        key: impl Into<String>,
3963        from_lsn: Option<u64>,
3964    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
3965        crate::runtime::kv_watch::KvWatchStream::subscribe(
3966            &self.inner.cdc,
3967            &self.inner.kv_stats,
3968            collection,
3969            key,
3970            from_lsn,
3971            self.kv_watch_idle_timeout_ms(),
3972        )
3973    }
3974
3975    pub(crate) fn kv_watch_subscribe_prefix<'a>(
3976        &'a self,
3977        collection: impl Into<String>,
3978        prefix: impl Into<String>,
3979        from_lsn: Option<u64>,
3980    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
3981        crate::runtime::kv_watch::KvWatchStream::subscribe_prefix(
3982            &self.inner.cdc,
3983            &self.inner.kv_stats,
3984            collection,
3985            prefix,
3986            from_lsn,
3987            self.kv_watch_idle_timeout_ms(),
3988        )
3989    }
3990
3991    pub(crate) fn kv_watch_idle_timeout_ms(&self) -> u64 {
3992        self.config_u64("red.config.kv.watch.idle_timeout_ms", 60_000)
3993    }
3994
3995    /// Get backup scheduler status.
3996    pub fn backup_status(&self) -> crate::replication::scheduler::BackupStatus {
3997        self.inner.backup_scheduler.status()
3998    }
3999
4000    /// Borrow the runtime's result Blob Cache.
4001    ///
4002    /// Wired for the `/admin/blob_cache/sweep` and
4003    /// `/admin/blob_cache/flush_namespace` HTTP handlers (issue #148
4004    /// follow-up): both delegate to
4005    /// `crate::storage::cache::sweeper::BlobCacheSweeper`, which takes a
4006    /// `&BlobCache`. Also used by `trigger_backup` when
4007    /// `red.config.backup.include_blob_cache=true` to locate the L2
4008    /// directory for archival.
4009    pub fn result_blob_cache(&self) -> &crate::storage::cache::BlobCache {
4010        &self.inner.result_blob_cache
4011    }
4012
4013    /// PLAN.md Phase 11.4 — owned snapshot of every registered
4014    /// replica's state on this primary. Returns empty vec on
4015    /// non-primary instances or when no replicas are registered yet.
4016    pub fn primary_replica_snapshots(&self) -> Vec<crate::replication::primary::ReplicaState> {
4017        self.inner
4018            .db
4019            .replication
4020            .as_ref()
4021            .map(|repl| repl.replica_snapshots())
4022            .unwrap_or_default()
4023    }
4024
4025    /// PLAN.md Phase 11.4 — active commit policy. Reads
4026    /// `RED_PRIMARY_COMMIT_POLICY` once at runtime construction;
4027    /// future env reloads will need a reload endpoint. Default is
4028    /// `Local` — current behavior, no replica blocking.
4029    pub fn commit_policy(&self) -> crate::replication::CommitPolicy {
4030        crate::replication::CommitPolicy::from_env()
4031    }
4032
4033    /// PLAN.md Phase 11.5 — accessor for replica-side apply error
4034    /// counters (gap / divergence / apply / decode). Returned
4035    /// snapshot is consistent across the four counters; the labels
4036    /// match `reddb_replica_apply_errors_total{kind}`.
4037    pub fn replica_apply_error_counts(
4038        &self,
4039    ) -> [(crate::replication::logical::ApplyErrorKind, u64); 4] {
4040        self.inner.replica_apply_metrics.snapshot()
4041    }
4042
4043    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
4044    /// returned; `is_configured()` lets callers short-circuit.
4045    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
4046        &self.inner.quota_bucket
4047    }
4048
4049    /// PLAN.md Phase 11.4 — observability snapshot of every
4050    /// replica's durable LSN as known to the commit waiter. Empty
4051    /// vec on non-primary instances or when no replica has acked.
4052    pub fn commit_waiter_snapshot(&self) -> Vec<(String, u64)> {
4053        self.inner
4054            .db
4055            .replication
4056            .as_ref()
4057            .map(|repl| repl.commit_waiter.snapshot())
4058            .unwrap_or_default()
4059    }
4060
4061    /// PLAN.md Phase 11.4 — `(reached, timed_out, not_required, last_micros)`
4062    /// counters for /metrics. Always-zero on non-primary instances.
4063    pub fn commit_waiter_metrics_snapshot(&self) -> (u64, u64, u64, u64) {
4064        self.inner
4065            .db
4066            .replication
4067            .as_ref()
4068            .map(|repl| repl.commit_waiter.metrics_snapshot())
4069            .unwrap_or((0, 0, 0, 0))
4070    }
4071
4072    /// PLAN.md Phase 11.4 — block until at least `count` replicas
4073    /// have durably applied through `target_lsn`, or `timeout`
4074    /// elapses. Returns the `AwaitOutcome` so the caller can decide
4075    /// whether to surface a timeout error to the client or continue
4076    /// (the policy mapping lives in the commit dispatcher).
4077    ///
4078    /// Foundation only — the write commit path doesn't yet call
4079    /// this. Wiring it is a per-surface task gated on the operator
4080    /// flipping `RED_PRIMARY_COMMIT_POLICY` away from `local`.
4081    pub fn await_replica_acks(
4082        &self,
4083        target_lsn: u64,
4084        count: u32,
4085        timeout: std::time::Duration,
4086    ) -> crate::replication::AwaitOutcome {
4087        match &self.inner.db.replication {
4088            Some(repl) => repl.commit_waiter.await_acks(target_lsn, count, timeout),
4089            None => {
4090                // No replication configured: policy must be `Local`.
4091                // Treat as immediate `NotRequired` so callers don't
4092                // block on a degenerate setup.
4093                crate::replication::AwaitOutcome::NotRequired
4094            }
4095        }
4096    }
4097
4098    /// PLAN.md Phase 11.4 — enforce the configured commit policy
4099    /// against `post_lsn` (the LSN of the just-completed write).
4100    /// Returns `Ok(AwaitOutcome)` on every successful enforcement
4101    /// (including `Reached` and `TimedOut` when fail-on-timeout is
4102    /// off). Returns `Err(ReadOnly)` only when:
4103    ///   * policy is `AckN(n)` with `n > 0`
4104    ///   * the wait timed out
4105    ///   * `RED_COMMIT_FAIL_ON_TIMEOUT=true` is set
4106    ///
4107    /// The HTTP / gRPC / wire surfaces map the error to 504 / wire
4108    /// backoff. Default behaviour (env unset) logs warn and returns
4109    /// success — matches PLAN.md "default v1 stays local" semantics
4110    /// while still letting the operator opt into hard-blocking.
4111    pub fn enforce_commit_policy(
4112        &self,
4113        post_lsn: u64,
4114    ) -> RedDBResult<crate::replication::AwaitOutcome> {
4115        let n = match self.commit_policy() {
4116            crate::replication::CommitPolicy::AckN(n) if n > 0 => n,
4117            _ => return Ok(crate::replication::AwaitOutcome::NotRequired),
4118        };
4119        let timeout_ms = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
4120            .ok()
4121            .and_then(|v| v.parse::<u64>().ok())
4122            .unwrap_or(5_000);
4123        let outcome =
4124            self.await_replica_acks(post_lsn, n, std::time::Duration::from_millis(timeout_ms));
4125        if let crate::replication::AwaitOutcome::TimedOut { observed, required } = &outcome {
4126            tracing::warn!(
4127                target: "reddb::commit",
4128                post_lsn,
4129                observed = *observed,
4130                required = *required,
4131                timeout_ms,
4132                "ack_n: timed out waiting for replicas"
4133            );
4134            let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
4135                .ok()
4136                .map(|v| {
4137                    let t = v.trim();
4138                    t.eq_ignore_ascii_case("true") || t == "1" || t.eq_ignore_ascii_case("yes")
4139                })
4140                .unwrap_or(false);
4141            if fail {
4142                return Err(RedDBError::ReadOnly(format!(
4143                    "commit policy timed out at lsn {post_lsn}: observed={observed} required={required} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
4144                )));
4145            }
4146        }
4147        Ok(outcome)
4148    }
4149
4150    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
4151    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
4152    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
4153    /// when the operator set the env but it doesn't parse, and
4154    /// `("disabled", None)` when no key is configured. The pager
4155    /// hookup is deferred — this accessor surfaces the operator's
4156    /// intent for /admin/status without yet using the key in writes.
4157    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
4158        match crate::crypto::page_encryption::key_from_env() {
4159            Ok(Some(_)) => ("enabled", None),
4160            Ok(None) => ("disabled", None),
4161            Err(err) => ("error", Some(err)),
4162        }
4163    }
4164
4165    /// PLAN.md Phase 11.5 — current replica apply health label
4166    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
4167    /// `stalled_gap`). Read from the persisted `red.replication.state`
4168    /// config key updated by the replica loop. Returns `None` on
4169    /// non-replica instances or when no apply has run yet.
4170    pub fn replica_apply_health(&self) -> Option<String> {
4171        let state = self.config_string("red.replication.state", "");
4172        if state.is_empty() {
4173            None
4174        } else {
4175            Some(state)
4176        }
4177    }
4178
4179    /// Current local LSN paired with the LSN of the most recently
4180    /// archived WAL segment. The difference is the replication /
4181    /// archive lag operators alert on (PLAN.md Phase 5.1). Returns
4182    /// `(0, 0)` when neither replication nor archiving is configured.
4183    pub fn wal_archive_progress(&self) -> (u64, u64) {
4184        let current_lsn = self
4185            .inner
4186            .db
4187            .replication
4188            .as_ref()
4189            .map(|repl| {
4190                repl.logical_wal_spool
4191                    .as_ref()
4192                    .map(|spool| spool.current_lsn())
4193                    .unwrap_or_else(|| repl.wal_buffer.current_lsn())
4194            })
4195            .unwrap_or_else(|| self.inner.cdc.current_lsn());
4196        let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
4197        (current_lsn, last_archived_lsn)
4198    }
4199
4200    /// Trigger an immediate backup.
4201    pub fn trigger_backup(&self) -> RedDBResult<crate::replication::scheduler::BackupResult> {
4202        self.check_write(crate::runtime::write_gate::WriteKind::Backup)?;
4203        // Defense in depth — check_write above already rejects when
4204        // the lease is NotHeld, but log + audit the lease angle here
4205        // explicitly so dashboards distinguish "lease lost" from a
4206        // generic read-only refusal.
4207        self.assert_remote_write_allowed("admin/backup")?;
4208        let started = std::time::Instant::now();
4209        let snapshot = self.create_snapshot()?;
4210        let mut uploaded = false;
4211
4212        if let (Some(backend), Some(path)) = (&self.inner.db.remote_backend, self.inner.db.path()) {
4213            let default_snapshot_prefix = self.inner.db.options().default_snapshot_prefix();
4214            let default_wal_prefix = self.inner.db.options().default_wal_archive_prefix();
4215            let default_head_key = self.inner.db.options().default_backup_head_key();
4216            let snapshot_prefix = self.config_string(
4217                "red.config.backup.snapshot_prefix",
4218                &default_snapshot_prefix,
4219            );
4220            let wal_prefix =
4221                self.config_string("red.config.wal.archive.prefix", &default_wal_prefix);
4222            let head_key = self.config_string("red.config.backup.head_key", &default_head_key);
4223            let timeline_id = self.config_string("red.config.timeline.id", "main");
4224            let snapshot_key = crate::storage::wal::archive_snapshot(
4225                backend.as_ref(),
4226                path,
4227                snapshot.snapshot_id,
4228                &snapshot_prefix,
4229            )
4230            .map_err(|err| RedDBError::Internal(err.to_string()))?;
4231            let current_lsn = self
4232                .inner
4233                .db
4234                .replication
4235                .as_ref()
4236                .map(|repl| {
4237                    repl.logical_wal_spool
4238                        .as_ref()
4239                        .map(|spool| spool.current_lsn())
4240                        .unwrap_or_else(|| repl.wal_buffer.current_lsn())
4241                })
4242                .unwrap_or_else(|| self.inner.cdc.current_lsn());
4243            let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
4244            // Hash the local snapshot bytes so the manifest can carry
4245            // the digest for restore-side verification (PLAN.md
4246            // Phase 4). Failure to hash is non-fatal — we still
4247            // publish the manifest, just without a checksum, so a
4248            // future fix can backfill rather than losing the backup.
4249            let snapshot_sha256 =
4250                crate::storage::wal::SnapshotManifest::compute_snapshot_sha256(path)
4251                    .map_err(|err| {
4252                        tracing::warn!(
4253                            target: "reddb::backup",
4254                            error = %err,
4255                            snapshot_id = snapshot.snapshot_id,
4256                            "snapshot hash failed; manifest will lack checksum"
4257                        );
4258                    })
4259                    .ok();
4260            let manifest = crate::storage::wal::SnapshotManifest {
4261                timeline_id: timeline_id.clone(),
4262                snapshot_key: snapshot_key.clone(),
4263                snapshot_id: snapshot.snapshot_id,
4264                snapshot_time: snapshot.created_at_unix_ms as u64,
4265                base_lsn: current_lsn,
4266                schema_version: crate::api::REDDB_FORMAT_VERSION,
4267                format_version: crate::api::REDDB_FORMAT_VERSION,
4268                snapshot_sha256,
4269            };
4270            crate::storage::wal::publish_snapshot_manifest(backend.as_ref(), &manifest)
4271                .map_err(|err| RedDBError::Internal(err.to_string()))?;
4272
4273            // PLAN.md Phase 11.3 — read the head of the WAL hash chain
4274            // so the new segment can link back. `None` means we're
4275            // starting a fresh timeline (after a clean restore or on
4276            // first archive ever); the segment's `prev_hash` will be
4277            // `None` and restore-side validation accepts that only for
4278            // the first segment in `plan.wal_segments`.
4279            let prev_segment_hash = self.config_string("red.config.timeline.last_segment_hash", "");
4280            let prev_hash_arg = if prev_segment_hash.is_empty() {
4281                None
4282            } else {
4283                Some(prev_segment_hash)
4284            };
4285
4286            let archived_lsn = if let Some(primary) = &self.inner.db.replication {
4287                let oldest = primary
4288                    .logical_wal_spool
4289                    .as_ref()
4290                    .and_then(|spool| spool.oldest_lsn().ok().flatten())
4291                    .or_else(|| primary.wal_buffer.oldest_lsn())
4292                    .unwrap_or(last_archived_lsn);
4293                if last_archived_lsn > 0 && last_archived_lsn < oldest.saturating_sub(1) {
4294                    return Err(RedDBError::Internal(format!(
4295                        "logical WAL gap detected: last_archived_lsn={last_archived_lsn}, oldest_available_lsn={oldest}"
4296                    )));
4297                }
4298                let records = if let Some(spool) = &primary.logical_wal_spool {
4299                    spool
4300                        .read_since(last_archived_lsn, usize::MAX)
4301                        .map_err(|err| RedDBError::Internal(err.to_string()))?
4302                } else {
4303                    primary.wal_buffer.read_since(last_archived_lsn, usize::MAX)
4304                };
4305                if let Some(meta) = crate::storage::wal::archive_change_records(
4306                    backend.as_ref(),
4307                    &wal_prefix,
4308                    &records,
4309                    prev_hash_arg,
4310                )
4311                .map_err(|err| RedDBError::Internal(err.to_string()))?
4312                {
4313                    if let Some(spool) = &primary.logical_wal_spool {
4314                        let _ = spool.prune_through(meta.lsn_end);
4315                    }
4316                    // Advance the chain head so the next archive call
4317                    // links to this segment's hash. If the segment has
4318                    // no sha256 (legacy / hashing failed) we leave the
4319                    // head as-is — the next segment then carries the
4320                    // prior chain head, preserving continuity.
4321                    if let Some(sha) = &meta.sha256 {
4322                        self.inner.db.store().set_config_tree(
4323                            "red.config.timeline",
4324                            &crate::json!({ "last_segment_hash": sha }),
4325                        );
4326                    }
4327                    meta.lsn_end
4328                } else {
4329                    last_archived_lsn
4330                }
4331            } else {
4332                last_archived_lsn
4333            };
4334
4335            let head = crate::storage::wal::BackupHead {
4336                timeline_id,
4337                snapshot_key,
4338                snapshot_id: snapshot.snapshot_id,
4339                snapshot_time: snapshot.created_at_unix_ms as u64,
4340                current_lsn,
4341                last_archived_lsn: archived_lsn,
4342                wal_prefix,
4343            };
4344            crate::storage::wal::publish_backup_head(backend.as_ref(), &head_key, &head)
4345                .map_err(|err| RedDBError::Internal(err.to_string()))?;
4346            self.inner.db.store().set_config_tree(
4347                "red.config.timeline",
4348                &crate::json!({
4349                    "last_archived_lsn": archived_lsn,
4350                    "id": head.timeline_id
4351                }),
4352            );
4353
4354            // PLAN.md Phase 2.4 — refresh the unified `MANIFEST.json`
4355            // at the prefix root so external tooling sees a single
4356            // catalog of every snapshot + WAL segment with their
4357            // checksums. Best-effort: a manifest publish failure
4358            // doesn't fail the backup (the per-artifact sidecars
4359            // already give restore-side integrity), but it does log
4360            // so dashboards can flag stale catalogs.
4361            if let Err(err) = crate::storage::wal::publish_unified_manifest_for_prefix(
4362                backend.as_ref(),
4363                &snapshot_prefix,
4364            ) {
4365                tracing::warn!(
4366                    target: "reddb::backup",
4367                    error = %err,
4368                    snapshot_prefix = %snapshot_prefix,
4369                    "unified MANIFEST.json refresh failed; per-artifact sidecars unaffected"
4370                );
4371            }
4372
4373            // PLAN.md Phase 11.4 — when the operator picked a
4374            // commit policy that demands replica durability, block
4375            // until the configured count of replicas has acked the
4376            // archived LSN (or the timeout fires). For backup the
4377            // policy decides the *DR posture* — `local` returns
4378            // immediately, `ack_n` ensures at least N replicas saw
4379            // the new tail before we report success to the
4380            // operator. A `TimedOut` is logged but does NOT fail
4381            // the backup: the local WAL + remote upload are durable
4382            // regardless; the missing acks are reported via
4383            // /metrics and /admin/status so the operator can decide.
4384            match self.commit_policy() {
4385                crate::replication::CommitPolicy::AckN(n) if n > 0 => {
4386                    let timeout = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
4387                        .ok()
4388                        .and_then(|v| v.parse::<u64>().ok())
4389                        .unwrap_or(5_000);
4390                    let outcome = self.await_replica_acks(
4391                        archived_lsn,
4392                        n,
4393                        std::time::Duration::from_millis(timeout),
4394                    );
4395                    match outcome {
4396                        crate::replication::AwaitOutcome::Reached(count) => {
4397                            tracing::debug!(
4398                                target: "reddb::backup",
4399                                archived_lsn,
4400                                n,
4401                                count,
4402                                "ack_n: replicas synced before backup return"
4403                            );
4404                        }
4405                        crate::replication::AwaitOutcome::TimedOut { observed, required } => {
4406                            tracing::warn!(
4407                                target: "reddb::backup",
4408                                archived_lsn,
4409                                observed,
4410                                required,
4411                                timeout_ms = timeout,
4412                                "ack_n: timed out waiting for replicas; backup uploaded but DR posture degraded"
4413                            );
4414                        }
4415                        crate::replication::AwaitOutcome::NotRequired => {}
4416                    }
4417                }
4418                _ => {} // Local / RemoteWal / Quorum: no blocking yet
4419            }
4420
4421            // Issue #148 follow-up — opt-in archive of the L2 Blob Cache
4422            // directory tree. Default off so a standard backup stays
4423            // small; flip via `red.config.backup.include_blob_cache=true`
4424            // when warm-cache restore is required (per
4425            // docs/operations/blob-cache-backup-restore.md §1).
4426            //
4427            // The L2 tree is *derived* state (ADR 0006) — its absence
4428            // never causes data loss; it only affects post-restore
4429            // p99 latency until the cache re-warms. We therefore log
4430            // (not fail) on per-file upload errors so a partial L2
4431            // upload never aborts a healthy snapshot+WAL backup.
4432            if self.config_bool("red.config.backup.include_blob_cache", false) {
4433                let blob_cache_prefix = self.config_string(
4434                    "red.config.backup.blob_cache_prefix",
4435                    &format!("{snapshot_prefix}blob_cache/"),
4436                );
4437                if let Some(l2_path) = self.inner.result_blob_cache.l2_path() {
4438                    match crate::storage::cache::archive_blob_cache_l2(
4439                        backend.as_ref(),
4440                        l2_path,
4441                        &blob_cache_prefix,
4442                    ) {
4443                        Ok(count) => {
4444                            tracing::info!(
4445                                target: "reddb::backup",
4446                                files_uploaded = count,
4447                                blob_cache_prefix = %blob_cache_prefix,
4448                                "include_blob_cache: archived L2 directory"
4449                            );
4450                        }
4451                        Err(err) => {
4452                            tracing::warn!(
4453                                target: "reddb::backup",
4454                                error = %err,
4455                                blob_cache_prefix = %blob_cache_prefix,
4456                                "include_blob_cache: L2 archive failed; backup proceeding (cache is derived state)"
4457                            );
4458                        }
4459                    }
4460                } else {
4461                    tracing::debug!(
4462                        target: "reddb::backup",
4463                        "include_blob_cache=true but no L2 path configured; nothing to archive"
4464                    );
4465                }
4466            }
4467
4468            uploaded = true;
4469        }
4470
4471        Ok(crate::replication::scheduler::BackupResult {
4472            snapshot_id: snapshot.snapshot_id,
4473            uploaded,
4474            duration_ms: started.elapsed().as_millis() as u64,
4475            timestamp: snapshot.created_at_unix_ms as u64,
4476        })
4477    }
4478
4479    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
4480        let mut pool = self
4481            .inner
4482            .pool
4483            .lock()
4484            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
4485        if pool.active >= self.inner.pool_config.max_connections {
4486            return Err(RedDBError::Internal(
4487                "connection pool exhausted".to_string(),
4488            ));
4489        }
4490
4491        let id = if let Some(id) = pool.idle.pop() {
4492            id
4493        } else {
4494            let id = pool.next_id;
4495            pool.next_id += 1;
4496            id
4497        };
4498        pool.active += 1;
4499        pool.total_checkouts += 1;
4500        drop(pool);
4501
4502        Ok(RuntimeConnection {
4503            id,
4504            inner: Arc::clone(&self.inner),
4505        })
4506    }
4507
4508    pub fn checkpoint(&self) -> RedDBResult<()> {
4509        // Local fsync always allowed — losing the lease shouldn't
4510        // prevent us from durably persisting what's already in memory.
4511        // The remote upload is the side-effect that risks clobbering a
4512        // peer's state, so it's behind the lease gate.
4513        self.inner.db.flush_local_only().map_err(|err| {
4514            // Issue #205 — local flush failure is a CheckpointFailed
4515            // operator-grade event. The local-flush path also covers
4516            // the WAL fsync we depend on, so a failure here doubles as
4517            // the WalFsyncFailed signal for the runtime entry point.
4518            let msg = err.to_string();
4519            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
4520                lsn: 0,
4521                error: msg.clone(),
4522            }
4523            .emit_global();
4524            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
4525                path: "<flush_local_only>".to_string(),
4526                error: msg.clone(),
4527            }
4528            .emit_global();
4529            RedDBError::Engine(msg)
4530        })?;
4531        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
4532            tracing::warn!(
4533                target: "reddb::serverless::lease",
4534                error = %err,
4535                "checkpoint: skipping remote upload — lease not held"
4536            );
4537            return Ok(());
4538        }
4539        self.inner
4540            .db
4541            .upload_to_remote_backend()
4542            .map_err(|err| RedDBError::Engine(err.to_string()))
4543    }
4544
4545    /// Guard remote-mutating operations on the writer lease.
4546    /// Returns `Ok(())` when no remote backend is configured (the
4547    /// lease is irrelevant) or the lease state is `NotRequired` /
4548    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
4549    /// `NotHeld`, with an audit-friendly action label so the caller
4550    /// can record the rejection.
4551    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
4552        if self.inner.db.remote_backend.is_none() {
4553            return Ok(());
4554        }
4555        match self.inner.write_gate.lease_state() {
4556            crate::runtime::write_gate::LeaseGateState::NotHeld => {
4557                self.inner.audit_log.record(
4558                    action,
4559                    "system",
4560                    "remote_backend",
4561                    "err: writer lease not held",
4562                    crate::json::Value::Null,
4563                );
4564                Err(RedDBError::ReadOnly(format!(
4565                    "writer lease not held — {action} blocked (serverless fence)"
4566                )))
4567            }
4568            _ => Ok(()),
4569        }
4570    }
4571
4572    pub fn run_maintenance(&self) -> RedDBResult<()> {
4573        self.inner
4574            .db
4575            .run_maintenance()
4576            .map_err(|err| RedDBError::Internal(err.to_string()))
4577    }
4578
4579    pub fn scan_collection(
4580        &self,
4581        collection: &str,
4582        cursor: Option<ScanCursor>,
4583        limit: usize,
4584    ) -> RedDBResult<ScanPage> {
4585        let store = self.inner.db.store();
4586        let manager = store
4587            .get_collection(collection)
4588            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
4589
4590        let mut entities = manager.query_all(|_| true);
4591        entities.sort_by_key(|entity| entity.id.raw());
4592
4593        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
4594        let total = entities.len();
4595        let end = total.min(offset.saturating_add(limit.max(1)));
4596        let items = if offset >= total {
4597            Vec::new()
4598        } else {
4599            entities[offset..end].to_vec()
4600        };
4601        let next = (end < total).then_some(ScanCursor { offset: end });
4602
4603        Ok(ScanPage {
4604            collection: collection.to_string(),
4605            items,
4606            next,
4607            total,
4608        })
4609    }
4610
4611    pub fn catalog(&self) -> CatalogModelSnapshot {
4612        self.inner.db.catalog_model_snapshot()
4613    }
4614
4615    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
4616        self.inner.db.catalog_consistency_report()
4617    }
4618
4619    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
4620        crate::catalog::attention_summary(&self.catalog())
4621    }
4622
4623    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
4624        crate::catalog::collection_attention(&self.catalog())
4625    }
4626
4627    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
4628        crate::catalog::index_attention(&self.catalog())
4629    }
4630
4631    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
4632        crate::catalog::graph_projection_attention(&self.catalog())
4633    }
4634
4635    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
4636        crate::catalog::analytics_job_attention(&self.catalog())
4637    }
4638
4639    pub fn stats(&self) -> RuntimeStats {
4640        let pool = runtime_pool_lock(self);
4641        RuntimeStats {
4642            active_connections: pool.active,
4643            idle_connections: pool.idle.len(),
4644            total_checkouts: pool.total_checkouts,
4645            paged_mode: self.inner.db.is_paged(),
4646            started_at_unix_ms: self.inner.started_at_unix_ms,
4647            store: self.inner.db.stats(),
4648            system: SystemInfo::collect(),
4649            result_blob_cache: self.inner.result_blob_cache.stats(),
4650            kv: self.inner.kv_stats.snapshot(),
4651            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
4652        }
4653    }
4654
4655    pub(crate) fn record_metrics_ingest(
4656        &self,
4657        accepted_samples: u64,
4658        accepted_series: u64,
4659        rejected_samples: u64,
4660        rejected_series: u64,
4661    ) {
4662        self.inner.metrics_ingest_stats.record(
4663            accepted_samples,
4664            accepted_series,
4665            rejected_samples,
4666            rejected_series,
4667        );
4668    }
4669
4670    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
4671        self.inner
4672            .metrics_ingest_stats
4673            .record_cardinality_budget_rejections(rejected_series);
4674    }
4675
4676    pub(crate) fn record_metrics_tenant_activity(
4677        &self,
4678        tenant: &str,
4679        namespace: &str,
4680        operation: &str,
4681    ) {
4682        self.inner
4683            .metrics_tenant_activity_stats
4684            .record(tenant, namespace, operation);
4685    }
4686
4687    pub(crate) fn metrics_tenant_activity_snapshot(
4688        &self,
4689    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
4690        self.inner.metrics_tenant_activity_stats.snapshot()
4691    }
4692
4693    /// Execute a query under a typed scope override without embedding
4694    /// the tenant / user / role values into the SQL string. Use this
4695    /// from transport middleware (HTTP / gRPC / worker loops) where the
4696    /// scope is resolved from auth claims and the SQL is a parameterised
4697    /// template — avoids the string-concat injection risk of building
4698    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4699    /// prepared statements that didn't know about tenancy.
4700    ///
4701    /// Precedence matches the `WITHIN` clause: the passed `scope`
4702    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4703    /// The override is pushed on the thread-local scope stack for the
4704    /// duration of the call and popped on return — pool-shared
4705    /// connections cannot leak it across requests.
4706    pub fn execute_query_with_scope(
4707        &self,
4708        query: &str,
4709        scope: crate::runtime::within_clause::ScopeOverride,
4710    ) -> RedDBResult<RuntimeQueryResult> {
4711        if scope.is_empty() {
4712            return self.execute_query(query);
4713        }
4714        let _scope_guard = ScopeOverrideGuard::install(scope);
4715        self.execute_query(query)
4716    }
4717
4718    /// Issue #205 — single lifecycle exit for slow-query logging.
4719    ///
4720    /// `execute_query_inner` does the real work; this wrapper times it
4721    /// and, if elapsed exceeds the configured threshold, hands the
4722    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
4723    /// SlowQueryLogger. The threshold + sample_pct were captured at
4724    /// SlowQueryLogger construction (runtime startup), so the per-call
4725    /// cost on below-threshold paths is one relaxed atomic load.
4726    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4727        let started = std::time::Instant::now();
4728        let result = self.execute_query_inner(query);
4729        let elapsed_ms = started.elapsed().as_millis() as u64;
4730
4731        // Build EffectiveScope from the same thread-locals frame-build
4732        // consults — keeps the slow-log row consistent with the audit /
4733        // RLS view of "this statement". `ai_scope()` is the canonical
4734        // builder.
4735        let scope = self.ai_scope();
4736        let kind = match result
4737            .as_ref()
4738            .map(|r| r.statement_type)
4739            .unwrap_or("select")
4740        {
4741            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4742            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4743            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4744            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4745            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4746        };
4747        // SQL redaction: pass the raw query through. The slow-query
4748        // logger writes structured JSON so embedded literals stay
4749        // escape-safe at the JSON boundary (proven by
4750        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4751        // PII redaction (e.g. literal masking) is a follow-up.
4752        self.inner
4753            .slow_query_logger
4754            .record(kind, elapsed_ms, query.to_string(), &scope);
4755
4756        result
4757    }
4758
4759    #[inline(never)]
4760    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4761        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4762        //
4763        // Moved above every boot-cost the normal path pays (WITHIN
4764        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
4765        // guard, tracing span, tx_contexts read) because the bench's
4766        // `select_point` scenario was observed at 28× vs PostgreSQL —
4767        // the dominant cost wasn't the entity fetch but the ceremony
4768        // before it. Only fires when there's no ambient transaction
4769        // context or WITHIN override, so the snapshot install we skip
4770        // truly is a no-op for this query.
4771        if !has_scope_override_active()
4772            && !query.trim_start().starts_with("WITHIN")
4773            && !query.trim_start().starts_with("within")
4774            && !self
4775                .inner
4776                .tx_contexts
4777                .read()
4778                .contains_key(&current_connection_id())
4779        {
4780            if let Some(result) = self.try_fast_entity_lookup(query) {
4781                return result;
4782            }
4783        }
4784
4785        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4786        // strip the prefix, push a stack-scoped override, recurse on
4787        // the inner statement, pop on return. Stack lives in a
4788        // thread-local but is balanced by the RAII guard, so a
4789        // pool-shared connection cannot leak the override across
4790        // requests and an early `?` return still pops cleanly.
4791        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4792            Ok(Some((scope, inner))) => {
4793                let _scope_guard = ScopeOverrideGuard::install(scope);
4794                // Re-enter the inner path, NOT `execute_query`, so the
4795                // slow-query lifecycle hook records exactly one row per
4796                // top-level statement (the WITHIN-stripped form would
4797                // double-record).
4798                return self.execute_query_inner(inner);
4799            }
4800            Ok(None) => {}
4801            Err(msg) => return Err(RedDBError::Query(msg)),
4802        }
4803
4804        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4805        // inner statement (WITHOUT executing it) and returns the
4806        // CanonicalLogicalNode tree as rows so the caller can see the
4807        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4808        // is a distinct schema-diff command and continues down the
4809        // regular SQL path.
4810        if let Some(inner) = strip_explain_prefix(query) {
4811            return self.explain_as_rows(query, inner);
4812        }
4813
4814        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4815        // override and return. Outside a transaction the statement is
4816        // an error (matches PG semantics: SET LOCAL only takes effect
4817        // within an active transaction).
4818        if let Some(value) = parse_set_local_tenant(query)? {
4819            let conn_id = current_connection_id();
4820            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4821                return Err(RedDBError::Query(
4822                    "SET LOCAL TENANT requires an active transaction".to_string(),
4823                ));
4824            }
4825            self.inner
4826                .tx_local_tenants
4827                .write()
4828                .insert(conn_id, value.clone());
4829            return Ok(RuntimeQueryResult::ok_message(
4830                query.to_string(),
4831                &match &value {
4832                    Some(id) => format!("local tenant set: {id}"),
4833                    None => "local tenant cleared".to_string(),
4834                },
4835                "set_local_tenant",
4836            ));
4837        }
4838
4839        if super::red_schema::is_system_schema_write(query) {
4840            return Err(RedDBError::Query(
4841                super::red_schema::READ_ONLY_ERROR.to_string(),
4842            ));
4843        }
4844
4845        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4846        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4847
4848        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4849        let _frame_guards = frame.install(self);
4850
4851        // Phase 6 logging: enter a span stamped with conn_id / tenant
4852        // / query_len. Every downstream tracing::info!/warn!/error!
4853        // inherits these fields — no need to thread them manually
4854        // through storage/scan layers. Entered AFTER the WITHIN /
4855        // SET LOCAL TENANT resolution above so the span reflects the
4856        // effective scope for this statement.
4857        let _log_span = crate::telemetry::span::query_span(query).entered();
4858
4859        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4860        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4861            return self.execute_query_expr(rewritten);
4862        }
4863
4864        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4865        if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4866            return result;
4867        }
4868
4869        // ── Result cache: return cached result if still fresh (30s TTL) ──
4870        if let Some(result) = frame.read_result_cache(self) {
4871            return Ok(result);
4872        }
4873
4874        let prepared = frame.prepare_statement(self, execution_query)?;
4875        let mode = prepared.mode;
4876        let expr = prepared.expr;
4877
4878        let statement = query_expr_name(&expr);
4879        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4880
4881        let _lock_guard = frame.prepare_dispatch(self, &expr)?;
4882        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4883
4884        let query_result = match expr {
4885            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4886                // Apply MVCC visibility + RLS gate while materialising the
4887                // graph: every node entity is screened against the source
4888                // collection's policy chain (basic and `Nodes`-targeted)
4889                // and dropped when the caller's tenant / role doesn't
4890                // admit it. Edges are pruned automatically because the
4891                // graph builder skips edges whose endpoints aren't in
4892                // `allowed_nodes`.
4893                let (graph, node_properties, edge_properties) =
4894                    self.materialize_graph_with_rls()?;
4895                let result =
4896                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
4897                        &graph,
4898                        &expr,
4899                        node_properties,
4900                        edge_properties,
4901                    )
4902                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4903
4904                Ok(RuntimeQueryResult {
4905                    query: query.to_string(),
4906                    mode,
4907                    statement,
4908                    engine: "materialized-graph",
4909                    result,
4910                    affected_rows: 0,
4911                    statement_type: "select",
4912                })
4913            }
4914            QueryExpr::Table(table) => {
4915                let table = self.resolve_table_expr_subqueries(
4916                    table,
4917                    &frame as &dyn super::statement_frame::ReadFrame,
4918                )?;
4919                if super::red_schema::is_virtual_table(&table.table) {
4920                    return Ok(RuntimeQueryResult {
4921                        query: query.to_string(),
4922                        mode,
4923                        statement,
4924                        engine: "runtime-red-schema",
4925                        result: super::red_schema::red_query(
4926                            self,
4927                            &table.table,
4928                            &table,
4929                            &frame as &dyn super::statement_frame::ReadFrame,
4930                        )?,
4931                        affected_rows: 0,
4932                        statement_type: "select",
4933                    });
4934                }
4935
4936                if let Some(result) = self.execute_probabilistic_select(&table)? {
4937                    return Ok(RuntimeQueryResult {
4938                        query: query.to_string(),
4939                        mode,
4940                        statement,
4941                        engine: "runtime-probabilistic",
4942                        result,
4943                        affected_rows: 0,
4944                        statement_type: "select",
4945                    });
4946                }
4947
4948                // Foreign-table intercept (Phase 3.2.2 PG parity).
4949                //
4950                // When the referenced table matches a `CREATE FOREIGN TABLE`
4951                // registration, short-circuit into the FDW scan. Phase 3.2
4952                // wrappers don't yet support pushdown, so filters/projections
4953                // apply post-scan via `apply_foreign_table_filters` — good
4954                // enough for correctness; perf work lands in 3.2.3.
4955                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4956                    let records = self
4957                        .inner
4958                        .foreign_tables
4959                        .scan(&table.table)
4960                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4961                    let result = apply_foreign_table_filters(records, &table);
4962                    return Ok(RuntimeQueryResult {
4963                        query: query.to_string(),
4964                        mode,
4965                        statement,
4966                        engine: "runtime-fdw",
4967                        result,
4968                        affected_rows: 0,
4969                        statement_type: "select",
4970                    });
4971                }
4972
4973                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4974                //
4975                // When RLS is enabled on this table, fetch every policy
4976                // that applies to the current (role, SELECT) pair and
4977                // fold them into the query's WHERE clause: policies
4978                // OR-combine (any of them admitting the row is enough),
4979                // then AND into the caller's existing filter.
4980                //
4981                // Anonymous callers (no thread-local identity) pass
4982                // `role = None`; policies with a specific `TO role`
4983                // clause skip, but `TO PUBLIC` policies still apply.
4984                //
4985                // When `authorize_relational_table_select` returns `None` the table has
4986                // RLS enabled but no policy admits the caller's role —
4987                // short-circuit with an empty result set instead of
4988                // synthesising a contradiction filter.
4989                let Some(table_with_rls) = self.authorize_relational_table_select(
4990                    table,
4991                    &frame as &dyn super::statement_frame::ReadFrame,
4992                )?
4993                else {
4994                    let empty = crate::storage::query::unified::UnifiedResult::empty();
4995                    return Ok(RuntimeQueryResult {
4996                        query: query.to_string(),
4997                        mode,
4998                        statement,
4999                        engine: "runtime-table-rls",
5000                        result: empty,
5001                        affected_rows: 0,
5002                        statement_type: "select",
5003                    });
5004                };
5005                Ok(RuntimeQueryResult {
5006                    query: query.to_string(),
5007                    mode,
5008                    statement,
5009                    engine: "runtime-table",
5010                    result: execute_runtime_table_query(
5011                        &self.inner.db,
5012                        &table_with_rls,
5013                        Some(&self.inner.index_store),
5014                    )?,
5015                    affected_rows: 0,
5016                    statement_type: "select",
5017                })
5018            }
5019            QueryExpr::Join(join) => {
5020                // Fold per-table RLS filters into each `QueryExpr::Table`
5021                // leaf of the join tree before executing. Without this
5022                // the join executor scans both tables raw and ignores
5023                // policies — a `WITHIN TENANT 'x'` against a join of
5024                // two tenant-scoped tables would leak cross-tenant rows.
5025                // When any leaf has RLS enabled and zero matching policy,
5026                // short-circuit to an empty join result instead of
5027                // emitting a contradiction filter.
5028                let join_with_rls = match self.authorize_relational_join_select(
5029                    join,
5030                    &frame as &dyn super::statement_frame::ReadFrame,
5031                )? {
5032                    Some(j) => j,
5033                    None => {
5034                        return Ok(RuntimeQueryResult {
5035                            query: query.to_string(),
5036                            mode,
5037                            statement,
5038                            engine: "runtime-join-rls",
5039                            result: crate::storage::query::unified::UnifiedResult::empty(),
5040                            affected_rows: 0,
5041                            statement_type: "select",
5042                        });
5043                    }
5044                };
5045                Ok(RuntimeQueryResult {
5046                    query: query.to_string(),
5047                    mode,
5048                    statement,
5049                    engine: "runtime-join",
5050                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
5051                    affected_rows: 0,
5052                    statement_type: "select",
5053                })
5054            }
5055            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
5056                query: query.to_string(),
5057                mode,
5058                statement,
5059                engine: "runtime-vector",
5060                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
5061                affected_rows: 0,
5062                statement_type: "select",
5063            }),
5064            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
5065                query: query.to_string(),
5066                mode,
5067                statement,
5068                engine: "runtime-hybrid",
5069                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
5070                affected_rows: 0,
5071                statement_type: "select",
5072            }),
5073            // DML execution
5074            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
5075                Err(RedDBError::Query(
5076                    super::red_schema::READ_ONLY_ERROR.to_string(),
5077                ))
5078            }
5079            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
5080                Err(RedDBError::Query(
5081                    super::red_schema::READ_ONLY_ERROR.to_string(),
5082                ))
5083            }
5084            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
5085                Err(RedDBError::Query(
5086                    super::red_schema::READ_ONLY_ERROR.to_string(),
5087                ))
5088            }
5089            QueryExpr::Insert(ref insert) => self
5090                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
5091                    self.execute_insert(query, insert)
5092                }),
5093            QueryExpr::Update(ref update) => self
5094                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
5095                    self.execute_update(query, update)
5096                }),
5097            QueryExpr::Delete(ref delete) => self
5098                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
5099                    self.execute_delete(query, delete)
5100                }),
5101            // DDL execution
5102            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
5103            QueryExpr::CreateCollection(ref create) => {
5104                self.execute_create_collection(query, create)
5105            }
5106            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
5107            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
5108            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
5109            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
5110            QueryExpr::DropDocument(ref drop_document) => {
5111                self.execute_drop_document(query, drop_document)
5112            }
5113            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
5114            QueryExpr::DropCollection(ref drop_collection) => {
5115                self.execute_drop_collection(query, drop_collection)
5116            }
5117            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
5118            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
5119            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
5120            // Graph analytics commands
5121            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
5122            // Search commands
5123            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
5124            // ASK: RAG query with LLM synthesis
5125            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
5126            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
5127            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
5128            QueryExpr::ProbabilisticCommand(ref cmd) => {
5129                self.execute_probabilistic_command(query, cmd)
5130            }
5131            // Time-series DDL
5132            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
5133            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
5134            // Queue DDL and commands
5135            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
5136            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
5137            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
5138            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
5139            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
5140            QueryExpr::EventsBackfill(ref backfill) => {
5141                self.execute_events_backfill(query, backfill)
5142            }
5143            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
5144                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
5145            ))),
5146            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
5147            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
5148            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
5149            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
5150            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
5151            // SET CONFIG key = value
5152            QueryExpr::SetConfig { ref key, ref value } => {
5153                if key.starts_with("red.secret.") {
5154                    return Err(RedDBError::Query(
5155                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
5156                    ));
5157                }
5158                let store = self.inner.db.store();
5159                let json_val = match value {
5160                    Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
5161                    Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
5162                    Value::Float(n) => crate::serde_json::Value::Number(*n),
5163                    Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
5164                    _ => crate::serde_json::Value::String(value.to_string()),
5165                };
5166                store.set_config_tree(key, &json_val);
5167                update_current_config_value(key, value.clone());
5168                // Config changes can flip runtime behavior mid-session
5169                // (auto_decrypt, auto_encrypt, etc.) — invalidate the
5170                // result cache so subsequent reads re-execute against
5171                // the new config.
5172                self.invalidate_result_cache();
5173                Ok(RuntimeQueryResult::ok_message(
5174                    query.to_string(),
5175                    &format!("config set: {key}"),
5176                    "set",
5177                ))
5178            }
5179            // SET SECRET key = value
5180            QueryExpr::SetSecret { ref key, ref value } => {
5181                if key.starts_with("red.config.") {
5182                    return Err(RedDBError::Query(
5183                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
5184                    ));
5185                }
5186                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5187                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
5188                })?;
5189                if matches!(value, Value::Null) {
5190                    auth_store
5191                        .vault_kv_try_delete(key)
5192                        .map_err(|err| RedDBError::Query(err.to_string()))?;
5193                    update_current_secret_value(key, None);
5194                    self.invalidate_result_cache();
5195                    return Ok(RuntimeQueryResult::ok_message(
5196                        query.to_string(),
5197                        &format!("secret deleted: {key}"),
5198                        "delete_secret",
5199                    ));
5200                }
5201                let value = secret_sql_value_to_string(value)?;
5202                auth_store
5203                    .vault_kv_try_set(key.clone(), value.clone())
5204                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5205                update_current_secret_value(key, Some(value));
5206                self.invalidate_result_cache();
5207                Ok(RuntimeQueryResult::ok_message(
5208                    query.to_string(),
5209                    &format!("secret set: {key}"),
5210                    "set_secret",
5211                ))
5212            }
5213            // DELETE SECRET key
5214            QueryExpr::DeleteSecret { ref key } => {
5215                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5216                    RedDBError::Query(
5217                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
5218                    )
5219                })?;
5220                let deleted = auth_store
5221                    .vault_kv_try_delete(key)
5222                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5223                if deleted {
5224                    update_current_secret_value(key, None);
5225                }
5226                self.invalidate_result_cache();
5227                Ok(RuntimeQueryResult::ok_message(
5228                    query.to_string(),
5229                    &format!("secret deleted: {key}"),
5230                    if deleted {
5231                        "delete_secret"
5232                    } else {
5233                        "delete_secret_not_found"
5234                    },
5235                ))
5236            }
5237            // SHOW SECRET[S] [prefix]
5238            QueryExpr::ShowSecrets { ref prefix } => {
5239                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5240                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
5241                })?;
5242                if !auth_store.is_vault_backed() {
5243                    return Err(RedDBError::Query(
5244                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
5245                    ));
5246                }
5247                let mut keys = auth_store.vault_kv_keys();
5248                keys.sort();
5249                let mut result = UnifiedResult::with_columns(vec![
5250                    "key".into(),
5251                    "value".into(),
5252                    "status".into(),
5253                ]);
5254                for key in keys {
5255                    if let Some(ref pfx) = prefix {
5256                        if !key.starts_with(pfx) {
5257                            continue;
5258                        }
5259                    }
5260                    let mut record = UnifiedRecord::new();
5261                    record.set("key", Value::text(key));
5262                    record.set("value", Value::text("***"));
5263                    record.set("status", Value::text("active"));
5264                    result.push(record);
5265                }
5266                Ok(RuntimeQueryResult {
5267                    query: query.to_string(),
5268                    mode,
5269                    statement: "show_secrets",
5270                    engine: "runtime-secret",
5271                    result,
5272                    affected_rows: 0,
5273                    statement_type: "select",
5274                })
5275            }
5276            // SHOW CONFIG [prefix]
5277            QueryExpr::ShowConfig { ref prefix } => {
5278                let store = self.inner.db.store();
5279                let all_collections = store.list_collections();
5280                if !all_collections.contains(&"red_config".to_string()) {
5281                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5282                    return Ok(RuntimeQueryResult {
5283                        query: query.to_string(),
5284                        mode,
5285                        statement: "show_config",
5286                        engine: "runtime-config",
5287                        result,
5288                        affected_rows: 0,
5289                        statement_type: "select",
5290                    });
5291                }
5292                let manager = store
5293                    .get_collection("red_config")
5294                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
5295                let entities = manager.query_all(|_| true);
5296                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
5297                for entity in entities {
5298                    if let EntityData::Row(ref row) = entity.data {
5299                        if let Some(ref named) = row.named {
5300                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
5301                            let val = named.get("value").cloned().unwrap_or(Value::Null);
5302                            let key_str = match &key_val {
5303                                Value::Text(s) => s.as_ref(),
5304                                _ => continue,
5305                            };
5306                            if let Some(ref pfx) = prefix {
5307                                if !key_str.starts_with(pfx.as_str()) {
5308                                    continue;
5309                                }
5310                            }
5311                            let entity_id = entity.id.raw();
5312                            match latest.get(key_str) {
5313                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
5314                                _ => {
5315                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
5316                                }
5317                            }
5318                        }
5319                    }
5320                }
5321                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5322                for (_, key_val, val) in latest.into_values() {
5323                    let mut record = UnifiedRecord::new();
5324                    record.set("key", key_val);
5325                    record.set("value", val);
5326                    result.push(record);
5327                }
5328                Ok(RuntimeQueryResult {
5329                    query: query.to_string(),
5330                    mode,
5331                    statement: "show_config",
5332                    engine: "runtime-config",
5333                    result,
5334                    affected_rows: 0,
5335                    statement_type: "select",
5336                })
5337            }
5338            // Session-local multi-tenancy handle (Phase 2.5.3).
5339            //
5340            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
5341            // the thread-local; SHOW TENANT returns it. Paired with the
5342            // CURRENT_TENANT() scalar for use in RLS policies.
5343            QueryExpr::SetTenant(ref value) => {
5344                match value {
5345                    Some(id) => set_current_tenant(id.clone()),
5346                    None => clear_current_tenant(),
5347                }
5348                Ok(RuntimeQueryResult::ok_message(
5349                    query.to_string(),
5350                    &match value {
5351                        Some(id) => format!("tenant set: {id}"),
5352                        None => "tenant cleared".to_string(),
5353                    },
5354                    "set_tenant",
5355                ))
5356            }
5357            QueryExpr::ShowTenant => {
5358                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
5359                let mut record = UnifiedRecord::new();
5360                record.set(
5361                    "tenant",
5362                    current_tenant().map(Value::text).unwrap_or(Value::Null),
5363                );
5364                result.push(record);
5365                Ok(RuntimeQueryResult {
5366                    query: query.to_string(),
5367                    mode,
5368                    statement: "show_tenant",
5369                    engine: "runtime-tenant",
5370                    result,
5371                    affected_rows: 0,
5372                    statement_type: "select",
5373                })
5374            }
5375            // Transaction control (Phase 2.3 PG parity).
5376            //
5377            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
5378            // the current connection's id. COMMIT/ROLLBACK release it through
5379            // the `SnapshotManager` so future snapshots see the correct set of
5380            // active/aborted transactions.
5381            //
5382            // Tuple stamping (xmin/xmax) and read-path visibility filtering
5383            // land in Phase 2.3.2 — this dispatch only manages the snapshot
5384            // registry. Statements running outside a TxnContext still behave
5385            // as autocommit (xid=0 → visible to every snapshot).
5386            QueryExpr::TransactionControl(ref ctl) => {
5387                use crate::storage::query::ast::TxnControl;
5388                use crate::storage::transaction::snapshot::{TxnContext, Xid};
5389                use crate::storage::transaction::IsolationLevel;
5390
5391                // Phase 2.3 keys transactions by a thread-local connection id.
5392                // The stdio/gRPC paths wire a real per-connection id later;
5393                // for embedded use (one RedDBRuntime per process-ish caller)
5394                // we fall back to a deterministic placeholder.
5395                let conn_id = current_connection_id();
5396
5397                let (kind, msg) = match ctl {
5398                    TxnControl::Begin => {
5399                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5400                        let xid = mgr.begin();
5401                        let snapshot = mgr.snapshot(xid);
5402                        let ctx = TxnContext {
5403                            xid,
5404                            isolation: IsolationLevel::SnapshotIsolation,
5405                            snapshot,
5406                            savepoints: Vec::new(),
5407                            released_sub_xids: Vec::new(),
5408                        };
5409                        self.inner.tx_contexts.write().insert(conn_id, ctx);
5410                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
5411                    }
5412                    TxnControl::Commit => {
5413                        // SET LOCAL TENANT ends with the transaction.
5414                        self.inner.tx_local_tenants.write().remove(&conn_id);
5415                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5416                        match ctx {
5417                            Some(ctx) => {
5418                                let mut own_xids = std::collections::HashSet::new();
5419                                own_xids.insert(ctx.xid);
5420                                for (_, sub) in &ctx.savepoints {
5421                                    own_xids.insert(*sub);
5422                                }
5423                                for sub in &ctx.released_sub_xids {
5424                                    own_xids.insert(*sub);
5425                                }
5426                                if let Err(err) = self.check_table_row_write_conflicts(
5427                                    conn_id,
5428                                    &ctx.snapshot,
5429                                    &own_xids,
5430                                ) {
5431                                    for (_, sub) in &ctx.savepoints {
5432                                        self.inner.snapshot_manager.rollback(*sub);
5433                                    }
5434                                    for sub in &ctx.released_sub_xids {
5435                                        self.inner.snapshot_manager.rollback(*sub);
5436                                    }
5437                                    self.inner.snapshot_manager.rollback(ctx.xid);
5438                                    self.revive_pending_versioned_updates(conn_id);
5439                                    self.revive_pending_tombstones(conn_id);
5440                                    self.discard_pending_kv_watch_events(conn_id);
5441                                    self.discard_pending_store_wal_actions(conn_id);
5442                                    return Err(err);
5443                                }
5444                                self.restore_pending_write_stamps(conn_id);
5445                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
5446                                    for (_, sub) in &ctx.savepoints {
5447                                        self.inner.snapshot_manager.rollback(*sub);
5448                                    }
5449                                    for sub in &ctx.released_sub_xids {
5450                                        self.inner.snapshot_manager.rollback(*sub);
5451                                    }
5452                                    self.inner.snapshot_manager.rollback(ctx.xid);
5453                                    self.revive_pending_versioned_updates(conn_id);
5454                                    self.revive_pending_tombstones(conn_id);
5455                                    self.discard_pending_kv_watch_events(conn_id);
5456                                    return Err(err);
5457                                }
5458                                // Phase 2.3.2e: commit every open sub-xid
5459                                // so they also become visible. Their
5460                                // work is promoted to the parent txn's
5461                                // result exactly like a RELEASE would
5462                                // have done.
5463                                for (_, sub) in &ctx.savepoints {
5464                                    self.inner.snapshot_manager.commit(*sub);
5465                                }
5466                                for sub in &ctx.released_sub_xids {
5467                                    self.inner.snapshot_manager.commit(*sub);
5468                                }
5469                                self.inner.snapshot_manager.commit(ctx.xid);
5470                                self.finalize_pending_versioned_updates(conn_id);
5471                                self.finalize_pending_tombstones(conn_id);
5472                                self.finalize_pending_kv_watch_events(conn_id);
5473                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
5474                            }
5475                            None => (
5476                                "commit",
5477                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
5478                            ),
5479                        }
5480                    }
5481                    TxnControl::Rollback => {
5482                        self.inner.tx_local_tenants.write().remove(&conn_id);
5483                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5484                        match ctx {
5485                            Some(ctx) => {
5486                                // Phase 2.3.2e: abort every open sub-xid
5487                                // too so their writes stay hidden.
5488                                for (_, sub) in &ctx.savepoints {
5489                                    self.inner.snapshot_manager.rollback(*sub);
5490                                }
5491                                for sub in &ctx.released_sub_xids {
5492                                    self.inner.snapshot_manager.rollback(*sub);
5493                                }
5494                                self.inner.snapshot_manager.rollback(ctx.xid);
5495                                // Phase 2.3.2b: tuples that the txn had
5496                                // xmax-stamped become live again — wipe xmax
5497                                // back to 0 so later snapshots see them.
5498                                self.revive_pending_versioned_updates(conn_id);
5499                                self.revive_pending_tombstones(conn_id);
5500                                self.discard_pending_kv_watch_events(conn_id);
5501                                self.discard_pending_store_wal_actions(conn_id);
5502                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
5503                            }
5504                            None => (
5505                                "rollback",
5506                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
5507                            ),
5508                        }
5509                    }
5510                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
5511                    // SAVEPOINT allocates a fresh xid and pushes it
5512                    // onto the per-txn stack so subsequent writes can
5513                    // be selectively rolled back. RELEASE pops without
5514                    // aborting; ROLLBACK TO aborts the sub-xid (and any nested
5515                    // ones), reverts their updates + revives their tombstones.
5516                    TxnControl::Savepoint(name) => {
5517                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5518                        let mut guard = self.inner.tx_contexts.write();
5519                        match guard.get_mut(&conn_id) {
5520                            Some(ctx) => {
5521                                let sub = mgr.begin();
5522                                ctx.savepoints.push((name.clone(), sub));
5523                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
5524                            }
5525                            None => (
5526                                "savepoint",
5527                                "SAVEPOINT outside transaction — no-op".to_string(),
5528                            ),
5529                        }
5530                    }
5531                    TxnControl::ReleaseSavepoint(name) => {
5532                        let mut guard = self.inner.tx_contexts.write();
5533                        match guard.get_mut(&conn_id) {
5534                            Some(ctx) => {
5535                                let pos = ctx
5536                                    .savepoints
5537                                    .iter()
5538                                    .position(|(n, _)| n == name)
5539                                    .ok_or_else(|| {
5540                                        RedDBError::Internal(format!(
5541                                            "savepoint {name} does not exist"
5542                                        ))
5543                                    })?;
5544                                // RELEASE pops the named savepoint and
5545                                // any nested ones. Their sub-xids move
5546                                // to `released_sub_xids` so they commit
5547                                // (or roll back) alongside the parent
5548                                // xid — PG semantics: released
5549                                // savepoints still contribute their
5550                                // work, but their names are gone.
5551                                let released = ctx.savepoints.len() - pos;
5552                                let popped: Vec<Xid> = ctx
5553                                    .savepoints
5554                                    .split_off(pos)
5555                                    .into_iter()
5556                                    .map(|(_, x)| x)
5557                                    .collect();
5558                                ctx.released_sub_xids.extend(popped);
5559                                (
5560                                    "release_savepoint",
5561                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
5562                                )
5563                            }
5564                            None => (
5565                                "release_savepoint",
5566                                "RELEASE outside transaction — no-op".to_string(),
5567                            ),
5568                        }
5569                    }
5570                    TxnControl::RollbackToSavepoint(name) => {
5571                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5572                        // Splice out the savepoint + nested ones under
5573                        // a narrow lock, then run the snapshot-manager
5574                        // + tombstone side-effects without the tx map
5575                        // held so nothing re-enters.
5576                        let drop_result: Option<(Xid, Vec<Xid>)> = {
5577                            let mut guard = self.inner.tx_contexts.write();
5578                            if let Some(ctx) = guard.get_mut(&conn_id) {
5579                                let pos = ctx
5580                                    .savepoints
5581                                    .iter()
5582                                    .position(|(n, _)| n == name)
5583                                    .ok_or_else(|| {
5584                                        RedDBError::Internal(format!(
5585                                            "savepoint {name} does not exist"
5586                                        ))
5587                                    })?;
5588                                let savepoint_xid = ctx.savepoints[pos].1;
5589                                let aborted: Vec<Xid> = ctx
5590                                    .savepoints
5591                                    .split_off(pos)
5592                                    .into_iter()
5593                                    .map(|(_, x)| x)
5594                                    .collect();
5595                                Some((savepoint_xid, aborted))
5596                            } else {
5597                                None
5598                            }
5599                        };
5600
5601                        match drop_result {
5602                            Some((savepoint_xid, aborted)) => {
5603                                for x in &aborted {
5604                                    mgr.rollback(*x);
5605                                }
5606                                let reverted_updates =
5607                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
5608                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
5609                                (
5610                                    "rollback_to_savepoint",
5611                                    format!(
5612                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
5613                                        aborted.len(),
5614                                    ),
5615                                )
5616                            }
5617                            None => (
5618                                "rollback_to_savepoint",
5619                                "ROLLBACK TO outside transaction — no-op".to_string(),
5620                            ),
5621                        }
5622                    }
5623                };
5624                Ok(RuntimeQueryResult::ok_message(
5625                    query.to_string(),
5626                    &msg,
5627                    kind,
5628                ))
5629            }
5630            // Schema + Sequence DDL (Phase 1.3 PG parity).
5631            //
5632            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5633            // just registers the name in `red_config` under `schema.{name}`.
5634            // Table lookups still happen by collection name; clients using
5635            // `schema.table` qualified names collapse to collection `schema.table`.
5636            //
5637            // Sequences persist a 64-bit counter + metadata (start, increment)
5638            // in `red_config` under `sequence.{name}.*`. Scalar callers
5639            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
5640            // once we have a proper mutating-function dispatch path; for now the
5641            // DDL just establishes the catalog entry so clients don't error.
5642            QueryExpr::CreateSchema(ref q) => {
5643                let store = self.inner.db.store();
5644                let key = format!("schema.{}", q.name);
5645                if store.get_config(&key).is_some() {
5646                    if q.if_not_exists {
5647                        return Ok(RuntimeQueryResult::ok_message(
5648                            query.to_string(),
5649                            &format!("schema {} already exists — skipped", q.name),
5650                            "create_schema",
5651                        ));
5652                    }
5653                    return Err(RedDBError::Internal(format!(
5654                        "schema {} already exists",
5655                        q.name
5656                    )));
5657                }
5658                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5659                Ok(RuntimeQueryResult::ok_message(
5660                    query.to_string(),
5661                    &format!("schema {} created", q.name),
5662                    "create_schema",
5663                ))
5664            }
5665            QueryExpr::DropSchema(ref q) => {
5666                let store = self.inner.db.store();
5667                let key = format!("schema.{}", q.name);
5668                let existed = store.get_config(&key).is_some();
5669                if !existed && !q.if_exists {
5670                    return Err(RedDBError::Internal(format!(
5671                        "schema {} does not exist",
5672                        q.name
5673                    )));
5674                }
5675                // Remove the marker from red_config by setting it to null.
5676                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5677                let suffix = if q.cascade {
5678                    " (CASCADE accepted — tables untouched)"
5679                } else {
5680                    ""
5681                };
5682                Ok(RuntimeQueryResult::ok_message(
5683                    query.to_string(),
5684                    &format!("schema {} dropped{}", q.name, suffix),
5685                    "drop_schema",
5686                ))
5687            }
5688            QueryExpr::CreateSequence(ref q) => {
5689                let store = self.inner.db.store();
5690                let base = format!("sequence.{}", q.name);
5691                let start_key = format!("{base}.start");
5692                let incr_key = format!("{base}.increment");
5693                let curr_key = format!("{base}.current");
5694                if store.get_config(&start_key).is_some() {
5695                    if q.if_not_exists {
5696                        return Ok(RuntimeQueryResult::ok_message(
5697                            query.to_string(),
5698                            &format!("sequence {} already exists — skipped", q.name),
5699                            "create_sequence",
5700                        ));
5701                    }
5702                    return Err(RedDBError::Internal(format!(
5703                        "sequence {} already exists",
5704                        q.name
5705                    )));
5706                }
5707                // Persist start + increment, and set current so the first
5708                // nextval returns `start`.
5709                let initial_current = q.start - q.increment;
5710                store.set_config_tree(
5711                    &start_key,
5712                    &crate::serde_json::Value::Number(q.start as f64),
5713                );
5714                store.set_config_tree(
5715                    &incr_key,
5716                    &crate::serde_json::Value::Number(q.increment as f64),
5717                );
5718                store.set_config_tree(
5719                    &curr_key,
5720                    &crate::serde_json::Value::Number(initial_current as f64),
5721                );
5722                Ok(RuntimeQueryResult::ok_message(
5723                    query.to_string(),
5724                    &format!(
5725                        "sequence {} created (start={}, increment={})",
5726                        q.name, q.start, q.increment
5727                    ),
5728                    "create_sequence",
5729                ))
5730            }
5731            QueryExpr::DropSequence(ref q) => {
5732                let store = self.inner.db.store();
5733                let base = format!("sequence.{}", q.name);
5734                let existed = store.get_config(&format!("{base}.start")).is_some();
5735                if !existed && !q.if_exists {
5736                    return Err(RedDBError::Internal(format!(
5737                        "sequence {} does not exist",
5738                        q.name
5739                    )));
5740                }
5741                for k in ["start", "increment", "current"] {
5742                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5743                }
5744                Ok(RuntimeQueryResult::ok_message(
5745                    query.to_string(),
5746                    &format!("sequence {} dropped", q.name),
5747                    "drop_sequence",
5748                ))
5749            }
5750            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5751            //
5752            // Every view definition is stored in-memory on RuntimeInner; plain
5753            // views are not persisted. SELECTs that reference a plain view
5754            // substitute the stored `QueryExpr` via `resolve_view_reference`
5755            // during planning (same entry point used by table-name resolution).
5756            //
5757            // Materialized views additionally get a `MaterializedViewCache` slot (a
5758            // REFRESH repopulates it), a persisted descriptor and a backing collection.
5759            QueryExpr::CreateView(ref q) => {
5760                let mut views = self.inner.views.write();
5761                if views.contains_key(&q.name) && !q.or_replace {
5762                    if q.if_not_exists {
5763                        return Ok(RuntimeQueryResult::ok_message(
5764                            query.to_string(),
5765                            &format!("view {} already exists — skipped", q.name),
5766                            "create_view",
5767                        ));
5768                    }
5769                    return Err(RedDBError::Internal(format!(
5770                        "view {} already exists",
5771                        q.name
5772                    )));
5773                }
5774                views.insert(q.name.clone(), Arc::new(q.clone()));
5775                drop(views);
5776
5777                // Materialized view: register cache slot (data is empty until REFRESH).
5778                if q.materialized {
5779                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5780                    let refresh = match q.refresh_every_ms {
5781                        Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
5782                        None => RefreshPolicy::Manual,
5783                    };
5784                    let dependencies = collect_table_refs(&q.query);
5785                    let def = MaterializedViewDef {
5786                        name: q.name.clone(),
5787                        query: format!("<parsed view {}>", q.name),
5788                        dependencies: dependencies.clone(),
5789                        refresh,
5790                        retention_duration_ms: q.retention_duration_ms,
5791                    };
5792                    self.inner.materialized_views.write().register(def);
5793
5794                    // Issue #593 slice 9a — persist the descriptor to
5795                    // the system catalog so the definition survives a
5796                    // restart. Upsert semantics (delete-then-insert by
5797                    // name) keep the catalog free of duplicate rows
5798                    // across `CREATE OR REPLACE` churn.
5799                    let descriptor =
5800                        crate::runtime::continuous_materialized_view::MaterializedViewDescriptor {
5801                            name: q.name.clone(),
5802                            source_sql: query.to_string(),
5803                            source_collections: dependencies,
5804                            refresh_every_ms: q.refresh_every_ms,
5805                            retention_duration_ms: q.retention_duration_ms,
5806                        };
5807                    let store = self.inner.db.store();
5808                    crate::runtime::continuous_materialized_view::persist_descriptor(
5809                        store.as_ref(),
5810                        &descriptor,
5811                    )?;
5812
5813                    // Issue #594 slice 9b — provision a Table-shaped
5814                    // backing collection named after the view. The
5815                    // rewriter skips materialized views (see
5816                    // `rewrite_view_refs_inner`) so `SELECT FROM v`
5817                    // resolves to this collection directly. Empty
5818                    // until REFRESH wires through it in 9c.
5819                    self.ensure_materialized_view_backing(&q.name)?;
5820                }
5821                // Plan cache may have cached a plan that didn't know about this
5822                // view — invalidate so future references pick up the new binding.
5823                // Result cache gets flushed too: OR REPLACE must not serve a
5824                // prior execution of the obsolete body.
5825                self.invalidate_plan_cache();
5826                self.invalidate_result_cache();
5827
5828                Ok(RuntimeQueryResult::ok_message(
5829                    query.to_string(),
5830                    &format!(
5831                        "{}view {} created",
5832                        if q.materialized { "materialized " } else { "" },
5833                        q.name
5834                    ),
5835                    "create_view",
5836                ))
5837            }
5838            QueryExpr::DropView(ref q) => {
5839                let mut views = self.inner.views.write();
5840                let removed = views.remove(&q.name);
5841                let existed = removed.is_some();
5842                let removed_materialized =
5843                    removed.as_ref().map(|v| v.materialized).unwrap_or(false);
5844                drop(views);
5845                if q.materialized || existed {
5846                    // Try the materialized cache too — silent if absent.
5847                    self.inner.materialized_views.write().remove(&q.name);
5848                    // Issue #593 slice 9a — remove any persisted
5849                    // catalog row. Idempotent: a no-op when the view
5850                    // was never materialized (no row was ever written).
5851                    let store = self.inner.db.store();
5852                    crate::runtime::continuous_materialized_view::remove_by_name(
5853                        store.as_ref(),
5854                        &q.name,
5855                    )?;
5856                }
5857                // Issue #594 slice 9b — drop the backing collection
5858                // that was provisioned at CREATE time. Only mat views
5859                // ever had one; regular views never did.
5860                if removed_materialized || q.materialized {
5861                    self.drop_materialized_view_backing(&q.name)?;
5862                }
5863                // Drop any plan / result cache entries that baked the
5864                // view body into their QueryExpr.
5865                self.invalidate_plan_cache();
5866                self.invalidate_result_cache();
5867                if !existed && !q.if_exists {
5868                    return Err(RedDBError::Internal(format!(
5869                        "view {} does not exist",
5870                        q.name
5871                    )));
5872                }
5873                self.invalidate_plan_cache();
5874                Ok(RuntimeQueryResult::ok_message(
5875                    query.to_string(),
5876                    &format!("view {} dropped", q.name),
5877                    "drop_view",
5878                ))
5879            }
5880            QueryExpr::RefreshMaterializedView(ref q) => {
5881                // Look up the view definition, execute its query, refresh the backing
5882                // collection, and stash the serialized result in the materialized cache.
5883                let view = {
5884                    let views = self.inner.views.read();
5885                    views.get(&q.name).cloned()
5886                };
5887                let view = match view {
5888                    Some(v) => v,
5889                    None => {
5890                        return Err(RedDBError::Internal(format!(
5891                            "view {} does not exist",
5892                            q.name
5893                        )))
5894                    }
5895                };
5896                if !view.materialized {
5897                    return Err(RedDBError::Internal(format!(
5898                        "view {} is not materialized — REFRESH requires \
5899                         CREATE MATERIALIZED VIEW",
5900                        q.name
5901                    )));
5902                }
5903                // Execute the underlying query fresh.
5904                let started = std::time::Instant::now();
5905                let now_ms = std::time::SystemTime::now()
5906                    .duration_since(std::time::UNIX_EPOCH)
5907                    .map(|d| d.as_millis() as u64)
5908                    .unwrap_or(0);
5909                match self.execute_query_expr((*view.query).clone()) {
5910                    Ok(inner_result) => {
5911                        // Issue #595 slice 9c — atomically replace the
5912                        // backing collection's contents under a single
5913                        // WAL group. Concurrent SELECT from the view
5914                        // sees either the prior or new contents, never
5915                        // partial. A crash before the WAL commit lands
5916                        // leaves the prior contents intact on recovery.
5917                        let entities =
5918                            view_records_to_entities(&q.name, &inner_result.result.records);
5919                        let row_count = entities.len() as u64;
5920                        let store = self.inner.db.store();
5921                        let serialized_records = match store.refresh_collection(&q.name, entities) {
5922                            Ok(records) => records,
5923                            Err(err) => {
5924                                let duration_ms = started.elapsed().as_millis() as u64;
5925                                let msg = err.to_string();
5926                                self.inner
5927                                    .materialized_views
5928                                    .write()
5929                                    .record_refresh_failure(
5930                                        &q.name,
5931                                        msg.clone(),
5932                                        duration_ms,
5933                                        now_ms,
5934                                    );
5935                                return Err(RedDBError::Internal(format!(
5936                                    "REFRESH MATERIALIZED VIEW {}: {msg}",
5937                                    q.name
5938                                )));
5939                            }
5940                        };
5941
5942                        // Issue #596 slice 9d — emit a Refresh
5943                        // ChangeRecord into the logical-WAL spool so
5944                        // replicas deterministically replay the same
5945                        // backing-collection contents via
5946                        // `LogicalChangeApplier::apply_record`.
5947                        if let Some(ref primary) = self.inner.db.replication {
5948                            let lsn = self.inner.cdc.emit(
5949                                crate::replication::cdc::ChangeOperation::Refresh,
5950                                &q.name,
5951                                0,
5952                                "refresh",
5953                            );
5954                            self.invalidate_result_cache_for_table(&q.name);
5955                            let timestamp = std::time::SystemTime::now()
5956                                .duration_since(std::time::UNIX_EPOCH)
5957                                .unwrap_or_default()
5958                                .as_millis() as u64;
5959                            let record = ChangeRecord::for_refresh(
5960                                lsn,
5961                                timestamp,
5962                                q.name.clone(),
5963                                serialized_records,
5964                            );
5965                            let encoded = record.encode();
5966                            primary.wal_buffer.append(record.lsn, encoded.clone());
5967                            if let Some(spool) = &primary.logical_wal_spool {
5968                                let _ = spool.append(record.lsn, &encoded);
5969                            }
5970                        }
5971
5972                        let duration_ms = started.elapsed().as_millis() as u64;
5973                        let serialized = format!("{:?}", inner_result.result);
5974                        self.inner
5975                            .materialized_views
5976                            .write()
5977                            .record_refresh_success(
5978                                &q.name,
5979                                serialized.into_bytes(),
5980                                row_count,
5981                                duration_ms,
5982                                now_ms,
5983                            );
5984                        // SELECT FROM v now reads through the rewriter
5985                        // skip into the backing collection — drop the
5986                        // result cache so prior empty-backing reads
5987                        // don't shadow the new contents.
5988                        self.invalidate_result_cache();
5989                        Ok(RuntimeQueryResult::ok_message(
5990                            query.to_string(),
5991                            &format!("materialized view {} refreshed", q.name),
5992                            "refresh_materialized_view",
5993                        ))
5994                    }
5995                    Err(err) => {
5996                        let duration_ms = started.elapsed().as_millis() as u64;
5997                        let msg = err.to_string();
5998                        self.inner
5999                            .materialized_views
6000                            .write()
6001                            .record_refresh_failure(&q.name, msg.clone(), duration_ms, now_ms);
6002                        Err(err)
6003                    }
6004                }
6005            }
6006            // Row Level Security (Phase 2.5 PG parity).
6007            //
6008            // Policies live in an in-memory registry keyed by (table, name).
6009            // Enforcement (AND-ing the policy's USING clause into every
6010            // query's WHERE for the table) arrives in Phase 2.5.2 via the
6011            // filter compiler; this dispatch only manages the catalog.
6012            QueryExpr::CreatePolicy(ref q) => {
6013                let key = (q.table.clone(), q.name.clone());
6014                self.inner
6015                    .rls_policies
6016                    .write()
6017                    .insert(key, Arc::new(q.clone()));
6018                self.invalidate_plan_cache();
6019                // Issue #120 — surface policy names in the
6020                // schema-vocabulary so AskPipeline (#121) can resolve
6021                // a policy reference back to its table.
6022                self.schema_vocabulary_apply(
6023                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
6024                        collection: q.table.clone(),
6025                        policy: q.name.clone(),
6026                    },
6027                );
6028                Ok(RuntimeQueryResult::ok_message(
6029                    query.to_string(),
6030                    &format!("policy {} on {} created", q.name, q.table),
6031                    "create_policy",
6032                ))
6033            }
6034            QueryExpr::DropPolicy(ref q) => {
6035                let removed = self
6036                    .inner
6037                    .rls_policies
6038                    .write()
6039                    .remove(&(q.table.clone(), q.name.clone()))
6040                    .is_some();
6041                if !removed && !q.if_exists {
6042                    return Err(RedDBError::Internal(format!(
6043                        "policy {} on {} does not exist",
6044                        q.name, q.table
6045                    )));
6046                }
6047                self.invalidate_plan_cache();
6048                // Issue #120 — keep the schema-vocabulary policy
6049                // entry in sync.
6050                self.schema_vocabulary_apply(
6051                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
6052                        collection: q.table.clone(),
6053                        policy: q.name.clone(),
6054                    },
6055                );
6056                Ok(RuntimeQueryResult::ok_message(
6057                    query.to_string(),
6058                    &format!("policy {} on {} dropped", q.name, q.table),
6059                    "drop_policy",
6060                ))
6061            }
6062            // Foreign Data Wrappers (Phase 3.2 PG parity).
6063            //
6064            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
6065            // `ForeignTableRegistry`. The read path consults that registry
6066            // before dispatching a SELECT — when the table name matches a
6067            // registered foreign table, we forward the scan to the wrapper
6068            // and skip the normal collection lookup.
6069            //
6070            // Phase 3.2 is in-memory only; persistence across restarts is a
6071            // 3.2.2 follow-up that mirrors the view registry pattern.
6072            QueryExpr::CreateServer(ref q) => {
6073                use crate::storage::fdw::FdwOptions;
6074                let registry = Arc::clone(&self.inner.foreign_tables);
6075                if registry.server(&q.name).is_some() {
6076                    if q.if_not_exists {
6077                        return Ok(RuntimeQueryResult::ok_message(
6078                            query.to_string(),
6079                            &format!("server {} already exists — skipped", q.name),
6080                            "create_server",
6081                        ));
6082                    }
6083                    return Err(RedDBError::Internal(format!(
6084                        "server {} already exists",
6085                        q.name
6086                    )));
6087                }
6088                let mut opts = FdwOptions::new();
6089                for (k, v) in &q.options {
6090                    opts.values.insert(k.clone(), v.clone());
6091                }
6092                registry
6093                    .create_server(&q.name, &q.wrapper, opts)
6094                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
6095                Ok(RuntimeQueryResult::ok_message(
6096                    query.to_string(),
6097                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
6098                    "create_server",
6099                ))
6100            }
6101            QueryExpr::DropServer(ref q) => {
6102                let existed = self.inner.foreign_tables.drop_server(&q.name);
6103                if !existed && !q.if_exists {
6104                    return Err(RedDBError::Internal(format!(
6105                        "server {} does not exist",
6106                        q.name
6107                    )));
6108                }
6109                Ok(RuntimeQueryResult::ok_message(
6110                    query.to_string(),
6111                    &format!(
6112                        "server {} dropped{}",
6113                        q.name,
6114                        if q.cascade { " (cascade)" } else { "" }
6115                    ),
6116                    "drop_server",
6117                ))
6118            }
6119            QueryExpr::CreateForeignTable(ref q) => {
6120                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
6121                let registry = Arc::clone(&self.inner.foreign_tables);
6122                if registry.foreign_table(&q.name).is_some() {
6123                    if q.if_not_exists {
6124                        return Ok(RuntimeQueryResult::ok_message(
6125                            query.to_string(),
6126                            &format!("foreign table {} already exists — skipped", q.name),
6127                            "create_foreign_table",
6128                        ));
6129                    }
6130                    return Err(RedDBError::Internal(format!(
6131                        "foreign table {} already exists",
6132                        q.name
6133                    )));
6134                }
6135                let mut opts = FdwOptions::new();
6136                for (k, v) in &q.options {
6137                    opts.values.insert(k.clone(), v.clone());
6138                }
6139                let columns: Vec<ForeignColumn> = q
6140                    .columns
6141                    .iter()
6142                    .map(|c| ForeignColumn {
6143                        name: c.name.clone(),
6144                        data_type: c.data_type.clone(),
6145                        not_null: c.not_null,
6146                    })
6147                    .collect();
6148                registry
6149                    .create_foreign_table(ForeignTable {
6150                        name: q.name.clone(),
6151                        server_name: q.server.clone(),
6152                        columns,
6153                        options: opts,
6154                    })
6155                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
6156                self.invalidate_plan_cache();
6157                Ok(RuntimeQueryResult::ok_message(
6158                    query.to_string(),
6159                    &format!("foreign table {} created (server {})", q.name, q.server),
6160                    "create_foreign_table",
6161                ))
6162            }
6163            QueryExpr::DropForeignTable(ref q) => {
6164                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
6165                if !existed && !q.if_exists {
6166                    return Err(RedDBError::Internal(format!(
6167                        "foreign table {} does not exist",
6168                        q.name
6169                    )));
6170                }
6171                self.invalidate_plan_cache();
6172                Ok(RuntimeQueryResult::ok_message(
6173                    query.to_string(),
6174                    &format!("foreign table {} dropped", q.name),
6175                    "drop_foreign_table",
6176                ))
6177            }
6178            // COPY table FROM 'path' (Phase 1.5 PG parity).
6179            //
6180            // Stream CSV rows through the shared `CsvImporter`. The collection
6181            // is auto-created on first insert (via `insert_auto`-style path);
6182            // VACUUM/ANALYZE afterwards is up to the caller.
6183            QueryExpr::CopyFrom(ref q) => {
6184                use crate::storage::import::{CsvConfig, CsvImporter};
6185                let store = self.inner.db.store();
6186                let cfg = CsvConfig {
6187                    collection: q.table.clone(),
6188                    has_header: q.has_header,
6189                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
6190                    ..CsvConfig::default()
6191                };
6192                let importer = CsvImporter::new(cfg);
6193                let stats = importer
6194                    .import_file(&q.path, store.as_ref())
6195                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
6196                // Tables are written → invalidate cached plans / result cache.
6197                self.note_table_write(&q.table);
6198                Ok(RuntimeQueryResult::ok_message(
6199                    query.to_string(),
6200                    &format!(
6201                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
6202                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
6203                    ),
6204                    "copy_from",
6205                ))
6206            }
6207            // Maintenance commands (Phase 1.2 PG parity).
6208            //
6209            // - VACUUM [FULL] [table]: refreshes planner stats for the target
6210            //   collection(s) and — when FULL — triggers a full pager persist
6211            //   (flushes dirty pages + fsync). Also invalidates the result cache
6212            //   so subsequent reads re-execute against the freshly compacted
6213            //   storage. RedDB's segment/btree GC runs continuously via the
6214            //   background lifecycle; explicit space reclamation for sealed
6215            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
6216            // - ANALYZE [table]: reruns `analyze_collection` +
6217            //   `persist_table_stats` via `refresh_table_planner_stats` so the
6218            //   planner has fresh histograms, distinct estimates, null counts.
6219            //
6220            // Both commands accept an optional target; omitting the target
6221            // iterates every collection in the store.
6222            QueryExpr::MaintenanceCommand(ref cmd) => {
6223                use crate::storage::query::ast::MaintenanceCommand as Mc;
6224                let store = self.inner.db.store();
6225                let (kind, msg) = match cmd {
6226                    Mc::Analyze { target } => {
6227                        let targets: Vec<String> = match target {
6228                            Some(t) => vec![t.clone()],
6229                            None => store.list_collections(),
6230                        };
6231                        for t in &targets {
6232                            self.refresh_table_planner_stats(t);
6233                        }
6234                        (
6235                            "analyze",
6236                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
6237                        )
6238                    }
6239                    Mc::Vacuum { target, full } => {
6240                        let targets: Vec<String> = match target {
6241                            Some(t) => vec![t.clone()],
6242                            None => store.list_collections(),
6243                        };
6244                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
6245                        let mut vacuum_stats =
6246                            crate::storage::unified::store::MvccVacuumStats::default();
6247                        for t in &targets {
6248                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
6249                                RedDBError::Internal(format!(
6250                                    "VACUUM MVCC history failed for {t}: {e}"
6251                                ))
6252                            })?;
6253                            if stats.reclaimed_versions > 0 {
6254                                self.rebuild_runtime_indexes_for_table(t)?;
6255                            }
6256                            vacuum_stats.add(&stats);
6257                        }
6258                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
6259                        // Stats refresh covers every target (same as ANALYZE).
6260                        for t in &targets {
6261                            self.refresh_table_planner_stats(t);
6262                        }
6263                        // FULL forces a pager persist (dirty-page flush + fsync).
6264                        // Regular VACUUM relies on the background writer / segment
6265                        // lifecycle so the command is non-blocking.
6266                        let persisted = if *full {
6267                            match store.persist() {
6268                                Ok(()) => true,
6269                                Err(e) => {
6270                                    return Err(RedDBError::Internal(format!(
6271                                        "VACUUM FULL persist failed: {e:?}"
6272                                    )));
6273                                }
6274                            }
6275                        } else {
6276                            false
6277                        };
6278                        // Result cache depended on pre-vacuum state.
6279                        self.invalidate_result_cache();
6280                        (
6281                            "vacuum",
6282                            format!(
6283                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
6284                                if *full { " FULL" } else { "" },
6285                                targets.len(),
6286                                vacuum_stats.scanned_versions,
6287                                vacuum_stats.retained_versions,
6288                                vacuum_stats.reclaimed_versions,
6289                                vacuum_stats.retained_history_versions,
6290                                vacuum_stats.reclaimed_history_versions,
6291                                vacuum_stats.retained_tombstones,
6292                                vacuum_stats.reclaimed_tombstones,
6293                                if persisted {
6294                                    " (pages flushed to disk)"
6295                                } else {
6296                                    ""
6297                                }
6298                            ),
6299                        )
6300                    }
6301                };
6302                Ok(RuntimeQueryResult::ok_message(
6303                    query.to_string(),
6304                    &msg,
6305                    kind,
6306                ))
6307            }
6308            // GRANT / REVOKE / ALTER USER (RBAC milestone).
6309            //
6310            // These hit the AuthStore directly. The privilege-check
6311            // gate at the top of `execute_query_expr` already decided
6312            // whether the caller may even run the statement; here we
6313            // just translate the AST into AuthStore calls.
6314            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
6315            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
6316            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
6317            QueryExpr::CreateIamPolicy { ref id, ref json } => {
6318                self.execute_create_iam_policy(query, id, json)
6319            }
6320            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
6321            QueryExpr::AttachPolicy {
6322                ref policy_id,
6323                ref principal,
6324            } => self.execute_attach_policy(query, policy_id, principal),
6325            QueryExpr::DetachPolicy {
6326                ref policy_id,
6327                ref principal,
6328            } => self.execute_detach_policy(query, policy_id, principal),
6329            QueryExpr::ShowPolicies { ref filter } => {
6330                self.execute_show_policies(query, filter.as_ref())
6331            }
6332            QueryExpr::ShowEffectivePermissions {
6333                ref user,
6334                ref resource,
6335            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
6336            QueryExpr::SimulatePolicy {
6337                ref user,
6338                ref action,
6339                ref resource,
6340            } => self.execute_simulate_policy(query, user, action, resource),
6341            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
6342            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
6343            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
6344            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
6345        };
6346
6347        // Decrypt Value::Secret columns in-place before caching, so
6348        // cached results match the post-decrypt shape and repeat
6349        // queries skip the per-row AES-GCM pass.
6350        let mut query_result = query_result;
6351        if let Ok(ref mut result) = query_result {
6352            if result.statement_type == "select" {
6353                self.apply_secret_decryption(result);
6354            }
6355        }
6356
6357        // Cache SELECT results for 30s.
6358        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
6359        // Large multi-row results (range scans, filtered scans) are rarely
6360        // repeated with the same literal values so the cache hit rate is near
6361        // zero while the clone cost (100 records × ~16 fields each) is high.
6362        // Aggregations (1 row) and point lookups (1 row) still benefit.
6363        if let Ok(ref result) = query_result {
6364            frame.write_result_cache(self, result, result_cache_scopes);
6365        }
6366
6367        query_result
6368    }
6369
6370    /// Snapshot of every registered materialized view's runtime
6371    /// state — feeds the `red.materialized_views` virtual table.
6372    /// Issue #583 slice 10.
6373    pub fn materialized_view_metadata(
6374        &self,
6375    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
6376        // Issue #595 slice 9c — `current_row_count` is now scraped
6377        // live from the backing collection rather than read from the
6378        // cache slot. Mirrors the slice-10 invariant on
6379        // `queue_pending_gauge` in #527: the live store is the source
6380        // of truth, the cache slot only carries last-refresh telemetry
6381        // (timing, error, refresh cadence).
6382        let store = self.inner.db.store();
6383        let mut entries = self.inner.materialized_views.read().metadata();
6384        for entry in &mut entries {
6385            if let Some(manager) = store.get_collection(&entry.name) {
6386                entry.current_row_count = manager.count() as u64;
6387            }
6388        }
6389        entries
6390    }
6391
    // NOTE: the paragraph below documents `refresh_due_materialized_views`
    // (defined after `sweep_retention_tick`), not the retention snapshot
    // accessor that follows. It is kept as a plain comment so it does not
    // leak into that accessor's rustdoc.
    //
    // Drive scheduled refreshes for materialized views with a
    // `REFRESH EVERY <duration>` clause. Called from the background
    // scheduler thread (and from unit tests with a fake clock via
    // `claim_due_at`). Each invocation atomically claims the set of
    // due views (so two concurrent ticks never double-fire the same
    // view) and runs each refresh through the standard execution
    // path — failures are captured in `last_error` and the prior
    // content stays intact. Issue #583 slice 10.
6400    /// Snapshot of every tracked retention sweeper state — feeds the
6401    /// three extra columns on `red.retention`. Issue #584 slice 12.
6402    pub(crate) fn retention_sweeper_snapshot(
6403        &self,
6404    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
6405        self.inner.retention_sweeper.read().snapshot()
6406    }
6407
6408    /// Drive one tick of the retention sweeper. Iterates collections
6409    /// with a retention policy set, physically deletes at most
6410    /// `batch_size` expired rows per collection, and records the
6411    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
6412    /// `red.retention` exposes. Called from the background sweeper
6413    /// thread; safe to invoke directly from tests with a small batch
6414    /// size to drain rows deterministically. Issue #584 slice 12.
6415    ///
6416    /// Deletes are issued as `DELETE FROM <collection> WHERE
6417    /// <ts_column> < <cutoff>` through the standard `execute_query`
6418    /// chokepoint so WAL participation and snapshot guards apply
6419    /// exactly as for a user-issued DELETE — replicas replay the
6420    /// sweeper's deletes via the same WAL stream with no special
6421    /// handling on the replication side.
6422    ///
6423    /// Batching is enforced by tightening the cutoff: if more than
6424    /// `batch_size` rows are expired, the cutoff is dropped to the
6425    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
6426    /// matches roughly `batch_size` rows; the remainder is reported
6427    /// as `current_rows_pending_sweep_estimate` and drained on the
6428    /// next tick.
6429    pub fn sweep_retention_tick(&self, batch_size: usize) {
6430        if batch_size == 0 {
6431            return;
6432        }
6433        let now_ms = std::time::SystemTime::now()
6434            .duration_since(std::time::UNIX_EPOCH)
6435            .map(|d| d.as_millis() as u64)
6436            .unwrap_or(0);
6437
6438        let store = self.inner.db.store();
6439        let collections = store.list_collections();
6440        for name in collections {
6441            let Some(contract) = self.inner.db.collection_contract(&name) else {
6442                continue;
6443            };
6444            let Some(retention_ms) = contract.retention_duration_ms else {
6445                continue;
6446            };
6447            let Some(ts_column) =
6448                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
6449            else {
6450                continue;
6451            };
6452            let Some(manager) = store.get_collection(&name) else {
6453                continue;
6454            };
6455            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
6456
6457            // Single pass: collect expired timestamps. We keep the
6458            // full Vec rather than a bounded heap because the partial
6459            // sort below is the simplest correct way to find the
6460            // batch-th oldest; for the slice's "1000-row default
6461            // batch" target this is bounded enough for production
6462            // operation, and the alternative (in-place heap of size
6463            // batch+1) is a follow-up optimisation.
6464            let mut expired_ts: Vec<i64> = Vec::new();
6465            manager.for_each_entity(|entity| {
6466                let ts = match ts_column.as_str() {
6467                    "created_at" => Some(entity.created_at as i64),
6468                    "updated_at" => Some(entity.updated_at as i64),
6469                    other => entity
6470                        .data
6471                        .as_row()
6472                        .and_then(|row| row.get_field(other))
6473                        .and_then(|v| match v {
6474                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
6475                            crate::storage::schema::Value::Timestamp(t) => {
6476                                Some(t.saturating_mul(1_000))
6477                            }
6478                            crate::storage::schema::Value::BigInt(t) => Some(*t),
6479                            crate::storage::schema::Value::UnsignedInteger(t) => {
6480                                i64::try_from(*t).ok()
6481                            }
6482                            crate::storage::schema::Value::Integer(t) => Some(*t),
6483                            _ => None,
6484                        }),
6485                };
6486                if let Some(t) = ts {
6487                    if t < cutoff {
6488                        expired_ts.push(t);
6489                    }
6490                }
6491                true
6492            });
6493
6494            let total_expired = expired_ts.len() as u64;
6495            if total_expired == 0 {
6496                self.inner
6497                    .retention_sweeper
6498                    .write()
6499                    .record_tick(&name, 0, 0, now_ms);
6500                continue;
6501            }
6502
6503            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
6504                (cutoff, 0u64)
6505            } else {
6506                // Tighten the cutoff to the (batch_size)-th oldest
6507                // expired timestamp + 1 so DELETE matches roughly
6508                // `batch_size` rows.
6509                expired_ts.sort_unstable();
6510                let nth = expired_ts[batch_size - 1];
6511                (
6512                    nth.saturating_add(1),
6513                    total_expired.saturating_sub(batch_size as u64),
6514                )
6515            };
6516
6517            let stmt = format!(
6518                "DELETE FROM {} WHERE {} < {}",
6519                name, ts_column, effective_cutoff
6520            );
6521            let deleted = match self.execute_query(&stmt) {
6522                Ok(r) => r.affected_rows,
6523                Err(_) => 0,
6524            };
6525
6526            self.inner
6527                .retention_sweeper
6528                .write()
6529                .record_tick(&name, deleted, pending, now_ms);
6530        }
6531    }
6532
6533    pub fn refresh_due_materialized_views(&self) {
6534        let due = {
6535            let mut cache = self.inner.materialized_views.write();
6536            cache.claim_due_at(std::time::Instant::now())
6537        };
6538        for name in due {
6539            // Round-trip through `execute_query` (rather than the
6540            // prepared-statement `execute_query_expr` fast path, which
6541            // explicitly rejects DDL/maintenance statements). Failures
6542            // are captured inside the RefreshMaterializedView handler
6543            // via `record_refresh_failure`; the scheduler ignores the
6544            // Result so one bad view doesn't halt the loop.
6545            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
6546            let _ = self.execute_query(&stmt);
6547        }
6548    }
6549
6550    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
6551    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
6552    /// calls pay zero parse + cache overhead.
6553    ///
6554    /// Applies secret decryption on SELECT results, identical to `execute_query`.
6555    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
6556        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
6557        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
6558        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
6559        // whose `tq.table` matches a registered view with the view's
6560        // underlying query. Safe to call even when no views are registered.
6561        let expr = self.rewrite_view_refs(expr);
6562
6563        self.validate_model_operations_before_auth(&expr)?;
6564        // Granular RBAC privilege check. Runs before dispatch so a
6565        // denied caller never reaches storage. Fail-closed: any error
6566        // resolving the action / resource produces PermissionDenied.
6567        if let Err(err) = self.check_query_privilege(&expr) {
6568            return Err(RedDBError::Query(format!("permission denied: {err}")));
6569        }
6570
6571        let statement = query_expr_name(&expr);
6572        let mode = detect_mode(statement);
6573        let query_str = statement;
6574
6575        let result = self.dispatch_expr(expr, query_str, mode)?;
6576        let mut r = result;
6577        if r.statement_type == "select" {
6578            self.apply_secret_decryption(&mut r);
6579        }
6580        Ok(r)
6581    }
6582
6583    pub(super) fn validate_model_operations_before_auth(
6584        &self,
6585        expr: &QueryExpr,
6586    ) -> RedDBResult<()> {
6587        use crate::catalog::CollectionModel;
6588        use crate::runtime::ddl::polymorphic_resolver;
6589        use crate::storage::query::ast::KvCommand;
6590
6591        let system_schema_target = match expr {
6592            QueryExpr::DropTable(q) => Some(q.name.as_str()),
6593            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
6594            QueryExpr::DropVector(q) => Some(q.name.as_str()),
6595            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
6596            QueryExpr::DropKv(q) => Some(q.name.as_str()),
6597            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
6598            QueryExpr::Truncate(q) => Some(q.name.as_str()),
6599            _ => None,
6600        };
6601        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
6602            return Err(RedDBError::Query("system schema is read-only".to_string()));
6603        }
6604
6605        let expected = match expr {
6606            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
6607            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
6608            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
6609            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
6610            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
6611            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
6612            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
6613            QueryExpr::KvCommand(cmd) => {
6614                let (collection, model) = match cmd {
6615                    KvCommand::Put {
6616                        collection, model, ..
6617                    }
6618                    | KvCommand::Get {
6619                        collection, model, ..
6620                    }
6621                    | KvCommand::Incr {
6622                        collection, model, ..
6623                    }
6624                    | KvCommand::Cas {
6625                        collection, model, ..
6626                    }
6627                    | KvCommand::Delete {
6628                        collection, model, ..
6629                    } => (collection.as_str(), *model),
6630                    KvCommand::Rotate { collection, .. }
6631                    | KvCommand::History { collection, .. }
6632                    | KvCommand::List { collection, .. }
6633                    | KvCommand::Purge { collection, .. } => {
6634                        (collection.as_str(), CollectionModel::Vault)
6635                    }
6636                    KvCommand::InvalidateTags { collection, .. } => {
6637                        (collection.as_str(), CollectionModel::Kv)
6638                    }
6639                    KvCommand::Watch {
6640                        collection, model, ..
6641                    } => (collection.as_str(), *model),
6642                    KvCommand::Unseal { collection, .. } => {
6643                        (collection.as_str(), CollectionModel::Vault)
6644                    }
6645                };
6646                Some((collection, model))
6647            }
6648            QueryExpr::ConfigCommand(cmd) => {
6649                self.validate_config_command_before_auth(cmd)?;
6650                None
6651            }
6652            _ => None,
6653        };
6654
6655        let Some((name, expected_model)) = expected else {
6656            return Ok(());
6657        };
6658        let snapshot = self.inner.db.catalog_model_snapshot();
6659        let Some(actual_model) = snapshot
6660            .collections
6661            .iter()
6662            .find(|collection| collection.name == name)
6663            .map(|collection| collection.declared_model.unwrap_or(collection.model))
6664        else {
6665            return Ok(());
6666        };
6667        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
6668    }
6669
6670    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
6671    /// `tq.table` matches a registered view name with the view's stored
6672    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
6673    /// resolves correctly. Pure operation — no side effects.
6674    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
6675        // Fast path: no views registered → return original expression.
6676        if self.inner.views.read().is_empty() {
6677            return expr;
6678        }
6679        self.rewrite_view_refs_inner(expr)
6680    }
6681
6682    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
6683        use crate::storage::query::ast::{Filter, TableSource};
6684        match expr {
6685            QueryExpr::Table(mut tq) => {
6686                // 1. If the TableSource is a subquery, recurse into it so
6687                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
6688                //    The legacy `table` field (set to a synthetic
6689                //    "__subq_NNNN" sentinel) stays as-is so callers that
6690                //    read it keep compiling.
6691                if let Some(TableSource::Subquery(body)) = tq.source.take() {
6692                    tq.source = Some(TableSource::Subquery(Box::new(
6693                        self.rewrite_view_refs_inner(*body),
6694                    )));
6695                    return QueryExpr::Table(tq);
6696                }
6697
6698                // 2. Restore the source field (took it above for match).
6699                // When the source was `None` or `TableSource::Name(_)`, the
6700                // real lookup key is `tq.table` — check the view registry.
6701                let maybe_view = {
6702                    let views = self.inner.views.read();
6703                    views.get(&tq.table).cloned()
6704                };
6705                let Some(view) = maybe_view else {
6706                    return QueryExpr::Table(tq);
6707                };
6708
6709                // Issue #594 slice 9b — materialized views are read
6710                // from their backing collection, not by substituting
6711                // the body. Returning the TableQuery as-is lets the
6712                // normal table-read path resolve `SELECT FROM v`
6713                // against the collection provisioned at CREATE time.
6714                if view.materialized {
6715                    return QueryExpr::Table(tq);
6716                }
6717
6718                // Recurse into the view body — views may reference other
6719                // views. The recursion yields the final QueryExpr we need
6720                // to merge the outer's filter / limit / offset into.
6721                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
6722
6723                // Phase 5: when the body is a Table we merge the outer
6724                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
6725                // views filter recursively. Non-table bodies (Search,
6726                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
6727                // with an outer Table query today — return the body
6728                // verbatim; outer predicates are lost. Full projection
6729                // merge lands in Phase 5.2.
6730                match inner_expr {
6731                    QueryExpr::Table(mut inner_tq) => {
6732                        if let Some(outer_filter) = tq.filter.take() {
6733                            inner_tq.filter = Some(match inner_tq.filter.take() {
6734                                Some(existing) => {
6735                                    Filter::And(Box::new(existing), Box::new(outer_filter))
6736                                }
6737                                None => outer_filter,
6738                            });
6739                            // Keep the `Expr` form in lock-step with the
6740                            // merged `Filter`. The executor prefers
6741                            // `where_expr` and nulls `filter` when it is
6742                            // present (see `execute_query_inner`), so a
6743                            // stacked view whose outer predicate was only
6744                            // merged into `filter` would silently drop that
6745                            // predicate at eval time (#635).
6746                            inner_tq.where_expr = inner_tq
6747                                .filter
6748                                .as_ref()
6749                                .map(crate::storage::query::sql_lowering::filter_to_expr);
6750                        }
6751                        if let Some(outer_limit) = tq.limit {
6752                            inner_tq.limit = Some(match inner_tq.limit {
6753                                Some(existing) => existing.min(outer_limit),
6754                                None => outer_limit,
6755                            });
6756                        }
6757                        if let Some(outer_offset) = tq.offset {
6758                            inner_tq.offset = Some(match inner_tq.offset {
6759                                Some(existing) => existing + outer_offset,
6760                                None => outer_offset,
6761                            });
6762                        }
6763                        QueryExpr::Table(inner_tq)
6764                    }
6765                    other => other,
6766                }
6767            }
6768            QueryExpr::Join(mut jq) => {
6769                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
6770                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
6771                QueryExpr::Join(jq)
6772            }
6773            // Other variants don't carry nested QueryExpr that can reference
6774            // a view by table name. Return as-is.
6775            other => other,
6776        }
6777    }
6778
6779    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
6780    /// Shared by `execute_query` (after parse/cache) and `execute_query_expr`
6781    /// (direct call from prepared-statement handler).
6782    fn authorize_relational_table_select(
6783        &self,
6784        mut table: TableQuery,
6785        frame: &dyn super::statement_frame::ReadFrame,
6786    ) -> RedDBResult<Option<TableQuery>> {
6787        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6788            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6789            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6790            return Ok(Some(table));
6791        }
6792
6793        self.check_table_column_projection_authz(&table, frame)?;
6794
6795        if self.inner.rls_enabled_tables.read().contains(&table.table) {
6796            return Ok(inject_rls_filters(self, frame, table));
6797        }
6798
6799        Ok(Some(table))
6800    }
6801
6802    fn authorize_relational_join_select(
6803        &self,
6804        mut join: JoinQuery,
6805        frame: &dyn super::statement_frame::ReadFrame,
6806    ) -> RedDBResult<Option<JoinQuery>> {
6807        self.check_join_column_projection_authz(&join, frame)?;
6808        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
6809        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
6810        Ok(inject_rls_into_join(self, frame, join))
6811    }
6812
6813    fn authorize_relational_join_child(
6814        &self,
6815        expr: QueryExpr,
6816        frame: &dyn super::statement_frame::ReadFrame,
6817    ) -> RedDBResult<QueryExpr> {
6818        match expr {
6819            QueryExpr::Table(mut table) => {
6820                if let Some(TableSource::Subquery(inner)) = table.source.take() {
6821                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6822                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6823                }
6824                Ok(QueryExpr::Table(table))
6825            }
6826            QueryExpr::Join(join) => self
6827                .authorize_relational_join_select(join, frame)?
6828                .map(QueryExpr::Join)
6829                .ok_or_else(|| {
6830                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6831                }),
6832            other => Ok(other),
6833        }
6834    }
6835
6836    fn authorize_relational_select_expr(
6837        &self,
6838        expr: QueryExpr,
6839        frame: &dyn super::statement_frame::ReadFrame,
6840    ) -> RedDBResult<QueryExpr> {
6841        match expr {
6842            QueryExpr::Table(table) => self
6843                .authorize_relational_table_select(table, frame)?
6844                .map(QueryExpr::Table)
6845                .ok_or_else(|| {
6846                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6847                }),
6848            QueryExpr::Join(join) => self
6849                .authorize_relational_join_select(join, frame)?
6850                .map(QueryExpr::Join)
6851                .ok_or_else(|| {
6852                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6853                }),
6854            other => Ok(other),
6855        }
6856    }
6857
6858    fn check_table_column_projection_authz(
6859        &self,
6860        table: &TableQuery,
6861        frame: &dyn super::statement_frame::ReadFrame,
6862    ) -> RedDBResult<()> {
6863        let Some((username, role)) = frame.identity() else {
6864            return Ok(());
6865        };
6866        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6867            return Ok(());
6868        };
6869
6870        let columns = self.resolved_table_projection_columns(table)?;
6871        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6872        let principal = UserId::from_parts(frame.effective_scope(), username);
6873        let ctx = runtime_iam_context(role, frame.effective_scope());
6874        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6875        if outcome.allowed() {
6876            return Ok(());
6877        }
6878
6879        if let Some(denied) = outcome.first_denied_column() {
6880            return Err(RedDBError::Query(format!(
6881                "permission denied: principal=`{username}` cannot select column `{}`",
6882                denied.resource.name
6883            )));
6884        }
6885        Err(RedDBError::Query(format!(
6886            "permission denied: principal=`{username}` cannot select table `{}`",
6887            table.table
6888        )))
6889    }
6890
6891    fn check_join_column_projection_authz(
6892        &self,
6893        join: &JoinQuery,
6894        frame: &dyn super::statement_frame::ReadFrame,
6895    ) -> RedDBResult<()> {
6896        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6897        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6898        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6899
6900        for (table, columns) in by_table {
6901            let query = TableQuery {
6902                table,
6903                source: None,
6904                alias: None,
6905                select_items: Vec::new(),
6906                columns: columns.into_iter().map(Projection::Column).collect(),
6907                where_expr: None,
6908                filter: None,
6909                group_by_exprs: Vec::new(),
6910                group_by: Vec::new(),
6911                having_expr: None,
6912                having: None,
6913                order_by: Vec::new(),
6914                limit: None,
6915                limit_param: None,
6916                offset: None,
6917                offset_param: None,
6918                expand: None,
6919                as_of: None,
6920                sessionize: None,
6921            };
6922            self.check_table_column_projection_authz(&query, frame)?;
6923        }
6924        Ok(())
6925    }
6926
6927    fn collect_join_projection_columns(
6928        &self,
6929        join: &JoinQuery,
6930        projections: &[Projection],
6931        out: &mut HashMap<String, BTreeSet<String>>,
6932    ) -> RedDBResult<()> {
6933        let left = table_side_context(join.left.as_ref());
6934        let right = table_side_context(join.right.as_ref());
6935
6936        if projections
6937            .iter()
6938            .any(|projection| matches!(projection, Projection::All))
6939        {
6940            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6941                out.entry(side.table.clone())
6942                    .or_default()
6943                    .extend(self.table_all_projection_columns(&side.table)?);
6944            }
6945            return Ok(());
6946        }
6947
6948        for projection in projections {
6949            collect_projection_columns_for_join_side(
6950                projection,
6951                left.as_ref(),
6952                right.as_ref(),
6953                out,
6954            )?;
6955        }
6956        Ok(())
6957    }
6958
6959    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6960        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6961        if projections
6962            .iter()
6963            .any(|projection| matches!(projection, Projection::All))
6964        {
6965            return self.table_all_projection_columns(&table.table);
6966        }
6967
6968        let mut columns = BTreeSet::new();
6969        for projection in &projections {
6970            collect_projection_columns_for_table(
6971                projection,
6972                &table.table,
6973                table.alias.as_deref(),
6974                &mut columns,
6975            );
6976        }
6977        Ok(columns.into_iter().collect())
6978    }
6979
6980    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
6981        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
6982            let columns: Vec<String> = contract
6983                .declared_columns
6984                .iter()
6985                .map(|column| column.name.clone())
6986                .collect();
6987            if !columns.is_empty() {
6988                return Ok(columns);
6989            }
6990        }
6991
6992        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
6993        Ok(records
6994            .first()
6995            .map(|record| {
6996                record
6997                    .column_names()
6998                    .into_iter()
6999                    .map(|column| column.to_string())
7000                    .collect()
7001            })
7002            .unwrap_or_default())
7003    }
7004
7005    fn resolve_table_expr_subqueries(
7006        &self,
7007        mut table: TableQuery,
7008        frame: &dyn super::statement_frame::ReadFrame,
7009    ) -> RedDBResult<TableQuery> {
7010        if let Some(TableSource::Subquery(inner)) = table.source.take() {
7011            let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
7012            table.source = Some(TableSource::Subquery(Box::new(inner)));
7013        }
7014
7015        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
7016        for item in &mut table.select_items {
7017            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
7018                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
7019            }
7020        }
7021        if let Some(where_expr) = table.where_expr.take() {
7022            table.where_expr =
7023                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
7024            table.filter = None;
7025        }
7026        if let Some(having_expr) = table.having_expr.take() {
7027            table.having_expr =
7028                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
7029            table.having = None;
7030        }
7031        for expr in &mut table.group_by_exprs {
7032            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
7033        }
7034        for clause in &mut table.order_by {
7035            if let Some(expr) = clause.expr.take() {
7036                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
7037            }
7038        }
7039        Ok(table)
7040    }
7041
7042    fn resolve_select_expr_subqueries(
7043        &self,
7044        expr: QueryExpr,
7045        frame: &dyn super::statement_frame::ReadFrame,
7046    ) -> RedDBResult<QueryExpr> {
7047        match expr {
7048            QueryExpr::Table(table) => self
7049                .resolve_table_expr_subqueries(table, frame)
7050                .map(QueryExpr::Table),
7051            QueryExpr::Join(mut join) => {
7052                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
7053                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
7054                Ok(QueryExpr::Join(join))
7055            }
7056            other => Ok(other),
7057        }
7058    }
7059
7060    fn resolve_expr_subqueries(
7061        &self,
7062        expr: crate::storage::query::ast::Expr,
7063        outer_scopes: &[String],
7064        frame: &dyn super::statement_frame::ReadFrame,
7065    ) -> RedDBResult<crate::storage::query::ast::Expr> {
7066        use crate::storage::query::ast::Expr;
7067
7068        match expr {
7069            Expr::Subquery { query, span } => {
7070                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
7071                if values.len() > 1 {
7072                    return Err(RedDBError::Query(
7073                        "scalar subquery returned more than one row".to_string(),
7074                    ));
7075                }
7076                Ok(Expr::Literal {
7077                    value: values.into_iter().next().unwrap_or(Value::Null),
7078                    span,
7079                })
7080            }
7081            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
7082                op,
7083                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
7084                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
7085                span,
7086            }),
7087            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
7088                op,
7089                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
7090                span,
7091            }),
7092            Expr::Cast {
7093                inner,
7094                target,
7095                span,
7096            } => Ok(Expr::Cast {
7097                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
7098                target,
7099                span,
7100            }),
7101            Expr::FunctionCall { name, args, span } => {
7102                let args = args
7103                    .into_iter()
7104                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
7105                    .collect::<RedDBResult<Vec<_>>>()?;
7106                Ok(Expr::FunctionCall { name, args, span })
7107            }
7108            Expr::Case {
7109                branches,
7110                else_,
7111                span,
7112            } => {
7113                let branches = branches
7114                    .into_iter()
7115                    .map(|(cond, value)| {
7116                        Ok((
7117                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
7118                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
7119                        ))
7120                    })
7121                    .collect::<RedDBResult<Vec<_>>>()?;
7122                let else_ = else_
7123                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
7124                    .transpose()?
7125                    .map(Box::new);
7126                Ok(Expr::Case {
7127                    branches,
7128                    else_,
7129                    span,
7130                })
7131            }
7132            Expr::IsNull {
7133                operand,
7134                negated,
7135                span,
7136            } => Ok(Expr::IsNull {
7137                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
7138                negated,
7139                span,
7140            }),
7141            Expr::InList {
7142                target,
7143                values,
7144                negated,
7145                span,
7146            } => {
7147                let target =
7148                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
7149                let mut resolved = Vec::new();
7150                for value in values {
7151                    if let Expr::Subquery { query, .. } = value {
7152                        resolved.extend(
7153                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
7154                                .into_iter()
7155                                .map(Expr::lit),
7156                        );
7157                    } else {
7158                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
7159                    }
7160                }
7161                Ok(Expr::InList {
7162                    target,
7163                    values: resolved,
7164                    negated,
7165                    span,
7166                })
7167            }
7168            Expr::Between {
7169                target,
7170                low,
7171                high,
7172                negated,
7173                span,
7174            } => Ok(Expr::Between {
7175                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
7176                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
7177                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
7178                negated,
7179                span,
7180            }),
7181            other => Ok(other),
7182        }
7183    }
7184
7185    fn execute_expr_subquery_values(
7186        &self,
7187        subquery: crate::storage::query::ast::ExprSubquery,
7188        outer_scopes: &[String],
7189        frame: &dyn super::statement_frame::ReadFrame,
7190    ) -> RedDBResult<Vec<Value>> {
7191        let query = *subquery.query;
7192        if query_references_outer_scope(&query, outer_scopes) {
7193            return Err(RedDBError::Query(
7194                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
7195            ));
7196        }
7197        let query = self.rewrite_view_refs(query);
7198        let query = self.resolve_select_expr_subqueries(query, frame)?;
7199        let query = self.authorize_relational_select_expr(query, frame)?;
7200        let result = match query {
7201            QueryExpr::Table(table) => {
7202                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
7203            }
7204            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
7205            other => {
7206                return Err(RedDBError::Query(format!(
7207                    "expression subquery must be a SELECT query, got {}",
7208                    query_expr_name(&other)
7209                )))
7210            }
7211        };
7212        first_column_values(result)
7213    }
7214
7215    fn dispatch_expr(
7216        &self,
7217        expr: QueryExpr,
7218        query_str: &str,
7219        mode: QueryMode,
7220    ) -> RedDBResult<RuntimeQueryResult> {
7221        let statement = query_expr_name(&expr);
7222        match expr {
7223            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
7224                // Graph queries are not cacheable as prepared statements.
7225                Err(RedDBError::Query(
7226                    "graph queries cannot be used as prepared statements".to_string(),
7227                ))
7228            }
7229            QueryExpr::Table(table) => {
7230                let scope = self.ai_scope();
7231                let table = self.resolve_table_expr_subqueries(
7232                    table,
7233                    &scope as &dyn super::statement_frame::ReadFrame,
7234                )?;
7235                if super::red_schema::is_virtual_table(&table.table) {
7236                    return Ok(RuntimeQueryResult {
7237                        query: query_str.to_string(),
7238                        mode,
7239                        statement,
7240                        engine: "runtime-red-schema",
7241                        result: super::red_schema::red_query(
7242                            self,
7243                            &table.table,
7244                            &table,
7245                            &scope as &dyn super::statement_frame::ReadFrame,
7246                        )?,
7247                        affected_rows: 0,
7248                        statement_type: "select",
7249                    });
7250                }
7251                let Some(table_with_rls) = self.authorize_relational_table_select(
7252                    table,
7253                    &scope as &dyn super::statement_frame::ReadFrame,
7254                )?
7255                else {
7256                    return Ok(RuntimeQueryResult {
7257                        query: query_str.to_string(),
7258                        mode,
7259                        statement,
7260                        engine: "runtime-table-rls",
7261                        result: crate::storage::query::unified::UnifiedResult::empty(),
7262                        affected_rows: 0,
7263                        statement_type: "select",
7264                    });
7265                };
7266                Ok(RuntimeQueryResult {
7267                    query: query_str.to_string(),
7268                    mode,
7269                    statement,
7270                    engine: "runtime-table",
7271                    result: execute_runtime_table_query(
7272                        &self.inner.db,
7273                        &table_with_rls,
7274                        Some(&self.inner.index_store),
7275                    )?,
7276                    affected_rows: 0,
7277                    statement_type: "select",
7278                })
7279            }
7280            QueryExpr::Join(join) => {
7281                let scope = self.ai_scope();
7282                let Some(join_with_rls) = self.authorize_relational_join_select(
7283                    join,
7284                    &scope as &dyn super::statement_frame::ReadFrame,
7285                )?
7286                else {
7287                    return Ok(RuntimeQueryResult {
7288                        query: query_str.to_string(),
7289                        mode,
7290                        statement,
7291                        engine: "runtime-join-rls",
7292                        result: crate::storage::query::unified::UnifiedResult::empty(),
7293                        affected_rows: 0,
7294                        statement_type: "select",
7295                    });
7296                };
7297                Ok(RuntimeQueryResult {
7298                    query: query_str.to_string(),
7299                    mode,
7300                    statement,
7301                    engine: "runtime-join",
7302                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
7303                    affected_rows: 0,
7304                    statement_type: "select",
7305                })
7306            }
7307            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
7308                query: query_str.to_string(),
7309                mode,
7310                statement,
7311                engine: "runtime-vector",
7312                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
7313                affected_rows: 0,
7314                statement_type: "select",
7315            }),
7316            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
7317                query: query_str.to_string(),
7318                mode,
7319                statement,
7320                engine: "runtime-hybrid",
7321                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
7322                affected_rows: 0,
7323                statement_type: "select",
7324            }),
7325            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
7326                Err(RedDBError::Query(
7327                    super::red_schema::READ_ONLY_ERROR.to_string(),
7328                ))
7329            }
7330            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
7331                Err(RedDBError::Query(
7332                    super::red_schema::READ_ONLY_ERROR.to_string(),
7333                ))
7334            }
7335            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
7336                Err(RedDBError::Query(
7337                    super::red_schema::READ_ONLY_ERROR.to_string(),
7338                ))
7339            }
7340            QueryExpr::Insert(ref insert) => self
7341                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
7342                    self.execute_insert(query_str, insert)
7343                }),
7344            QueryExpr::Update(ref update) => self
7345                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
7346                    self.execute_update(query_str, update)
7347                }),
7348            QueryExpr::Delete(ref delete) => self
7349                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
7350                    self.execute_delete(query_str, delete)
7351                }),
7352            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
7353            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
7354            _ => Err(RedDBError::Query(format!(
7355                "prepared-statement execution does not support {statement} statements"
7356            ))),
7357        }
7358    }
7359
7360    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
7361    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
7362    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
7363        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
7364        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
7365        let q = query.trim();
7366        if !q.starts_with("SELECT") && !q.starts_with("select") {
7367            return None;
7368        }
7369
7370        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
7371        let where_pos = q
7372            .find("WHERE _entity_id")
7373            .or_else(|| q.find("where _entity_id"))?;
7374        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
7375        let after_eq = after_field.strip_prefix('=')?.trim_start();
7376
7377        // Parse the entity ID number
7378        let id_str = after_eq.trim();
7379        let entity_id: u64 = id_str.parse().ok()?;
7380
7381        // Extract table name: between "FROM " and " WHERE"
7382        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
7383        let table = q[from_pos..where_pos].trim();
7384        if table.is_empty()
7385            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
7386        {
7387            return None; // complex query, fall through
7388        }
7389        let table_name = table.split_whitespace().next()?;
7390
7391        // Direct entity lookup — skips SQL parse, plan cache, result
7392        // cache, view rewriter, RLS gate. Safe because the gating in
7393        // `execute_query` guarantees no scope override / no
7394        // transaction context is active. MVCC visibility is still
7395        // honoured against the current snapshot.
7396        let store = self.inner.db.store();
7397        let entity = store
7398            .get(
7399                table_name,
7400                crate::storage::unified::EntityId::new(entity_id),
7401            )
7402            .filter(entity_visible_under_current_snapshot);
7403
7404        let count = if entity.is_some() { 1u64 } else { 0 };
7405
7406        // Materialize a record so downstream consumers that walk
7407        // `result.records` (embedded runtime API, decrypt pass, CLI)
7408        // see the row. Previously only `pre_serialized_json` was
7409        // filled, which caused those consumers to see zero rows and
7410        // skewed benchmarks.
7411        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
7412            .as_ref()
7413            .and_then(|e| runtime_table_record_from_entity(e.clone()))
7414            .into_iter()
7415            .collect();
7416
7417        let json = match entity {
7418            Some(ref e) => execute_runtime_serialize_single_entity(e),
7419            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
7420                .to_string(),
7421        };
7422
7423        Some(Ok(RuntimeQueryResult {
7424            query: query.to_string(),
7425            mode: crate::storage::query::modes::QueryMode::Sql,
7426            statement: "select",
7427            engine: "fast-entity-lookup",
7428            result: crate::storage::query::unified::UnifiedResult {
7429                columns: Vec::new(),
7430                records,
7431                stats: crate::storage::query::unified::QueryStats {
7432                    rows_scanned: count,
7433                    ..Default::default()
7434                },
7435                pre_serialized_json: Some(json),
7436            },
7437            affected_rows: 0,
7438            statement_type: "select",
7439        }))
7440    }
7441
7442    fn result_cache_backend(&self) -> RuntimeResultCacheBackend {
7443        match self
7444            .config_string(RESULT_CACHE_BACKEND_KEY, RESULT_CACHE_DEFAULT_BACKEND)
7445            .as_str()
7446        {
7447            "blob_cache" => RuntimeResultCacheBackend::BlobCache,
7448            "shadow" => RuntimeResultCacheBackend::Shadow,
7449            _ => RuntimeResultCacheBackend::Legacy,
7450        }
7451    }
7452
7453    pub(super) fn get_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
7454        match self.result_cache_backend() {
7455            RuntimeResultCacheBackend::Legacy => self.get_legacy_result_cache_entry(key),
7456            RuntimeResultCacheBackend::BlobCache => self.get_blob_result_cache_entry(key),
7457            RuntimeResultCacheBackend::Shadow => {
7458                let legacy = self.get_legacy_result_cache_entry(key);
7459                let blob = self.get_blob_result_cache_entry(key);
7460                if let (Some(ref legacy), Some(ref blob)) = (&legacy, &blob) {
7461                    if result_cache_fingerprint(legacy) != result_cache_fingerprint(blob) {
7462                        self.inner
7463                            .result_cache_shadow_divergences
7464                            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
7465                        tracing::warn!(
7466                            key,
7467                            metric = crate::runtime::METRIC_CACHE_SHADOW_DIVERGENCE_TOTAL,
7468                            "result cache shadow backend diverged from legacy"
7469                        );
7470                    }
7471                }
7472                legacy
7473            }
7474        }
7475    }
7476
7477    fn get_legacy_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
7478        let cache = self.inner.result_cache.read();
7479        cache.0.get(key).and_then(|entry| {
7480            if entry.cached_at.elapsed().as_secs() < RESULT_CACHE_TTL_SECS {
7481                Some(entry.result.clone())
7482            } else {
7483                None
7484            }
7485        })
7486    }
7487
7488    fn get_blob_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
7489        let hit = self
7490            .inner
7491            .result_blob_cache
7492            .get(RESULT_CACHE_BLOB_NAMESPACE, key)?;
7493        {
7494            let cache = self.inner.result_blob_entries.read();
7495            if let Some(entry) = cache.0.get(key) {
7496                return Some(entry.result.clone());
7497            }
7498        }
7499
7500        let (result, scopes) = decode_result_cache_payload(hit.value())?;
7501        let mut cache = self.inner.result_blob_entries.write();
7502        let (ref mut map, ref mut order) = *cache;
7503        if !map.contains_key(key) {
7504            order.push_back(key.to_string());
7505        }
7506        map.insert(
7507            key.to_string(),
7508            RuntimeResultCacheEntry {
7509                result: result.clone(),
7510                cached_at: std::time::Instant::now(),
7511                scopes,
7512            },
7513        );
7514        trim_result_cache(map, order);
7515        Some(result)
7516    }
7517
7518    pub(super) fn put_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
7519        match self.result_cache_backend() {
7520            RuntimeResultCacheBackend::Legacy => self.put_legacy_result_cache_entry(key, entry),
7521            RuntimeResultCacheBackend::BlobCache => self.put_blob_result_cache_entry(key, entry),
7522            RuntimeResultCacheBackend::Shadow => {
7523                self.put_legacy_result_cache_entry(key, entry.clone());
7524                self.put_blob_result_cache_entry(key, entry);
7525            }
7526        }
7527    }
7528
7529    fn put_legacy_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
7530        let mut cache = self.inner.result_cache.write();
7531        let (ref mut map, ref mut order) = *cache;
7532        if !map.contains_key(key) {
7533            order.push_back(key.to_string());
7534        }
7535        map.insert(key.to_string(), entry);
7536        trim_result_cache(map, order);
7537    }
7538
7539    fn put_blob_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
7540        let policy = crate::storage::cache::BlobCachePolicy::default()
7541            .ttl_ms(RESULT_CACHE_TTL_SECS * 1000)
7542            .priority(200);
7543        let dependencies = entry.scopes.iter().cloned().collect::<Vec<_>>();
7544        let bytes = encode_result_cache_payload(&entry)
7545            .unwrap_or_else(|| result_cache_fingerprint(&entry.result).into_bytes());
7546        let put = crate::storage::cache::BlobCachePut::new(bytes)
7547            .with_dependencies(dependencies)
7548            .with_policy(policy);
7549        if self
7550            .inner
7551            .result_blob_cache
7552            .put(RESULT_CACHE_BLOB_NAMESPACE, key, put)
7553            .is_err()
7554        {
7555            return;
7556        }
7557
7558        let mut cache = self.inner.result_blob_entries.write();
7559        let (ref mut map, ref mut order) = *cache;
7560        if !map.contains_key(key) {
7561            order.push_back(key.to_string());
7562        }
7563        map.insert(key.to_string(), entry);
7564        trim_result_cache(map, order);
7565    }
7566
7567    pub fn result_cache_shadow_divergences(&self) -> u64 {
7568        self.inner
7569            .result_cache_shadow_divergences
7570            .load(std::sync::atomic::Ordering::Relaxed)
7571    }
7572
7573    /// Invalidate the result cache (call after any write operation).
7574    /// Full clear — use for DDL (DROP TABLE, schema changes) or when table is unknown.
7575    pub fn invalidate_result_cache(&self) {
7576        let mut cache = self.inner.result_cache.write();
7577        cache.0.clear();
7578        cache.1.clear();
7579        let mut blob_entries = self.inner.result_blob_entries.write();
7580        blob_entries.0.clear();
7581        blob_entries.1.clear();
7582        self.inner
7583            .result_blob_cache
7584            .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
7585        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
7586        ask_entries.0.clear();
7587        ask_entries.1.clear();
7588        self.inner
7589            .result_blob_cache
7590            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
7591    }
7592
7593    /// Invalidate only result cache entries that declared a dependency on `table`.
7594    /// Cheaper than a full clear: unrelated tables keep their cached results.
7595    pub(crate) fn invalidate_result_cache_for_table(&self, table: &str) {
7596        // Hot-path probe both backends before taking write locks. The blob
7597        // backend is node-local, same as the legacy result cache.
7598        let legacy_has_match = {
7599            let cache = self.inner.result_cache.read();
7600            let (ref map, _) = *cache;
7601            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
7602        };
7603        let blob_has_match = {
7604            let cache = self.inner.result_blob_entries.read();
7605            let (ref map, _) = *cache;
7606            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
7607        };
7608        if legacy_has_match {
7609            let mut cache = self.inner.result_cache.write();
7610            let (ref mut map, ref mut order) = *cache;
7611            map.retain(|_, entry| !entry.scopes.contains(table));
7612            order.retain(|key| map.contains_key(key));
7613        }
7614
7615        if matches!(
7616            self.result_cache_backend(),
7617            RuntimeResultCacheBackend::BlobCache | RuntimeResultCacheBackend::Shadow
7618        ) {
7619            let mut blob_entries = self.inner.result_blob_entries.write();
7620            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
7621            blob_map.clear();
7622            blob_order.clear();
7623            self.inner
7624                .result_blob_cache
7625                .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
7626        } else if blob_has_match {
7627            let mut blob_entries = self.inner.result_blob_entries.write();
7628            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
7629            blob_map.retain(|_, entry| !entry.scopes.contains(table));
7630            blob_order.retain(|key| blob_map.contains_key(key));
7631        }
7632        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
7633        ask_entries.0.clear();
7634        ask_entries.1.clear();
7635        self.inner
7636            .result_blob_cache
7637            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
7638    }
7639
7640    pub(crate) fn invalidate_plan_cache(&self) {
7641        self.inner.query_cache.write().clear();
7642        self.inner
7643            .ddl_epoch
7644            .fetch_add(1, std::sync::atomic::Ordering::Release);
7645    }
7646
7647    /// Read the monotonic DDL epoch counter. Bumped by every
7648    /// `invalidate_plan_cache` call so prepared-statement holders can
7649    /// detect schema drift between PREPARE and EXECUTE.
7650    pub fn ddl_epoch(&self) -> u64 {
7651        self.inner
7652            .ddl_epoch
7653            .load(std::sync::atomic::Ordering::Acquire)
7654    }
7655
7656    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
7657        let store = self.inner.db.store();
7658        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7659        self.invalidate_plan_cache();
7660    }
7661
7662    /// Replay `tenant_tables.*.column` keys from red_config at boot so
7663    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
7664    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
7665    /// collection, picks the keys matching the tenant-marker shape,
7666    /// and calls `register_tenant_table` for each.
7667    ///
7668    /// Safe no-op when `red_config` doesn't exist (first boot on a
7669    /// fresh datadir).
7670    pub(crate) fn rehydrate_tenant_tables(&self) {
7671        let store = self.inner.db.store();
7672        let Some(manager) = store.get_collection("red_config") else {
7673            return;
7674        };
7675        // Replay in insertion order (SegmentManager iteration). Multiple
7676        // toggles on the same table leave several rows behind — the
7677        // last one processed wins because each register/unregister
7678        // call overwrites the in-memory state.
7679        for entity in manager.query_all(|_| true) {
7680            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
7681                continue;
7682            };
7683            let Some(named) = &row.named else { continue };
7684            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
7685                continue;
7686            };
7687            // Shape: tenant_tables.{table}.column
7688            let Some(rest) = key.strip_prefix("tenant_tables.") else {
7689                continue;
7690            };
7691            let Some((table, suffix)) = rest.rsplit_once('.') else {
7692                // Issue #205 — a `tenant_tables.*` row that doesn't
7693                // split cleanly is a schema-shape regression: the
7694                // metadata writer must always emit the `.column`
7695                // suffix, so reaching this branch means an upgrade
7696                // with incompatible state or external tampering.
7697                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7698                    collection: "red_config".to_string(),
7699                    detail: format!("malformed tenant_tables key: {key}"),
7700                }
7701                .emit_global();
7702                continue;
7703            };
7704            if suffix != "column" {
7705                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7706                    collection: "red_config".to_string(),
7707                    detail: format!("unexpected tenant_tables suffix: {key}"),
7708                }
7709                .emit_global();
7710                continue;
7711            }
7712            match named.get("value") {
7713                Some(crate::storage::schema::Value::Text(column)) => {
7714                    self.register_tenant_table(table, column);
7715                }
7716                // Null / missing value = DISABLE TENANCY marker.
7717                Some(crate::storage::schema::Value::Null) | None => {
7718                    self.unregister_tenant_table(table);
7719                }
7720                _ => {}
7721            }
7722        }
7723    }
7724
7725    /// Replay every persisted `MaterializedViewDescriptor` from the
7726    /// `red_materialized_view_defs` system collection (issue #593
7727    /// slice 9a). For each descriptor, re-parse the original SQL,
7728    /// extract the `QueryExpr::CreateView` it produced, and populate
7729    /// the in-memory registries (`inner.views` and
7730    /// `inner.materialized_views`) directly — no write paths run, so
7731    /// rehydrate does not re-persist what it just read.
7732    ///
7733    /// Malformed rows (missing `name`/`source_sql`, parse errors) are
7734    /// skipped with a `SchemaCorruption` operator event so a single
7735    /// bad entry does not block startup.
7736    pub(crate) fn rehydrate_materialized_view_descriptors(&self) {
7737        let store = self.inner.db.store();
7738        let descriptors = crate::runtime::continuous_materialized_view::load_all(store.as_ref());
7739        for descriptor in descriptors {
7740            let parsed = match crate::storage::query::parser::parse(&descriptor.source_sql) {
7741                Ok(qc) => qc,
7742                Err(err) => {
7743                    crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7744                        collection:
7745                            crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7746                                .to_string(),
7747                        detail: format!(
7748                            "failed to re-parse materialized-view source for {}: {err}",
7749                            descriptor.name
7750                        ),
7751                    }
7752                    .emit_global();
7753                    continue;
7754                }
7755            };
7756            let crate::storage::query::ast::QueryExpr::CreateView(create) = parsed.query else {
7757                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7758                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7759                        .to_string(),
7760                    detail: format!(
7761                        "materialized-view source for {} did not re-parse as CREATE VIEW",
7762                        descriptor.name
7763                    ),
7764                }
7765                .emit_global();
7766                continue;
7767            };
7768            // Populate in-memory view registry.
7769            let view_name = create.name.clone();
7770            self.inner
7771                .views
7772                .write()
7773                .insert(view_name.clone(), Arc::new(create));
7774            // Materialized cache slot (data empty until next REFRESH).
7775            use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
7776            let refresh = match descriptor.refresh_every_ms {
7777                Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
7778                None => RefreshPolicy::Manual,
7779            };
7780            let def = MaterializedViewDef {
7781                name: view_name.clone(),
7782                query: format!("<parsed view {}>", view_name),
7783                dependencies: descriptor.source_collections.clone(),
7784                refresh,
7785                retention_duration_ms: descriptor.retention_duration_ms,
7786            };
7787            self.inner.materialized_views.write().register(def);
7788        }
7789        // A rehydrated view shape may differ from any plans the cache
7790        // bootstrapped before this method ran — flush to be safe.
7791        self.invalidate_plan_cache();
7792    }
7793
7794    pub(crate) fn rehydrate_declared_column_schemas(&self) {
7795        let store = self.inner.db.store();
7796        for contract in self.inner.db.collection_contracts() {
7797            let columns: Vec<String> = contract
7798                .declared_columns
7799                .iter()
7800                .map(|column| column.name.clone())
7801                .collect();
7802            let Some(manager) = store.get_collection(&contract.name) else {
7803                continue;
7804            };
7805            manager.set_column_schema_if_empty(columns);
7806        }
7807    }
7808
7809    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
7810    /// in-memory column mapping, the implicit RLS policy, and enables
7811    /// row-level security on the table. Idempotent — re-registering
7812    /// the same `(table, column)` replaces the prior auto-policy.
7813    pub fn register_tenant_table(&self, table: &str, column: &str) {
7814        use crate::storage::query::ast::{
7815            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
7816        };
7817        self.inner
7818            .tenant_tables
7819            .write()
7820            .insert(table.to_string(), column.to_string());
7821
7822        // Build the policy: col = CURRENT_TENANT()
7823        // Uses CompareExpr so the comparison happens at runtime against
7824        // the thread-local tenant value read by the CURRENT_TENANT
7825        // scalar. Spans are synthetic — there's no source location for
7826        // an auto-generated policy.
7827        let lhs = Expr::Column {
7828            field: FieldRef::TableColumn {
7829                table: table.to_string(),
7830                column: column.to_string(),
7831            },
7832            span: Span::synthetic(),
7833        };
7834        let rhs = Expr::FunctionCall {
7835            name: "CURRENT_TENANT".to_string(),
7836            args: Vec::new(),
7837            span: Span::synthetic(),
7838        };
7839        let policy_filter = Filter::CompareExpr {
7840            lhs,
7841            op: CompareOp::Eq,
7842            rhs,
7843        };
7844
7845        let policy = CreatePolicyQuery {
7846            name: "__tenant_iso".to_string(),
7847            table: table.to_string(),
7848            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
7849            role: None,   // None = every role
7850            using: Box::new(policy_filter),
7851            // Auto-tenancy defaults to Table targets. Collections of
7852            // other kinds (graph / vector / queue / timeseries) that
7853            // opt in via `ALTER ... ENABLE TENANCY` should use the
7854            // matching kind — but for now we keep the auto-policy
7855            // kind-agnostic so the evaluator can apply it to any
7856            // entity living in the collection.
7857            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
7858        };
7859
7860        // Replace any prior auto-policy for this table (column rename).
7861        self.inner.rls_policies.write().insert(
7862            (table.to_string(), "__tenant_iso".to_string()),
7863            Arc::new(policy),
7864        );
7865        self.inner
7866            .rls_enabled_tables
7867            .write()
7868            .insert(table.to_string());
7869
7870        // Auto-build a hash index on the tenant column. Every read/write
7871        // against a tenant-scoped table carries an implicit
7872        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
7873        // index on that column is on the hot path of every query. Without
7874        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
7875        self.ensure_tenant_index(table, column);
7876    }
7877
7878    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
7879    /// Skipped when:
7880    ///   * the column is dotted (nested path — flat secondary indices
7881    ///     don't cover those today; RLS still works via the policy)
7882    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
7883    ///   * the user already registered an index whose first column matches
7884    ///     (avoids redundant duplicates of a user-defined composite)
7885    fn ensure_tenant_index(&self, table: &str, column: &str) {
7886        if column.contains('.') {
7887            return;
7888        }
7889        let index_name = format!("__tenant_idx_{table}");
7890        let registry = self.inner.index_store.list_indices(table);
7891        if registry.iter().any(|idx| idx.name == index_name) {
7892            return;
7893        }
7894        if registry
7895            .iter()
7896            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
7897        {
7898            return;
7899        }
7900
7901        let store = self.inner.db.store();
7902        let Some(manager) = store.get_collection(table) else {
7903            return;
7904        };
7905        let entities = manager.query_all(|_| true);
7906        let entity_fields: Vec<(
7907            crate::storage::unified::EntityId,
7908            Vec<(String, crate::storage::schema::Value)>,
7909        )> = entities
7910            .iter()
7911            .map(|e| {
7912                let fields = match &e.data {
7913                    crate::storage::EntityData::Row(row) => {
7914                        if let Some(ref named) = row.named {
7915                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
7916                        } else if let Some(ref schema) = row.schema {
7917                            schema
7918                                .iter()
7919                                .zip(row.columns.iter())
7920                                .map(|(k, v)| (k.clone(), v.clone()))
7921                                .collect()
7922                        } else {
7923                            Vec::new()
7924                        }
7925                    }
7926                    crate::storage::EntityData::Node(node) => node
7927                        .properties
7928                        .iter()
7929                        .map(|(k, v)| (k.clone(), v.clone()))
7930                        .collect(),
7931                    _ => Vec::new(),
7932                };
7933                (e.id, fields)
7934            })
7935            .collect();
7936
7937        let columns = vec![column.to_string()];
7938        if self
7939            .inner
7940            .index_store
7941            .create_index(
7942                &index_name,
7943                table,
7944                &columns,
7945                super::index_store::IndexMethodKind::Hash,
7946                false,
7947                &entity_fields,
7948            )
7949            .is_err()
7950        {
7951            return;
7952        }
7953        self.inner
7954            .index_store
7955            .register(super::index_store::RegisteredIndex {
7956                name: index_name,
7957                collection: table.to_string(),
7958                columns,
7959                method: super::index_store::IndexMethodKind::Hash,
7960                unique: false,
7961            });
7962        self.invalidate_plan_cache();
7963    }
7964
7965    /// Drop the auto-generated tenant index, if one exists. Called from
7966    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
7967    fn drop_tenant_index(&self, table: &str) {
7968        let index_name = format!("__tenant_idx_{table}");
7969        self.inner.index_store.drop_index(&index_name, table);
7970    }
7971
7972    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
7973    /// Used by the INSERT auto-fill path to know which column to
7974    /// populate with `current_tenant()` when the user didn't name it.
7975    pub fn tenant_column(&self, table: &str) -> Option<String> {
7976        self.inner.tenant_tables.read().get(table).cloned()
7977    }
7978
7979    /// Remove a table's tenant registration (Phase 2.5.4). Called by
7980    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
7981    /// but leaves any user-installed explicit policies intact.
7982    pub fn unregister_tenant_table(&self, table: &str) {
7983        self.inner.tenant_tables.write().remove(table);
7984        self.inner
7985            .rls_policies
7986            .write()
7987            .remove(&(table.to_string(), "__tenant_iso".to_string()));
7988        self.drop_tenant_index(table);
7989        // Only clear RLS enablement if no other policies remain.
7990        let has_other_policies = self
7991            .inner
7992            .rls_policies
7993            .read()
7994            .keys()
7995            .any(|(t, _)| t == table);
7996        if !has_other_policies {
7997            self.inner.rls_enabled_tables.write().remove(table);
7998        }
7999    }
8000
8001    /// Record that the running transaction has marked `id` in `collection`
8002    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
8003    /// xid that was written into `xmax` — either the parent txn xid or
8004    /// the innermost savepoint sub-xid. Savepoint rollback filters by
8005    /// this xid to revive only its own tombstones.
8006    pub(crate) fn record_pending_tombstone(
8007        &self,
8008        conn_id: u64,
8009        collection: &str,
8010        id: crate::storage::unified::entity::EntityId,
8011        stamper_xid: crate::storage::transaction::snapshot::Xid,
8012        previous_xmax: crate::storage::transaction::snapshot::Xid,
8013    ) {
8014        self.inner
8015            .pending_tombstones
8016            .write()
8017            .entry(conn_id)
8018            .or_default()
8019            .push((collection.to_string(), id, stamper_xid, previous_xmax));
8020    }
8021
8022    pub(crate) fn record_pending_versioned_update(
8023        &self,
8024        conn_id: u64,
8025        collection: &str,
8026        old_id: crate::storage::unified::entity::EntityId,
8027        new_id: crate::storage::unified::entity::EntityId,
8028        stamper_xid: crate::storage::transaction::snapshot::Xid,
8029        previous_xmax: crate::storage::transaction::snapshot::Xid,
8030    ) {
8031        self.inner
8032            .pending_versioned_updates
8033            .write()
8034            .entry(conn_id)
8035            .or_default()
8036            .push((
8037                collection.to_string(),
8038                old_id,
8039                new_id,
8040                stamper_xid,
8041                previous_xmax,
8042            ));
8043    }
8044
8045    fn with_deferred_store_wal_if_transaction<T>(
8046        &self,
8047        f: impl FnOnce() -> RedDBResult<T>,
8048    ) -> RedDBResult<T> {
8049        let conn_id = current_connection_id();
8050        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
8051            return f();
8052        }
8053
8054        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8055        let result = f();
8056        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8057        match result {
8058            Ok(value) => {
8059                self.record_pending_store_wal_actions(conn_id, captured);
8060                Ok(value)
8061            }
8062            Err(err) => Err(err),
8063        }
8064    }
8065
8066    fn with_deferred_store_wal_for_dml<T>(
8067        &self,
8068        capture_autocommit_events: bool,
8069        f: impl FnOnce() -> RedDBResult<T>,
8070    ) -> RedDBResult<T> {
8071        let conn_id = current_connection_id();
8072        if self.inner.tx_contexts.read().contains_key(&conn_id) {
8073            return self.with_deferred_store_wal_if_transaction(f);
8074        }
8075        if !capture_autocommit_events {
8076            return f();
8077        }
8078
8079        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8080        let result = f();
8081        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8082        self.inner
8083            .db
8084            .store()
8085            .append_deferred_store_wal_actions(captured)
8086            .map_err(|err| RedDBError::Internal(err.to_string()))?;
8087        result
8088    }
8089
8090    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
8091        !query.suppress_events
8092            && self.collection_has_event_subscriptions_for_operation(
8093                &query.table,
8094                crate::catalog::SubscriptionOperation::Insert,
8095            )
8096    }
8097
8098    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
8099        !query.suppress_events
8100            && self.collection_has_event_subscriptions_for_operation(
8101                &query.table,
8102                crate::catalog::SubscriptionOperation::Update,
8103            )
8104    }
8105
8106    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
8107        !query.suppress_events
8108            && self.collection_has_event_subscriptions_for_operation(
8109                &query.table,
8110                crate::catalog::SubscriptionOperation::Delete,
8111            )
8112    }
8113
8114    fn collection_has_event_subscriptions_for_operation(
8115        &self,
8116        collection: &str,
8117        operation: crate::catalog::SubscriptionOperation,
8118    ) -> bool {
8119        let Some(contract) = self.db().collection_contract_arc(collection) else {
8120            return false;
8121        };
8122        contract.subscriptions.iter().any(|subscription| {
8123            subscription.enabled
8124                && (subscription.ops_filter.is_empty()
8125                    || subscription.ops_filter.contains(&operation))
8126        })
8127    }
8128
8129    fn record_pending_store_wal_actions(
8130        &self,
8131        conn_id: u64,
8132        actions: crate::storage::unified::DeferredStoreWalActions,
8133    ) {
8134        if actions.is_empty() {
8135            return;
8136        }
8137        let mut guard = self.inner.pending_store_wal_actions.write();
8138        guard.entry(conn_id).or_default().extend(actions);
8139    }
8140
8141    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
8142        let Some(actions) = self
8143            .inner
8144            .pending_store_wal_actions
8145            .write()
8146            .remove(&conn_id)
8147        else {
8148            return Ok(());
8149        };
8150        self.inner
8151            .db
8152            .store()
8153            .append_deferred_store_wal_actions(actions)
8154            .map_err(|err| RedDBError::Internal(err.to_string()))
8155    }
8156
8157    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
8158        self.inner
8159            .pending_store_wal_actions
8160            .write()
8161            .remove(&conn_id);
8162    }
8163
8164    fn xid_conflicts_with_snapshot(
8165        &self,
8166        xid: crate::storage::transaction::snapshot::Xid,
8167        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8168        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8169    ) -> bool {
8170        xid != 0
8171            && !own_xids.contains(&xid)
8172            && !self.inner.snapshot_manager.is_aborted(xid)
8173            && !self.inner.snapshot_manager.is_active(xid)
8174            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
8175    }
8176
8177    fn conflict_error(
8178        collection: &str,
8179        logical_id: crate::storage::unified::entity::EntityId,
8180        xid: crate::storage::transaction::snapshot::Xid,
8181    ) -> RedDBError {
8182        RedDBError::Query(format!(
8183            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
8184            logical_id.raw()
8185        ))
8186    }
8187
8188    fn check_logical_row_conflict(
8189        &self,
8190        collection: &str,
8191        logical_id: crate::storage::unified::entity::EntityId,
8192        excluded_ids: &[crate::storage::unified::entity::EntityId],
8193        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8194        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8195    ) -> RedDBResult<()> {
8196        let store = self.inner.db.store();
8197        let Some(manager) = store.get_collection(collection) else {
8198            return Ok(());
8199        };
8200
8201        for candidate in manager.query_all(|_| true) {
8202            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
8203                continue;
8204            }
8205            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
8206                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
8207            }
8208            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
8209                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
8210            }
8211        }
8212        Ok(())
8213    }
8214
8215    pub(crate) fn check_table_row_write_conflicts(
8216        &self,
8217        conn_id: u64,
8218        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8219        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8220    ) -> RedDBResult<()> {
8221        let versioned_updates = self
8222            .inner
8223            .pending_versioned_updates
8224            .read()
8225            .get(&conn_id)
8226            .cloned()
8227            .unwrap_or_default();
8228        let tombstones = self
8229            .inner
8230            .pending_tombstones
8231            .read()
8232            .get(&conn_id)
8233            .cloned()
8234            .unwrap_or_default();
8235
8236        let store = self.inner.db.store();
8237        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
8238            let Some(manager) = store.get_collection(&collection) else {
8239                continue;
8240            };
8241            let Some(old) = manager.get(old_id) else {
8242                continue;
8243            };
8244            let logical_id = old.logical_id();
8245            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8246                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8247            }
8248            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
8249                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
8250            }
8251            self.check_logical_row_conflict(
8252                &collection,
8253                logical_id,
8254                &[old_id, new_id],
8255                snapshot,
8256                own_xids,
8257            )?;
8258        }
8259
8260        for (collection, id, xid, previous_xmax) in tombstones {
8261            let Some(manager) = store.get_collection(&collection) else {
8262                continue;
8263            };
8264            let Some(entity) = manager.get(id) else {
8265                continue;
8266            };
8267            let logical_id = entity.logical_id();
8268            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8269                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8270            }
8271            if entity.xmax != xid
8272                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
8273            {
8274                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
8275            }
8276            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
8277        }
8278
8279        Ok(())
8280    }
8281
8282    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
8283        let versioned_updates = self
8284            .inner
8285            .pending_versioned_updates
8286            .read()
8287            .get(&conn_id)
8288            .cloned()
8289            .unwrap_or_default();
8290        let tombstones = self
8291            .inner
8292            .pending_tombstones
8293            .read()
8294            .get(&conn_id)
8295            .cloned()
8296            .unwrap_or_default();
8297
8298        let store = self.inner.db.store();
8299        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
8300            if let Some(manager) = store.get_collection(&collection) {
8301                if let Some(mut entity) = manager.get(old_id) {
8302                    entity.set_xmax(xid);
8303                    let _ = manager.update(entity);
8304                }
8305            }
8306        }
8307        for (collection, id, xid, _previous_xmax) in tombstones {
8308            if let Some(manager) = store.get_collection(&collection) {
8309                if let Some(mut entity) = manager.get(id) {
8310                    entity.set_xmax(xid);
8311                    let _ = manager.update(entity);
8312                }
8313            }
8314        }
8315    }
8316
8317    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
8318        self.inner
8319            .pending_versioned_updates
8320            .write()
8321            .remove(&conn_id);
8322    }
8323
8324    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
8325        let Some(pending) = self
8326            .inner
8327            .pending_versioned_updates
8328            .write()
8329            .remove(&conn_id)
8330        else {
8331            return;
8332        };
8333
8334        let store = self.inner.db.store();
8335        for (collection, old_id, new_id, xid, previous_xmax) in pending {
8336            if let Some(manager) = store.get_collection(&collection) {
8337                if let Some(mut old) = manager.get(old_id) {
8338                    if old.xmax == xid {
8339                        old.set_xmax(previous_xmax);
8340                        let _ = manager.update(old);
8341                    }
8342                }
8343            }
8344            let _ = store.delete_batch(&collection, &[new_id]);
8345        }
8346    }
8347
8348    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8349        let mut guard = self.inner.pending_versioned_updates.write();
8350        let Some(pending) = guard.get_mut(&conn_id) else {
8351            return 0;
8352        };
8353
8354        let store = self.inner.db.store();
8355        let mut reverted = 0usize;
8356        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
8357            if *xid < stamper_xid {
8358                return true;
8359            }
8360            if let Some(manager) = store.get_collection(collection) {
8361                if let Some(mut old) = manager.get(*old_id) {
8362                    if old.xmax == *xid {
8363                        old.set_xmax(*previous_xmax);
8364                        let _ = manager.update(old);
8365                    }
8366                }
8367            }
8368            let _ = store.delete_batch(collection, &[*new_id]);
8369            reverted += 1;
8370            false
8371        });
8372        if pending.is_empty() {
8373            guard.remove(&conn_id);
8374        }
8375        reverted
8376    }
8377
8378    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
8379    /// delete marker; commit only drops the rollback journal and emits
8380    /// side effects. Physical reclamation is left for VACUUM so old
8381    /// snapshots can still resolve the pre-delete row version.
8382    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
8383        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8384            return;
8385        };
8386        if pending.is_empty() {
8387            return;
8388        }
8389
8390        let store = self.inner.db.store();
8391        for (collection, id, _xid, _previous_xmax) in pending {
8392            store.context_index().remove_entity(id);
8393            self.cdc_emit(
8394                crate::replication::cdc::ChangeOperation::Delete,
8395                &collection,
8396                id.raw(),
8397                "entity",
8398            );
8399        }
8400    }
8401
8402    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
8403    /// become visible again to future snapshots. Best-effort: a row
8404    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
8405    /// never reclaims tuples whose xmax is still referenced by any
8406    /// active snapshot, so this case is only reachable via external
8407    /// storage corruption.
8408    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
8409        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8410            return;
8411        };
8412
8413        let store = self.inner.db.store();
8414        for (collection, id, xid, previous_xmax) in pending {
8415            let Some(manager) = store.get_collection(&collection) else {
8416                continue;
8417            };
8418            if let Some(mut entity) = manager.get(id) {
8419                if entity.xmax == xid {
8420                    entity.set_xmax(previous_xmax);
8421                    let _ = manager.update(entity);
8422                }
8423            }
8424        }
8425    }
8426
8427    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
8428        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
8429            return;
8430        };
8431        for event in pending {
8432            self.cdc_emit_kv(
8433                event.op,
8434                &event.collection,
8435                &event.key,
8436                0,
8437                event.before,
8438                event.after,
8439            );
8440        }
8441    }
8442
8443    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
8444        self.inner.pending_kv_watch_events.write().remove(&conn_id);
8445    }
8446
8447    /// Materialise the entire graph store while applying MVCC visibility
8448    /// AND per-collection RLS to each candidate node and edge. Mirrors
8449    /// `materialize_graph` but routes every entity through the same
8450    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
8451    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
8452    /// edges). Returns the filtered `GraphStore` plus the
8453    /// `node_id → properties` map the executor needs for `RETURN n.*`
8454    /// projections.
8455    fn materialize_graph_with_rls(
8456        &self,
8457    ) -> RedDBResult<(
8458        crate::storage::engine::GraphStore,
8459        std::collections::HashMap<
8460            String,
8461            std::collections::HashMap<String, crate::storage::schema::Value>,
8462        >,
8463        crate::storage::query::unified::EdgeProperties,
8464    )> {
8465        use crate::storage::engine::GraphStore;
8466        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
8467        use crate::storage::unified::entity::{EntityData, EntityKind};
8468        use std::collections::{HashMap, HashSet};
8469
8470        let store = self.inner.db.store();
8471        let snap_ctx = capture_current_snapshot();
8472        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
8473
8474        let graph = GraphStore::new();
8475        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
8476            HashMap::new();
8477        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
8478        let mut allowed_nodes: HashSet<String> = HashSet::new();
8479
8480        // Per-collection cached compiled filters — Nodes-kind for
8481        // first pass, Edges-kind for the second. None entries mean
8482        // "RLS enabled, zero matching policy → deny all of this kind".
8483        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8484            HashMap::new();
8485        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8486            HashMap::new();
8487
8488        let collections = store.list_collections();
8489
8490        // First pass — gather nodes.
8491        for collection in &collections {
8492            let Some(manager) = store.get_collection(collection) else {
8493                continue;
8494            };
8495            let entities = manager.query_all(|_| true);
8496            for entity in entities {
8497                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8498                    continue;
8499                }
8500                let EntityKind::GraphNode(ref node) = entity.kind else {
8501                    continue;
8502                };
8503                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
8504                    continue;
8505                }
8506                let id_str = entity.id.raw().to_string();
8507                graph
8508                    .add_node_with_label(
8509                        &id_str,
8510                        &node.label,
8511                        &super::graph_node_label(&node.node_type),
8512                    )
8513                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8514                allowed_nodes.insert(id_str.clone());
8515                if let EntityData::Node(node_data) = &entity.data {
8516                    node_properties.insert(id_str, node_data.properties.clone());
8517                }
8518            }
8519        }
8520
8521        // Second pass — gather edges. An edge appears only when both
8522        // endpoint nodes survived the RLS pass AND the edge itself
8523        // passes its own RLS gate.
8524        for collection in &collections {
8525            let Some(manager) = store.get_collection(collection) else {
8526                continue;
8527            };
8528            let entities = manager.query_all(|_| true);
8529            for entity in entities {
8530                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8531                    continue;
8532                }
8533                let EntityKind::GraphEdge(ref edge) = entity.kind else {
8534                    continue;
8535                };
8536                if !allowed_nodes.contains(&edge.from_node)
8537                    || !allowed_nodes.contains(&edge.to_node)
8538                {
8539                    continue;
8540                }
8541                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
8542                    continue;
8543                }
8544                let weight = match &entity.data {
8545                    EntityData::Edge(e) => e.weight,
8546                    _ => edge.weight as f32 / 1000.0,
8547                };
8548                let edge_label = super::graph_edge_label(&edge.label);
8549                graph
8550                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
8551                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8552                if let EntityData::Edge(edge_data) = &entity.data {
8553                    edge_properties.insert(
8554                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
8555                        edge_data.properties.clone(),
8556                    );
8557                }
8558            }
8559        }
8560
8561        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
8562        // are used inside the helper closures via the per-kind helpers
8563        // declared at the bottom of this file.
8564        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
8565
8566        Ok((graph, node_properties, edge_properties))
8567    }
8568
8569    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
8570    /// freshly-inserted entity when the current connection holds an
8571    /// open transaction. Used by graph / vector / queue / timeseries
8572    /// write paths that go through the DevX builder API (`db.node(...)
8573    /// .save()` and friends) — those live in the storage crate and
8574    /// can't reach `current_xid()` without crossing layers, so the
8575    /// application layer calls this helper right after `save()` to
8576    /// finalise the MVCC stamp.
8577    ///
8578    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
8579    /// write, so the non-transactional hot path stays untouched.
8580    ///
8581    /// Best-effort: if the collection or entity disappears between
8582    /// the save and the stamp (concurrent DROP), we silently skip.
8583    pub(crate) fn stamp_xmin_if_in_txn(
8584        &self,
8585        collection: &str,
8586        id: crate::storage::unified::entity::EntityId,
8587    ) {
8588        let Some(xid) = self.current_xid() else {
8589            return;
8590        };
8591        let store = self.inner.db.store();
8592        let Some(manager) = store.get_collection(collection) else {
8593            return;
8594        };
8595        if let Some(mut entity) = manager.get(id) {
8596            entity.set_xmin(xid);
8597            let _ = manager.update(entity);
8598        }
8599    }
8600
8601    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
8602    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
8603    /// pending entries with `xid < stamper_xid` stay queued because
8604    /// they belong to the enclosing scope — they'll either flush on
8605    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
8606    ///
8607    /// Returns the number of tuples whose `xmax` was wiped back to 0.
8608    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8609        let mut guard = self.inner.pending_tombstones.write();
8610        let Some(pending) = guard.get_mut(&conn_id) else {
8611            return 0;
8612        };
8613
8614        let store = self.inner.db.store();
8615        let mut revived = 0usize;
8616        pending.retain(|(collection, id, xid, previous_xmax)| {
8617            if *xid < stamper_xid {
8618                // Stamped before the savepoint — keep in queue.
8619                return true;
8620            }
8621            if let Some(manager) = store.get_collection(collection) {
8622                if let Some(mut entity) = manager.get(*id) {
8623                    if entity.xmax == *xid {
8624                        entity.set_xmax(*previous_xmax);
8625                        let _ = manager.update(entity);
8626                        revived += 1;
8627                    }
8628                }
8629            }
8630            false
8631        });
8632        if pending.is_empty() {
8633            guard.remove(&conn_id);
8634        }
8635        revived
8636    }
8637
8638    /// Return the snapshot the current connection should use for visibility
8639    /// checks (Phase 2.3 PG parity).
8640    ///
8641    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
8642    ///   the snapshot stored in its `TxnContext`.
8643    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
8644    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
8645    ///   visible so this degrades to "see everything committed".
8646    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
8647        let conn_id = current_connection_id();
8648        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
8649            return ctx.snapshot;
8650        }
8651        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
8652        // every already-committed xid (which is strictly less) passes the
8653        // `xmin <= snap.xid` gate, while concurrently-active xids land in
8654        // the `in_progress` set and stay hidden until they commit. Using
8655        // xid=0 would incorrectly hide every MVCC-stamped tuple.
8656        let high_water = self.inner.snapshot_manager.peek_next_xid();
8657        self.inner.snapshot_manager.snapshot(high_water)
8658    }
8659
8660    /// Xid of the current connection's active transaction, or `None` when
8661    /// running outside a BEGIN/COMMIT block. Write paths call this to
8662    /// decide whether to stamp `xmin`/`xmax` on tuples.
8663    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
8664    /// sub-xid so new writes can be selectively rolled back. Otherwise
8665    /// the parent txn's xid is returned, matching pre-savepoint
8666    /// behaviour. Callers that need the enclosing *transaction* xid
8667    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
8668    /// directly.
8669    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
8670        let conn_id = current_connection_id();
8671        self.inner
8672            .tx_contexts
8673            .read()
8674            .get(&conn_id)
8675            .map(|ctx| ctx.writer_xid())
8676    }
8677
8678    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
8679    /// the oldest-active xid when reclaiming dead tuples.
8680    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
8681        Arc::clone(&self.inner.snapshot_manager)
8682    }
8683
8684    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
8685        let manager = &self.inner.snapshot_manager;
8686        let next_xid = manager.peek_next_xid();
8687        let mut cutoff = next_xid;
8688        if let Some(oldest_active) = manager.oldest_active_xid() {
8689            cutoff = cutoff.min(oldest_active);
8690        }
8691        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
8692            cutoff = cutoff.min(oldest_pinned);
8693        }
8694        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
8695        if retention_xids > 0 {
8696            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
8697        }
8698        cutoff
8699    }
8700
8701    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
8702        let registered = self.inner.index_store.list_indices(table);
8703        if registered.is_empty() {
8704            return Ok(());
8705        }
8706        let store = self.inner.db.store();
8707        let Some(manager) = store.get_collection(table) else {
8708            return Ok(());
8709        };
8710        let entity_fields = manager
8711            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
8712            .into_iter()
8713            .map(|entity| (entity.id, table_row_index_fields(&entity)))
8714            .collect::<Vec<_>>();
8715
8716        for index in registered {
8717            self.inner.index_store.drop_index(&index.name, table);
8718            self.inner
8719                .index_store
8720                .create_index(
8721                    &index.name,
8722                    table,
8723                    &index.columns,
8724                    index.method,
8725                    index.unique,
8726                    &entity_fields,
8727                )
8728                .map_err(RedDBError::Internal)?;
8729            self.inner.index_store.register(index);
8730        }
8731        self.invalidate_plan_cache();
8732        Ok(())
8733    }
8734
8735    /// Own-tx xids (parent + open/released savepoints) for the current
8736    /// connection. Transports + tests that build a `SnapshotContext`
8737    /// manually (outside the `execute_query` scope) need this set so
8738    /// the writer's own uncommitted tuples stay visible to self.
8739    pub fn current_txn_own_xids(
8740        &self,
8741    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
8742        let mut set = std::collections::HashSet::new();
8743        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
8744            set.insert(ctx.xid);
8745            for (_, sub) in &ctx.savepoints {
8746                set.insert(*sub);
8747            }
8748            for sub in &ctx.released_sub_xids {
8749                set.insert(*sub);
8750            }
8751        }
8752        set
8753    }
8754
8755    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
8756    ///
8757    /// Callers use this to check whether a table name is a registered
8758    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
8759    /// scan it (`registry.scan(name)`). The read-path rewriter consults
8760    /// this before dispatching into native-collection lookup.
8761    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
8762        Arc::clone(&self.inner.foreign_tables)
8763    }
8764
8765    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
8766    pub fn is_rls_enabled(&self, table: &str) -> bool {
8767        self.inner.rls_enabled_tables.read().contains(table)
8768    }
8769
8770    /// Collect the USING predicates that apply to this `(table, role, action)`.
8771    ///
8772    /// Returned filters should be OR-combined (a row passes RLS when *any*
8773    /// matching policy accepts it) and then AND-ed into the query's WHERE.
8774    /// When the table has RLS disabled this returns an empty Vec — callers
8775    /// can fast-path back to the unfiltered read.
8776    pub fn matching_rls_policies(
8777        &self,
8778        table: &str,
8779        role: Option<&str>,
8780        action: crate::storage::query::ast::PolicyAction,
8781    ) -> Vec<crate::storage::query::ast::Filter> {
8782        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
8783        // callers that don't name a kind only see Table-scoped
8784        // policies (which is what execute SELECT / UPDATE / DELETE
8785        // expect).
8786        self.matching_rls_policies_for_kind(
8787            table,
8788            role,
8789            action,
8790            crate::storage::query::ast::PolicyTargetKind::Table,
8791        )
8792    }
8793
8794    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
8795    ///
8796    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
8797    /// `Vectors`, queue consumers request `Messages`, and timeseries
8798    /// range scans request `Points`. Policies tagged with a
8799    /// different kind are skipped so a graph-scoped policy doesn't
8800    /// accidentally gate a table SELECT on the same collection.
8801    pub fn matching_rls_policies_for_kind(
8802        &self,
8803        table: &str,
8804        role: Option<&str>,
8805        action: crate::storage::query::ast::PolicyAction,
8806        kind: crate::storage::query::ast::PolicyTargetKind,
8807    ) -> Vec<crate::storage::query::ast::Filter> {
8808        if !self.is_rls_enabled(table) {
8809            return Vec::new();
8810        }
8811        let policies = self.inner.rls_policies.read();
8812        policies
8813            .iter()
8814            .filter_map(|((t, _), p)| {
8815                if t != table {
8816                    return None;
8817                }
8818                // Kind gate — Table policies also apply to every
8819                // other kind *iff* the policy predicate evaluates
8820                // against entity fields that exist uniformly; the
8821                // caller's kind filter is the stricter check, so
8822                // match literally. Auto-tenancy policies stamp
8823                // Table and the caller passes the concrete kind —
8824                // we allow Table policies to apply cross-kind for
8825                // backwards compat.
8826                if p.target_kind != kind
8827                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
8828                {
8829                    return None;
8830                }
8831                // Action gate — `None` means "ALL" actions.
8832                if let Some(a) = p.action {
8833                    if a != action {
8834                        return None;
8835                    }
8836                }
8837                // Role gate — `None` means "any role".
8838                if let Some(p_role) = p.role.as_deref() {
8839                    match role {
8840                        Some(r) if r == p_role => {}
8841                        _ => return None,
8842                    }
8843                }
8844                Some((*p.using).clone())
8845            })
8846            .collect()
8847    }
8848
8849    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
8850        let store = self.inner.db.store();
8851        if let Some(stats) =
8852            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
8853        {
8854            crate::storage::query::planner::stats_catalog::persist_table_stats(
8855                store.as_ref(),
8856                &stats,
8857            );
8858        } else {
8859            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
8860        }
8861        self.invalidate_plan_cache();
8862    }
8863
8864    pub(crate) fn note_table_write(&self, table: &str) {
8865        // Skip the write lock when the table is already marked
8866        // dirty. With single-row UPDATEs in a loop this used to
8867        // grab the planner_dirty_tables write lock N times even
8868        // though the first call already flipped the flag.
8869        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
8870        if !already_dirty {
8871            self.inner
8872                .planner_dirty_tables
8873                .write()
8874                .insert(table.to_string());
8875        }
8876        self.invalidate_result_cache_for_table(table);
8877    }
8878
8879    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
8880    /// `RuntimeQueryResult` so callers over the SQL interface see the
8881    /// plan tree in the same shape a SELECT produces.
8882    ///
8883    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
8884    /// Nodes are walked depth-first; `depth` counts from 0 at the
8885    /// root so a text renderer can indent without re-walking.
8886    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
8887        let explain = self.explain_query(inner_sql)?;
8888
8889        let columns = vec![
8890            "op".to_string(),
8891            "source".to_string(),
8892            "est_rows".to_string(),
8893            "est_cost".to_string(),
8894            "depth".to_string(),
8895        ];
8896
8897        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
8898
8899        // Prepend `CteScan` markers when the query carried a leading
8900        // WITH clause. The CTE bodies are already inlined into the
8901        // main plan tree, but operators reading EXPLAIN need to see
8902        // which named CTEs were resolved — without this row the plan
8903        // would look indistinguishable from a hand-inlined query.
8904        for name in &explain.cte_materializations {
8905            use std::sync::Arc;
8906            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
8907            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
8908            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
8909            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
8910            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
8911            rec.set_arc(Arc::from("depth"), Value::Integer(0));
8912            records.push(rec);
8913        }
8914
8915        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
8916
8917        let result = crate::storage::query::unified::UnifiedResult {
8918            columns,
8919            records,
8920            stats: Default::default(),
8921            pre_serialized_json: None,
8922        };
8923
8924        Ok(RuntimeQueryResult {
8925            query: raw_query.to_string(),
8926            mode: explain.mode,
8927            statement: "explain",
8928            engine: "runtime-explain",
8929            result,
8930            affected_rows: 0,
8931            statement_type: "select",
8932        })
8933    }
8934
8935    // -----------------------------------------------------------------
8936    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
8937    // -----------------------------------------------------------------
8938
8939    /// Project a `QueryExpr` to the (action, resource) pair the
8940    /// privilege engine cares about. Returns `Ok(())` for statements
8941    /// that don't touch user data (transaction control, SHOW, SET, etc.).
8942    pub(super) fn check_query_privilege(
8943        &self,
8944        expr: &crate::storage::query::ast::QueryExpr,
8945    ) -> Result<(), String> {
8946        use crate::auth::privileges::{Action, AuthzContext, Resource};
8947        use crate::auth::UserId;
8948        use crate::storage::query::ast::QueryExpr;
8949
8950        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
8951        // The bootstrap path itself goes through `execute_query` so this
8952        // is the only sensible default; once auth is wired, the gate
8953        // becomes active.
8954        let auth_store = match self.inner.auth_store.read().clone() {
8955            Some(s) => s,
8956            None => return Ok(()),
8957        };
8958
8959        // Resolve principal + role from the thread-local identity.
8960        // Anonymous (no identity) is allowed to read the bootstrap path
8961        // only when auth_store says so; we treat missing identity as
8962        // platform-admin-equivalent here so embedded test harnesses
8963        // continue to work without setting an identity.
8964        let (username, role) = match current_auth_identity() {
8965            Some(p) => p,
8966            None => return Ok(()),
8967        };
8968        let tenant = current_tenant();
8969
8970        let ctx = AuthzContext {
8971            principal: &username,
8972            effective_role: role,
8973            tenant: tenant.as_deref(),
8974        };
8975        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
8976
8977        // Map QueryExpr → (Action, Resource).
8978        let (action, resource) = match expr {
8979            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
8980            QueryExpr::QueueSelect(q) => (Action::Select, Resource::table_from_name(&q.queue)),
8981            QueryExpr::Graph(g) => {
8982                if auth_store.iam_authorization_enabled() {
8983                    self.check_graph_property_projection_privilege(
8984                        &auth_store,
8985                        &principal_id,
8986                        role,
8987                        tenant.as_deref(),
8988                        g,
8989                    )?;
8990                    return Ok(());
8991                }
8992                return Ok(());
8993            }
8994            QueryExpr::Vector(v) => {
8995                if auth_store.iam_authorization_enabled() {
8996                    self.check_table_like_column_projection_privilege(
8997                        &auth_store,
8998                        &principal_id,
8999                        role,
9000                        tenant.as_deref(),
9001                        &v.collection,
9002                        &["content".to_string()],
9003                    )?;
9004                    return Ok(());
9005                }
9006                return Ok(());
9007            }
9008            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
9009            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
9010            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
9011            // Joins inherit the read privilege from any constituent
9012            // table — for now we emit a single Select on the database
9013            // (admins bypass; non-admins need a Database/Schema grant).
9014            QueryExpr::Join(_) => (Action::Select, Resource::Database),
9015            // GRANT / REVOKE / ALTER USER are authority statements;
9016            // require Admin (the helper methods enforce).
9017            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
9018                return if role == crate::auth::Role::Admin {
9019                    Ok(())
9020                } else {
9021                    Err(format!(
9022                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9023                        username, role
9024                    ))
9025                };
9026            }
9027            QueryExpr::CreateIamPolicy { id, .. } => {
9028                return self.check_policy_management_privilege(
9029                    &auth_store,
9030                    &principal_id,
9031                    role,
9032                    tenant.as_deref(),
9033                    "policy:put",
9034                    "policy",
9035                    id,
9036                );
9037            }
9038            QueryExpr::DropIamPolicy { id } => {
9039                return self.check_policy_management_privilege(
9040                    &auth_store,
9041                    &principal_id,
9042                    role,
9043                    tenant.as_deref(),
9044                    "policy:drop",
9045                    "policy",
9046                    id,
9047                );
9048            }
9049            QueryExpr::AttachPolicy { policy_id, .. } => {
9050                return self.check_policy_management_privilege(
9051                    &auth_store,
9052                    &principal_id,
9053                    role,
9054                    tenant.as_deref(),
9055                    "policy:attach",
9056                    "policy",
9057                    policy_id,
9058                );
9059            }
9060            QueryExpr::DetachPolicy { policy_id, .. } => {
9061                return self.check_policy_management_privilege(
9062                    &auth_store,
9063                    &principal_id,
9064                    role,
9065                    tenant.as_deref(),
9066                    "policy:detach",
9067                    "policy",
9068                    policy_id,
9069                );
9070            }
9071            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
9072                return Ok(());
9073            }
9074            QueryExpr::SimulatePolicy { .. } => {
9075                return self.check_policy_management_privilege(
9076                    &auth_store,
9077                    &principal_id,
9078                    role,
9079                    tenant.as_deref(),
9080                    "policy:simulate",
9081                    "policy",
9082                    "*",
9083                );
9084            }
9085            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
9086            // when IAM mode is active. Other DDL stays role-only for now.
9087            QueryExpr::DropTable(q) => {
9088                return self.check_ddl_collection_privilege(
9089                    &auth_store,
9090                    &principal_id,
9091                    role,
9092                    tenant.as_deref(),
9093                    &username,
9094                    "drop",
9095                    &q.name,
9096                );
9097            }
9098            QueryExpr::DropGraph(q) => {
9099                return self.check_ddl_collection_privilege(
9100                    &auth_store,
9101                    &principal_id,
9102                    role,
9103                    tenant.as_deref(),
9104                    &username,
9105                    "drop",
9106                    &q.name,
9107                );
9108            }
9109            QueryExpr::DropVector(q) => {
9110                return self.check_ddl_collection_privilege(
9111                    &auth_store,
9112                    &principal_id,
9113                    role,
9114                    tenant.as_deref(),
9115                    &username,
9116                    "drop",
9117                    &q.name,
9118                );
9119            }
9120            QueryExpr::DropDocument(q) => {
9121                return self.check_ddl_collection_privilege(
9122                    &auth_store,
9123                    &principal_id,
9124                    role,
9125                    tenant.as_deref(),
9126                    &username,
9127                    "drop",
9128                    &q.name,
9129                );
9130            }
9131            QueryExpr::DropKv(q) => {
9132                return self.check_ddl_collection_privilege(
9133                    &auth_store,
9134                    &principal_id,
9135                    role,
9136                    tenant.as_deref(),
9137                    &username,
9138                    "drop",
9139                    &q.name,
9140                );
9141            }
9142            QueryExpr::DropCollection(q) => {
9143                return self.check_ddl_collection_privilege(
9144                    &auth_store,
9145                    &principal_id,
9146                    role,
9147                    tenant.as_deref(),
9148                    &username,
9149                    "drop",
9150                    &q.name,
9151                );
9152            }
9153            QueryExpr::Truncate(q) => {
9154                return self.check_ddl_collection_privilege(
9155                    &auth_store,
9156                    &principal_id,
9157                    role,
9158                    tenant.as_deref(),
9159                    &username,
9160                    "truncate",
9161                    &q.name,
9162                );
9163            }
9164            // Remaining DDL — gate on Write role. Fine-grained grants TBD.
9165            QueryExpr::CreateTable(_)
9166            | QueryExpr::CreateCollection(_)
9167            | QueryExpr::CreateVector(_)
9168            | QueryExpr::AlterTable(_)
9169            | QueryExpr::CreateIndex(_)
9170            | QueryExpr::DropIndex(_)
9171            | QueryExpr::CreateSchema(_)
9172            | QueryExpr::DropSchema(_)
9173            | QueryExpr::CreateSequence(_)
9174            | QueryExpr::DropSequence(_)
9175            | QueryExpr::CreateView(_)
9176            | QueryExpr::DropView(_)
9177            | QueryExpr::RefreshMaterializedView(_)
9178            | QueryExpr::CreatePolicy(_)
9179            | QueryExpr::DropPolicy(_)
9180            | QueryExpr::CreateServer(_)
9181            | QueryExpr::DropServer(_)
9182            | QueryExpr::CreateForeignTable(_)
9183            | QueryExpr::DropForeignTable(_)
9184            | QueryExpr::CreateTimeSeries(_)
9185            | QueryExpr::DropTimeSeries(_)
9186            | QueryExpr::CreateQueue(_)
9187            | QueryExpr::AlterQueue(_)
9188            | QueryExpr::DropQueue(_)
9189            | QueryExpr::CreateTree(_)
9190            | QueryExpr::DropTree(_) => {
9191                return if role >= crate::auth::Role::Write {
9192                    Ok(())
9193                } else {
9194                    Err(format!(
9195                        "principal=`{}` role=`{:?}` cannot issue DDL",
9196                        username, role
9197                    ))
9198                };
9199            }
9200            // Migration DDL — CREATE MIGRATION requires Write role (schema author).
9201            QueryExpr::CreateMigration(_) => {
9202                return if role >= crate::auth::Role::Write {
9203                    Ok(())
9204                } else {
9205                    Err(format!(
9206                        "principal=`{}` role=`{:?}` cannot issue CREATE MIGRATION",
9207                        username, role
9208                    ))
9209                };
9210            }
9211            // APPLY / ROLLBACK change data and schema — require Admin.
9212            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
9213                return if role == crate::auth::Role::Admin {
9214                    Ok(())
9215                } else {
9216                    Err(format!(
9217                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
9218                        username, role
9219                    ))
9220                };
9221            }
9222            // EXPLAIN MIGRATION is read-only — any authenticated principal.
9223            QueryExpr::ExplainMigration(_) => return Ok(()),
9224            // Everything else (SET, SHOW, transaction control, graph
9225            // commands, queue/tree commands, MaintenanceCommand …)
9226            // is allowed for any authenticated principal.
9227            _ => return Ok(()),
9228        };
9229
9230        if auth_store.iam_authorization_enabled() {
9231            let iam_action = legacy_action_to_iam(action);
9232            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
9233            let iam_ctx = runtime_iam_context(role, tenant.as_deref());
9234            if !auth_store.check_policy_authz(&principal_id, iam_action, &iam_resource, &iam_ctx) {
9235                return Err(format!(
9236                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
9237                    username, iam_action, iam_resource.kind, iam_resource.name
9238                ));
9239            }
9240
9241            if let QueryExpr::Table(table) = expr {
9242                self.check_table_column_projection_privilege(
9243                    &auth_store,
9244                    &principal_id,
9245                    &iam_ctx,
9246                    table,
9247                )?;
9248            }
9249
9250            if let QueryExpr::Update(update) = expr {
9251                let columns = update_set_target_columns(update);
9252                if !columns.is_empty() {
9253                    let request = column_access_request_for_table_update(&update.table, columns);
9254                    let outcome =
9255                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
9256                    if let Some(denied) = outcome.first_denied_column() {
9257                        return Err(format!(
9258                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
9259                            username, iam_action, denied.resource.kind, denied.resource.name
9260                        ));
9261                    }
9262                    if !outcome.allowed() {
9263                        return Err(format!(
9264                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
9265                            username,
9266                            iam_action,
9267                            outcome.table_resource.kind,
9268                            outcome.table_resource.name
9269                        ));
9270                    }
9271                }
9272
9273                if let Some(columns) = update_returning_columns_for_policy(self, update) {
9274                    let request = column_access_request_for_table_select(&update.table, columns);
9275                    let outcome =
9276                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
9277                    if let Some(denied) = outcome.first_denied_column() {
9278                        return Err(format!(
9279                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
9280                            username, denied.resource.kind, denied.resource.name
9281                        ));
9282                    }
9283                    if !outcome.allowed() {
9284                        return Err(format!(
9285                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9286                            username, outcome.table_resource.kind, outcome.table_resource.name
9287                        ));
9288                    }
9289                }
9290            }
9291
9292            Ok(())
9293        } else {
9294            auth_store
9295                .check_grant(&ctx, action, &resource)
9296                .map_err(|e| e.to_string())
9297        }
9298    }
9299
9300    fn check_table_column_projection_privilege(
9301        &self,
9302        auth_store: &Arc<crate::auth::store::AuthStore>,
9303        principal: &crate::auth::UserId,
9304        ctx: &crate::auth::policies::EvalContext,
9305        table: &crate::storage::query::ast::TableQuery,
9306    ) -> Result<(), String> {
9307        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
9308
9309        let columns = requested_table_columns_for_policy(table);
9310        if columns.is_empty() {
9311            return Ok(());
9312        }
9313
9314        let request = ColumnAccessRequest::select(table.table.clone(), columns);
9315        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
9316        if outcome.allowed() {
9317            return Ok(());
9318        }
9319
9320        if !matches!(
9321            outcome.table_decision,
9322            crate::auth::policies::Decision::Allow { .. }
9323                | crate::auth::policies::Decision::AdminBypass
9324        ) {
9325            return Err(format!(
9326                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9327                principal, outcome.table_resource.kind, outcome.table_resource.name
9328            ));
9329        }
9330
9331        let denied = outcome
9332            .first_denied_column()
9333            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
9334        match denied {
9335            Some(decision) => Err(format!(
9336                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9337                principal, decision.resource.kind, decision.resource.name
9338            )),
9339            None => Ok(()),
9340        }
9341    }
9342
9343    fn check_graph_property_projection_privilege(
9344        &self,
9345        auth_store: &Arc<crate::auth::store::AuthStore>,
9346        principal: &crate::auth::UserId,
9347        role: crate::auth::Role,
9348        tenant: Option<&str>,
9349        query: &crate::storage::query::ast::GraphQuery,
9350    ) -> Result<(), String> {
9351        let columns = explicit_graph_projection_properties(query);
9352        if columns.is_empty() {
9353            return Ok(());
9354        }
9355        self.check_table_like_column_projection_privilege(
9356            auth_store, principal, role, tenant, "graph", &columns,
9357        )
9358    }
9359
9360    fn check_table_like_column_projection_privilege(
9361        &self,
9362        auth_store: &Arc<crate::auth::store::AuthStore>,
9363        principal: &crate::auth::UserId,
9364        role: crate::auth::Role,
9365        tenant: Option<&str>,
9366        table: &str,
9367        columns: &[String],
9368    ) -> Result<(), String> {
9369        let iam_ctx = runtime_iam_context(role, tenant);
9370        let request =
9371            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
9372        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
9373        if outcome.allowed() {
9374            return Ok(());
9375        }
9376        let denied = outcome
9377            .first_denied_column()
9378            .map(|d| d.resource.name.clone())
9379            .unwrap_or_else(|| format!("{table}.<unknown>"));
9380        Err(format!(
9381            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
9382            principal, denied
9383        ))
9384    }
9385
9386    fn check_policy_management_privilege(
9387        &self,
9388        auth_store: &Arc<crate::auth::store::AuthStore>,
9389        principal: &crate::auth::UserId,
9390        role: crate::auth::Role,
9391        tenant: Option<&str>,
9392        action: &str,
9393        resource_kind: &str,
9394        resource_name: &str,
9395    ) -> Result<(), String> {
9396        if !auth_store.iam_authorization_enabled() {
9397            return if role == crate::auth::Role::Admin {
9398                Ok(())
9399            } else {
9400                Err(format!(
9401                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9402                    principal, role
9403                ))
9404            };
9405        }
9406
9407        let mut resource = crate::auth::policies::ResourceRef::new(
9408            resource_kind.to_string(),
9409            resource_name.to_string(),
9410        );
9411        if let Some(t) = tenant {
9412            resource = resource.with_tenant(t.to_string());
9413        }
9414        let ctx = runtime_iam_context(role, tenant);
9415        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
9416            Ok(())
9417        } else {
9418            Err(format!(
9419                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
9420                principal, action, resource.kind, resource.name
9421            ))
9422        }
9423    }
9424
9425    /// IAM privilege check for DROP / TRUNCATE on a named collection.
9426    ///
9427    /// In legacy mode (IAM not enabled): requires Write role.
9428    /// In IAM mode: requires an explicit `drop` / `truncate` policy on
9429    /// `collection:<name>` (Admin role auto-passes via AdminBypass).
9430    /// Records an audit log entry for both allow and deny outcomes.
9431    fn check_ddl_collection_privilege(
9432        &self,
9433        auth_store: &Arc<crate::auth::store::AuthStore>,
9434        principal: &crate::auth::UserId,
9435        role: crate::auth::Role,
9436        tenant: Option<&str>,
9437        username: &str,
9438        action: &str,
9439        collection: &str,
9440    ) -> Result<(), String> {
9441        if role < crate::auth::Role::Write {
9442            let msg = format!(
9443                "principal=`{}` role=`{:?}` cannot issue DDL",
9444                username, role
9445            );
9446            self.inner.audit_log.record(
9447                action,
9448                username,
9449                collection,
9450                "denied",
9451                crate::json::Value::Null,
9452            );
9453            return Err(msg);
9454        }
9455
9456        if !auth_store.iam_authorization_enabled() {
9457            self.inner.audit_log.record(
9458                action,
9459                username,
9460                collection,
9461                "ok",
9462                crate::json::Value::Null,
9463            );
9464            return Ok(());
9465        }
9466
9467        let resource_name = collection.to_string();
9468        let mut resource = crate::auth::policies::ResourceRef::new(
9469            "collection".to_string(),
9470            resource_name.clone(),
9471        );
9472        if let Some(t) = tenant {
9473            resource = resource.with_tenant(t.to_string());
9474        }
9475        let ctx = runtime_iam_context(role, tenant);
9476        if auth_store.check_policy_authz(principal, action, &resource, &ctx) {
9477            self.inner.audit_log.record(
9478                action,
9479                username,
9480                &resource_name,
9481                "ok",
9482                crate::json::Value::Null,
9483            );
9484            Ok(())
9485        } else {
9486            self.inner.audit_log.record(
9487                action,
9488                username,
9489                &resource_name,
9490                "denied",
9491                crate::json::Value::Null,
9492            );
9493            Err(format!(
9494                "principal=`{}` action=`{}` resource=`collection:{}` denied by IAM policy",
9495                username, action, resource_name
9496            ))
9497        }
9498    }
9499
9500    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
9501    fn execute_grant_statement(
9502        &self,
9503        query: &str,
9504        stmt: &crate::storage::query::ast::GrantStmt,
9505    ) -> RedDBResult<RuntimeQueryResult> {
9506        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
9507        use crate::auth::UserId;
9508        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
9509
9510        let auth_store = self
9511            .inner
9512            .auth_store
9513            .read()
9514            .clone()
9515            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9516
9517        // Granter identity + role.
9518        let (gname, grole) = current_auth_identity().ok_or_else(|| {
9519            RedDBError::Query("GRANT requires an authenticated principal".to_string())
9520        })?;
9521        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
9522        let granter_role = grole;
9523
9524        // Build the action set.
9525        let mut actions: Vec<Action> = Vec::new();
9526        if stmt.all {
9527            actions.push(Action::All);
9528        } else {
9529            for kw in &stmt.actions {
9530                let a = Action::from_keyword(kw).ok_or_else(|| {
9531                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
9532                })?;
9533                actions.push(a);
9534            }
9535        }
9536
9537        // Audit emit (printed; structured emission is Agent #4's lane).
9538        let mut applied = 0usize;
9539        for obj in &stmt.objects {
9540            let resource = match stmt.object_kind {
9541                GrantObjectKind::Table => Resource::Table {
9542                    schema: obj.schema.clone(),
9543                    table: obj.name.clone(),
9544                },
9545                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
9546                GrantObjectKind::Database => Resource::Database,
9547                GrantObjectKind::Function => Resource::Function {
9548                    schema: obj.schema.clone(),
9549                    name: obj.name.clone(),
9550                },
9551            };
9552            for principal in &stmt.principals {
9553                let p = match principal {
9554                    GrantPrincipalRef::Public => GrantPrincipal::Public,
9555                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
9556                    GrantPrincipalRef::User { tenant, name } => {
9557                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
9558                    }
9559                };
9560                // Tenant of the grant follows the granter's tenant
9561                // (cross-tenant guard inside `AuthStore::grant`).
9562                let tenant = granter.tenant.clone();
9563                auth_store
9564                    .grant(
9565                        &granter,
9566                        granter_role,
9567                        p.clone(),
9568                        resource.clone(),
9569                        actions.clone(),
9570                        stmt.with_grant_option,
9571                        tenant.clone(),
9572                    )
9573                    .map_err(|e| RedDBError::Query(e.to_string()))?;
9574
9575                // IAM policy translation: every GRANT also lands as a
9576                // synthetic `_grant_<id>` policy attached to the
9577                // principal so the new evaluator sees it.
9578                if let Some(policy) =
9579                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
9580                {
9581                    let pid = policy.id.clone();
9582                    auth_store
9583                        .put_policy_internal(policy)
9584                        .map_err(|e| RedDBError::Query(e.to_string()))?;
9585                    let attachment = match &p {
9586                        GrantPrincipal::User(uid) => {
9587                            crate::auth::store::PrincipalRef::User(uid.clone())
9588                        }
9589                        GrantPrincipal::Group(group) => {
9590                            crate::auth::store::PrincipalRef::Group(group.clone())
9591                        }
9592                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
9593                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
9594                        ),
9595                    };
9596                    auth_store
9597                        .attach_policy(attachment, &pid)
9598                        .map_err(|e| RedDBError::Query(e.to_string()))?;
9599                }
9600                applied += 1;
9601                tracing::info!(
9602                    target: "audit",
9603                    principal = %granter,
9604                    action = "grant",
9605                    "GRANT applied"
9606                );
9607            }
9608        }
9609
9610        self.invalidate_result_cache();
9611        Ok(RuntimeQueryResult::ok_message(
9612            query.to_string(),
9613            &format!("GRANT applied to {} target(s)", applied),
9614            "grant",
9615        ))
9616    }
9617
9618    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
9619    fn execute_revoke_statement(
9620        &self,
9621        query: &str,
9622        stmt: &crate::storage::query::ast::RevokeStmt,
9623    ) -> RedDBResult<RuntimeQueryResult> {
9624        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
9625        use crate::auth::UserId;
9626        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
9627
9628        let auth_store = self
9629            .inner
9630            .auth_store
9631            .read()
9632            .clone()
9633            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9634
9635        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
9636            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
9637        })?;
9638        let granter_role = grole;
9639
9640        let actions: Vec<Action> = if stmt.all {
9641            vec![Action::All]
9642        } else {
9643            stmt.actions
9644                .iter()
9645                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
9646                .collect()
9647        };
9648
9649        let mut total_removed = 0usize;
9650        for obj in &stmt.objects {
9651            let resource = match stmt.object_kind {
9652                GrantObjectKind::Table => Resource::Table {
9653                    schema: obj.schema.clone(),
9654                    table: obj.name.clone(),
9655                },
9656                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
9657                GrantObjectKind::Database => Resource::Database,
9658                GrantObjectKind::Function => Resource::Function {
9659                    schema: obj.schema.clone(),
9660                    name: obj.name.clone(),
9661                },
9662            };
9663            for principal in &stmt.principals {
9664                let p = match principal {
9665                    GrantPrincipalRef::Public => GrantPrincipal::Public,
9666                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
9667                    GrantPrincipalRef::User { tenant, name } => {
9668                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
9669                    }
9670                };
9671                let removed = auth_store
9672                    .revoke(granter_role, &p, &resource, &actions)
9673                    .map_err(|e| RedDBError::Query(e.to_string()))?;
9674                let _removed_policies =
9675                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
9676                total_removed += removed;
9677            }
9678        }
9679
9680        self.invalidate_result_cache();
9681        Ok(RuntimeQueryResult::ok_message(
9682            query.to_string(),
9683            &format!("REVOKE removed {} grant(s)", total_removed),
9684            "revoke",
9685        ))
9686    }
9687
9688    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
9689    fn execute_alter_user_statement(
9690        &self,
9691        query: &str,
9692        stmt: &crate::storage::query::ast::AlterUserStmt,
9693    ) -> RedDBResult<RuntimeQueryResult> {
9694        use crate::auth::privileges::UserAttributes;
9695        use crate::auth::UserId;
9696        use crate::storage::query::ast::AlterUserAttribute;
9697
9698        let auth_store = self
9699            .inner
9700            .auth_store
9701            .read()
9702            .clone()
9703            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9704
9705        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
9706            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
9707        })?;
9708        if grole != crate::auth::Role::Admin {
9709            return Err(RedDBError::Query(
9710                "ALTER USER requires Admin role".to_string(),
9711            ));
9712        }
9713
9714        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
9715
9716        // Apply attributes incrementally — each one reads the current
9717        // record, mutates the relevant field, writes back.
9718        let mut attrs = auth_store.user_attributes(&target);
9719        let mut enable_change: Option<bool> = None;
9720
9721        for a in &stmt.attributes {
9722            match a {
9723                AlterUserAttribute::ValidUntil(ts) => {
9724                    // Parse ISO-ish timestamp → ms since epoch. Fall
9725                    // back to integer-ms parsing for callers that pass
9726                    // `'1234567890123'`.
9727                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
9728                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
9729                    })?;
9730                    attrs.valid_until = Some(ms);
9731                }
9732                AlterUserAttribute::ConnectionLimit(n) => {
9733                    if *n < 0 {
9734                        return Err(RedDBError::Query(
9735                            "CONNECTION LIMIT must be non-negative".to_string(),
9736                        ));
9737                    }
9738                    attrs.connection_limit = Some(*n as u32);
9739                }
9740                AlterUserAttribute::SetSearchPath(p) => {
9741                    attrs.search_path = Some(p.clone());
9742                }
9743                AlterUserAttribute::AddGroup(g) => {
9744                    if !attrs.groups.iter().any(|existing| existing == g) {
9745                        attrs.groups.push(g.clone());
9746                        attrs.groups.sort();
9747                    }
9748                }
9749                AlterUserAttribute::DropGroup(g) => {
9750                    attrs.groups.retain(|existing| existing != g);
9751                }
9752                AlterUserAttribute::Enable => enable_change = Some(true),
9753                AlterUserAttribute::Disable => enable_change = Some(false),
9754                AlterUserAttribute::Password(_) => {
9755                    // Out of scope — accept the AST but no-op so the
9756                    // parser stays compatible with future password
9757                    // rotation work.
9758                }
9759            }
9760        }
9761
9762        auth_store
9763            .set_user_attributes(&target, attrs)
9764            .map_err(|e| RedDBError::Query(e.to_string()))?;
9765        if let Some(en) = enable_change {
9766            auth_store
9767                .set_user_enabled(&target, en)
9768                .map_err(|e| RedDBError::Query(e.to_string()))?;
9769        }
9770        self.invalidate_result_cache();
9771        tracing::info!(
9772            target: "audit",
9773            principal = %target,
9774            action = "alter_user",
9775            "ALTER USER applied"
9776        );
9777
9778        Ok(RuntimeQueryResult::ok_message(
9779            query.to_string(),
9780            &format!("ALTER USER {} applied", target),
9781            "alter_user",
9782        ))
9783    }
9784
9785    // -----------------------------------------------------------------
9786    // IAM policy executors
9787    // -----------------------------------------------------------------
9788
9789    fn execute_create_iam_policy(
9790        &self,
9791        query: &str,
9792        id: &str,
9793        json: &str,
9794    ) -> RedDBResult<RuntimeQueryResult> {
9795        use crate::auth::policies::Policy;
9796
9797        let auth_store = self
9798            .inner
9799            .auth_store
9800            .read()
9801            .clone()
9802            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9803
9804        // Parse + validate. The kernel rejects oversize / bad shape /
9805        // bad action keywords. If the supplied id differs from the JSON
9806        // id, override it with the SQL-provided id (the JSON id is
9807        // optional context — the SQL DDL form is authoritative).
9808        let mut policy = Policy::from_json_str(json)
9809            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
9810        if policy.id != id {
9811            policy.id = id.to_string();
9812        }
9813        let pid = policy.id.clone();
9814        auth_store
9815            .put_policy(policy)
9816            .map_err(|e| RedDBError::Query(e.to_string()))?;
9817
9818        let principal = current_auth_identity()
9819            .map(|(u, _)| u)
9820            .unwrap_or_else(|| "anonymous".into());
9821        tracing::info!(
9822            target: "audit",
9823            principal = %principal,
9824            action = "iam:policy.put",
9825            matched_policy_id = %pid,
9826            "CREATE POLICY applied"
9827        );
9828        self.inner.audit_log.record(
9829            "iam/policy.put",
9830            &principal,
9831            &pid,
9832            "ok",
9833            crate::json::Value::Null,
9834        );
9835
9836        self.invalidate_result_cache();
9837        Ok(RuntimeQueryResult::ok_message(
9838            query.to_string(),
9839            &format!("policy `{pid}` stored"),
9840            "create_iam_policy",
9841        ))
9842    }
9843
9844    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
9845        let auth_store = self
9846            .inner
9847            .auth_store
9848            .read()
9849            .clone()
9850            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9851        auth_store
9852            .delete_policy(id)
9853            .map_err(|e| RedDBError::Query(e.to_string()))?;
9854
9855        let principal = current_auth_identity()
9856            .map(|(u, _)| u)
9857            .unwrap_or_else(|| "anonymous".into());
9858        tracing::info!(
9859            target: "audit",
9860            principal = %principal,
9861            action = "iam:policy.drop",
9862            matched_policy_id = %id,
9863            "DROP POLICY applied"
9864        );
9865        self.inner.audit_log.record(
9866            "iam/policy.drop",
9867            &principal,
9868            id,
9869            "ok",
9870            crate::json::Value::Null,
9871        );
9872
9873        self.invalidate_result_cache();
9874        Ok(RuntimeQueryResult::ok_message(
9875            query.to_string(),
9876            &format!("policy `{id}` dropped"),
9877            "drop_iam_policy",
9878        ))
9879    }
9880
9881    fn execute_attach_policy(
9882        &self,
9883        query: &str,
9884        policy_id: &str,
9885        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9886    ) -> RedDBResult<RuntimeQueryResult> {
9887        use crate::auth::store::PrincipalRef;
9888        use crate::auth::UserId;
9889        use crate::storage::query::ast::PolicyPrincipalRef;
9890
9891        let auth_store = self
9892            .inner
9893            .auth_store
9894            .read()
9895            .clone()
9896            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9897        let p = match principal {
9898            PolicyPrincipalRef::User(u) => {
9899                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9900            }
9901            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9902        };
9903        let pretty_target = principal_label(principal);
9904        auth_store
9905            .attach_policy(p, policy_id)
9906            .map_err(|e| RedDBError::Query(e.to_string()))?;
9907
9908        let principal_str = current_auth_identity()
9909            .map(|(u, _)| u)
9910            .unwrap_or_else(|| "anonymous".into());
9911        tracing::info!(
9912            target: "audit",
9913            principal = %principal_str,
9914            action = "iam:policy.attach",
9915            matched_policy_id = %policy_id,
9916            target = %pretty_target,
9917            "ATTACH POLICY applied"
9918        );
9919        self.inner.audit_log.record(
9920            "iam/policy.attach",
9921            &principal_str,
9922            &pretty_target,
9923            "ok",
9924            crate::json::Value::Null,
9925        );
9926
9927        self.invalidate_result_cache();
9928        Ok(RuntimeQueryResult::ok_message(
9929            query.to_string(),
9930            &format!("policy `{policy_id}` attached to {pretty_target}"),
9931            "attach_policy",
9932        ))
9933    }
9934
9935    fn execute_detach_policy(
9936        &self,
9937        query: &str,
9938        policy_id: &str,
9939        principal: &crate::storage::query::ast::PolicyPrincipalRef,
9940    ) -> RedDBResult<RuntimeQueryResult> {
9941        use crate::auth::store::PrincipalRef;
9942        use crate::auth::UserId;
9943        use crate::storage::query::ast::PolicyPrincipalRef;
9944
9945        let auth_store = self
9946            .inner
9947            .auth_store
9948            .read()
9949            .clone()
9950            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
9951        let p = match principal {
9952            PolicyPrincipalRef::User(u) => {
9953                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
9954            }
9955            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
9956        };
9957        let pretty_target = principal_label(principal);
9958        auth_store
9959            .detach_policy(p, policy_id)
9960            .map_err(|e| RedDBError::Query(e.to_string()))?;
9961
9962        let principal_str = current_auth_identity()
9963            .map(|(u, _)| u)
9964            .unwrap_or_else(|| "anonymous".into());
9965        tracing::info!(
9966            target: "audit",
9967            principal = %principal_str,
9968            action = "iam:policy.detach",
9969            matched_policy_id = %policy_id,
9970            target = %pretty_target,
9971            "DETACH POLICY applied"
9972        );
9973        self.inner.audit_log.record(
9974            "iam/policy.detach",
9975            &principal_str,
9976            &pretty_target,
9977            "ok",
9978            crate::json::Value::Null,
9979        );
9980
9981        self.invalidate_result_cache();
9982        Ok(RuntimeQueryResult::ok_message(
9983            query.to_string(),
9984            &format!("policy `{policy_id}` detached from {pretty_target}"),
9985            "detach_policy",
9986        ))
9987    }
9988
9989    fn execute_show_policies(
9990        &self,
9991        query: &str,
9992        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
9993    ) -> RedDBResult<RuntimeQueryResult> {
9994        use crate::auth::UserId;
9995        use crate::storage::query::ast::PolicyPrincipalRef;
9996        use crate::storage::query::unified::UnifiedRecord;
9997        use crate::storage::schema::Value as SchemaValue;
9998        use std::sync::Arc;
9999
10000        let auth_store = self
10001            .inner
10002            .auth_store
10003            .read()
10004            .clone()
10005            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10006
10007        let pols = match filter {
10008            None => auth_store.list_policies(),
10009            Some(PolicyPrincipalRef::User(u)) => {
10010                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
10011                auth_store.effective_policies(&id)
10012            }
10013            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
10014        };
10015
10016        let mut records = Vec::with_capacity(pols.len());
10017        for p in pols.iter() {
10018            let mut rec = UnifiedRecord::default();
10019            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
10020            rec.set_arc(
10021                Arc::from("statements"),
10022                SchemaValue::Integer(p.statements.len() as i64),
10023            );
10024            rec.set_arc(
10025                Arc::from("tenant"),
10026                p.tenant
10027                    .as_deref()
10028                    .map(|t| SchemaValue::text(t.to_string()))
10029                    .unwrap_or(SchemaValue::Null),
10030            );
10031            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
10032            records.push(rec);
10033        }
10034        let mut result = crate::storage::query::unified::UnifiedResult::empty();
10035        result.records = records;
10036        Ok(RuntimeQueryResult {
10037            query: query.to_string(),
10038            mode: crate::storage::query::modes::QueryMode::Sql,
10039            statement: "show_policies",
10040            engine: "iam-policies",
10041            result,
10042            affected_rows: 0,
10043            statement_type: "select",
10044        })
10045    }
10046
10047    fn execute_show_effective_permissions(
10048        &self,
10049        query: &str,
10050        user: &crate::storage::query::ast::PolicyUserRef,
10051        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
10052    ) -> RedDBResult<RuntimeQueryResult> {
10053        use crate::auth::UserId;
10054        use crate::storage::query::unified::UnifiedRecord;
10055        use crate::storage::schema::Value as SchemaValue;
10056        use std::sync::Arc;
10057
10058        let auth_store = self
10059            .inner
10060            .auth_store
10061            .read()
10062            .clone()
10063            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10064        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
10065        let pols = auth_store.effective_policies(&id);
10066
10067        // Show one row per (policy, statement) tuple, plus any
10068        // resource-level filter passed by the caller.
10069        let mut records = Vec::new();
10070        for p in pols.iter() {
10071            for (idx, st) in p.statements.iter().enumerate() {
10072                if let Some(_r) = resource {
10073                    // Naive filter: render statement targets to strings
10074                    // and skip if no match. Conservative default = include
10075                    // (the simulator handles fine-grained matching).
10076                }
10077                let mut rec = UnifiedRecord::default();
10078                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
10079                rec.set_arc(
10080                    Arc::from("statement_index"),
10081                    SchemaValue::Integer(idx as i64),
10082                );
10083                rec.set_arc(
10084                    Arc::from("sid"),
10085                    st.sid
10086                        .as_deref()
10087                        .map(|s| SchemaValue::text(s.to_string()))
10088                        .unwrap_or(SchemaValue::Null),
10089                );
10090                rec.set_arc(
10091                    Arc::from("effect"),
10092                    SchemaValue::text(match st.effect {
10093                        crate::auth::policies::Effect::Allow => "allow",
10094                        crate::auth::policies::Effect::Deny => "deny",
10095                    }),
10096                );
10097                rec.set_arc(
10098                    Arc::from("actions"),
10099                    SchemaValue::Integer(st.actions.len() as i64),
10100                );
10101                rec.set_arc(
10102                    Arc::from("resources"),
10103                    SchemaValue::Integer(st.resources.len() as i64),
10104                );
10105                records.push(rec);
10106            }
10107        }
10108        let mut result = crate::storage::query::unified::UnifiedResult::empty();
10109        result.records = records;
10110        Ok(RuntimeQueryResult {
10111            query: query.to_string(),
10112            mode: crate::storage::query::modes::QueryMode::Sql,
10113            statement: "show_effective_permissions",
10114            engine: "iam-policies",
10115            result,
10116            affected_rows: 0,
10117            statement_type: "select",
10118        })
10119    }
10120
10121    fn execute_simulate_policy(
10122        &self,
10123        query: &str,
10124        user: &crate::storage::query::ast::PolicyUserRef,
10125        action: &str,
10126        resource: &crate::storage::query::ast::PolicyResourceRef,
10127    ) -> RedDBResult<RuntimeQueryResult> {
10128        use crate::auth::policies::ResourceRef;
10129        use crate::auth::store::SimCtx;
10130        use crate::auth::UserId;
10131        use crate::storage::query::unified::UnifiedRecord;
10132        use crate::storage::schema::Value as SchemaValue;
10133        use std::sync::Arc;
10134
10135        let auth_store = self
10136            .inner
10137            .auth_store
10138            .read()
10139            .clone()
10140            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10141        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
10142        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
10143        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
10144
10145        let principal_str = current_auth_identity()
10146            .map(|(u, _)| u)
10147            .unwrap_or_else(|| "anonymous".into());
10148        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
10149        tracing::info!(
10150            target: "audit",
10151            principal = %principal_str,
10152            action = "iam:policy.simulate",
10153            decision = %decision_str,
10154            matched_policy_id = ?matched_pid,
10155            matched_sid = ?matched_sid,
10156            "SIMULATE issued"
10157        );
10158        self.inner.audit_log.record(
10159            "iam/policy.simulate",
10160            &principal_str,
10161            &id.to_string(),
10162            "ok",
10163            crate::json::Value::Null,
10164        );
10165
10166        let mut rec = UnifiedRecord::default();
10167        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
10168        rec.set_arc(
10169            Arc::from("matched_policy_id"),
10170            matched_pid
10171                .map(SchemaValue::text)
10172                .unwrap_or(SchemaValue::Null),
10173        );
10174        rec.set_arc(
10175            Arc::from("matched_sid"),
10176            matched_sid
10177                .map(SchemaValue::text)
10178                .unwrap_or(SchemaValue::Null),
10179        );
10180        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
10181        rec.set_arc(
10182            Arc::from("trail_len"),
10183            SchemaValue::Integer(outcome.trail.len() as i64),
10184        );
10185        let mut result = crate::storage::query::unified::UnifiedResult::empty();
10186        result.records = vec![rec];
10187        Ok(RuntimeQueryResult {
10188            query: query.to_string(),
10189            mode: crate::storage::query::modes::QueryMode::Sql,
10190            statement: "simulate_policy",
10191            engine: "iam-policies",
10192            result,
10193            affected_rows: 0,
10194            statement_type: "select",
10195        })
10196    }
10197}
10198
10199/// Translate a parsed GRANT into a synthetic IAM policy whose id
10200/// starts with `_grant_<unique>`. PUBLIC is represented as an
10201/// implicit IAM group; legacy GROUP grants are still rejected by the
10202/// grant store and are not translated here.
10203fn grant_to_iam_policy(
10204    principal: &crate::auth::privileges::GrantPrincipal,
10205    resource: &crate::auth::privileges::Resource,
10206    actions: &[crate::auth::privileges::Action],
10207    tenant: Option<&str>,
10208) -> Option<crate::auth::policies::Policy> {
10209    use crate::auth::policies::{
10210        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
10211    };
10212    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10213
10214    if matches!(principal, GrantPrincipal::Group(_)) {
10215        return None;
10216    }
10217
10218    let now = crate::auth::now_ms();
10219    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
10220
10221    let resource_str = match resource {
10222        Resource::Database => "table:*".to_string(),
10223        Resource::Schema(s) => format!("table:{s}.*"),
10224        Resource::Table { schema, table } => match schema {
10225            Some(s) => format!("table:{s}.{table}"),
10226            None => format!("table:{table}"),
10227        },
10228        Resource::Function { schema, name } => match schema {
10229            Some(s) => format!("function:{s}.{name}"),
10230            None => format!("function:{name}"),
10231        },
10232    };
10233
10234    // Compile actions — fall back to `*` only when the grant included
10235    // `Action::All`. Map every other action keyword to its lowercase
10236    // form so it lines up with the kernel's allowlist.
10237    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
10238        vec![ActionPattern::Wildcard]
10239    } else {
10240        actions
10241            .iter()
10242            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
10243            .collect()
10244    };
10245    if action_patterns.is_empty() {
10246        return None;
10247    }
10248
10249    // Inline resource compilation matching the kernel's `compile_resource`:
10250    //   * `*` → wildcard
10251    //   * contains `*` → glob
10252    //   * `kind:name` → exact
10253    let resource_patterns = if resource_str == "*" {
10254        vec![ResourcePattern::Wildcard]
10255    } else if resource_str.contains('*') {
10256        vec![ResourcePattern::Glob(resource_str.clone())]
10257    } else if let Some((kind, name)) = resource_str.split_once(':') {
10258        vec![ResourcePattern::Exact {
10259            kind: kind.to_string(),
10260            name: name.to_string(),
10261        }]
10262    } else {
10263        vec![ResourcePattern::Wildcard]
10264    };
10265
10266    let policy = Policy {
10267        id,
10268        version: 1,
10269        tenant: tenant.map(|t| t.to_string()),
10270        created_at: now,
10271        updated_at: now,
10272        statements: vec![Statement {
10273            sid: None,
10274            effect: Effect::Allow,
10275            actions: action_patterns,
10276            resources: resource_patterns,
10277            condition: None,
10278        }],
10279    };
10280    if policy.validate().is_err() {
10281        return None;
10282    }
10283    Some(policy)
10284}
10285
10286fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
10287    use crate::auth::privileges::Action;
10288    match action {
10289        Action::Select => "select",
10290        Action::Insert => "insert",
10291        Action::Update => "update",
10292        Action::Delete => "delete",
10293        Action::Truncate => "truncate",
10294        Action::References => "references",
10295        Action::Execute => "execute",
10296        Action::Usage => "usage",
10297        Action::All => "*",
10298    }
10299}
10300
10301fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
10302    let mut columns = Vec::new();
10303    for (column, _) in &query.assignment_exprs {
10304        if !columns.iter().any(|seen| seen == column) {
10305            columns.push(column.clone());
10306        }
10307    }
10308    columns
10309}
10310
10311fn column_access_request_for_table_update(
10312    table_name: &str,
10313    columns: Vec<String>,
10314) -> crate::auth::ColumnAccessRequest {
10315    match table_name.split_once('.') {
10316        Some((schema, table)) => {
10317            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
10318                .with_schema(schema.to_string())
10319        }
10320        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
10321    }
10322}
10323
10324fn column_access_request_for_table_select(
10325    table_name: &str,
10326    columns: Vec<String>,
10327) -> crate::auth::ColumnAccessRequest {
10328    match table_name.split_once('.') {
10329        Some((schema, table)) => {
10330            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
10331                .with_schema(schema.to_string())
10332        }
10333        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
10334    }
10335}
10336
10337fn update_returning_columns_for_policy(
10338    runtime: &RedDBRuntime,
10339    query: &crate::storage::query::ast::UpdateQuery,
10340) -> Option<Vec<String>> {
10341    let items = query.returning.as_ref()?;
10342    let mut columns = Vec::new();
10343    let project_all = items
10344        .iter()
10345        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
10346    if project_all {
10347        collect_returning_star_columns(runtime, query, &mut columns);
10348    } else {
10349        for item in items {
10350            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
10351                continue;
10352            };
10353            push_returning_policy_column(&mut columns, column);
10354        }
10355    }
10356    (!columns.is_empty()).then_some(columns)
10357}
10358
10359fn collect_returning_star_columns(
10360    runtime: &RedDBRuntime,
10361    query: &crate::storage::query::ast::UpdateQuery,
10362    columns: &mut Vec<String>,
10363) {
10364    let store = runtime.db().store();
10365    let Some(manager) = store.get_collection(&query.table) else {
10366        return;
10367    };
10368    if let Some(schema) = manager.column_schema() {
10369        for column in schema.iter() {
10370            push_returning_policy_column(columns, column);
10371        }
10372    }
10373    for entity in manager.query_all(|_| true) {
10374        if !returning_entity_matches_update_target(&entity, query.target) {
10375            continue;
10376        }
10377        match &entity.data {
10378            crate::storage::EntityData::Row(row) => {
10379                for (column, _) in row.iter_fields() {
10380                    push_returning_policy_column(columns, column);
10381                }
10382            }
10383            crate::storage::EntityData::Node(node) => {
10384                push_returning_policy_column(columns, "label");
10385                push_returning_policy_column(columns, "node_type");
10386                for column in node.properties.keys() {
10387                    push_returning_policy_column(columns, column);
10388                }
10389            }
10390            crate::storage::EntityData::Edge(edge) => {
10391                push_returning_policy_column(columns, "label");
10392                push_returning_policy_column(columns, "from_rid");
10393                push_returning_policy_column(columns, "to_rid");
10394                push_returning_policy_column(columns, "weight");
10395                for column in edge.properties.keys() {
10396                    push_returning_policy_column(columns, column);
10397                }
10398            }
10399            _ => {}
10400        }
10401    }
10402}
10403
10404fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
10405    if returning_public_envelope_column(column) {
10406        return;
10407    }
10408    if !columns.iter().any(|seen| seen == column) {
10409        columns.push(column.to_string());
10410    }
10411}
10412
/// Report whether `column` is one of the envelope column names
/// (`rid`, `collection`, `kind`, `tenant`, `created_at`, `updated_at`,
/// `red_entity_id`), compared ASCII case-insensitively.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 7] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
        "red_entity_id",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|name| column.eq_ignore_ascii_case(name))
}
10419
10420fn returning_entity_matches_update_target(
10421    entity: &crate::storage::UnifiedEntity,
10422    target: crate::storage::query::ast::UpdateTarget,
10423) -> bool {
10424    use crate::storage::query::ast::UpdateTarget;
10425    match target {
10426        UpdateTarget::Rows => {
10427            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
10428        }
10429        UpdateTarget::Documents => {
10430            matches!(
10431                returning_row_item_kind(entity),
10432                Some(ReturningRowKind::Document)
10433            )
10434        }
10435        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
10436        UpdateTarget::Nodes => matches!(
10437            (&entity.kind, &entity.data),
10438            (
10439                crate::storage::EntityKind::GraphNode(_),
10440                crate::storage::EntityData::Node(_)
10441            )
10442        ),
10443        UpdateTarget::Edges => matches!(
10444            (&entity.kind, &entity.data),
10445            (
10446                crate::storage::EntityKind::GraphEdge(_),
10447                crate::storage::EntityData::Edge(_)
10448            )
10449        ),
10450    }
10451}
10452
/// Classification of a row-shaped entity, as produced by
/// `returning_row_item_kind`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ReturningRowKind {
    /// A row that is neither key/value-shaped nor carries a JSON value.
    Row,
    /// A row with at least one JSON-valued field.
    Document,
    /// A row whose fields are all named `key` or `value`.
    Kv,
}
10459
10460fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
10461    let row = entity.data.as_row()?;
10462    let is_kv = row.iter_fields().all(|(column, _)| {
10463        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
10464    });
10465    if is_kv {
10466        return Some(ReturningRowKind::Kv);
10467    }
10468    let is_document = row
10469        .iter_fields()
10470        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
10471    if is_document {
10472        Some(ReturningRowKind::Document)
10473    } else {
10474        Some(ReturningRowKind::Row)
10475    }
10476}
10477
10478fn requested_table_columns_for_policy(
10479    table: &crate::storage::query::ast::TableQuery,
10480) -> Vec<String> {
10481    use crate::storage::query::sql_lowering::{
10482        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
10483        effective_table_projections,
10484    };
10485
10486    let table_name = table.table.as_str();
10487    let table_alias = table.alias.as_deref();
10488    let mut columns = std::collections::BTreeSet::new();
10489
10490    for projection in effective_table_projections(table) {
10491        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
10492    }
10493    if let Some(filter) = effective_table_filter(table) {
10494        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
10495    }
10496    for expr in effective_table_group_by_exprs(table) {
10497        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
10498    }
10499    if let Some(filter) = effective_table_having_filter(table) {
10500        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
10501    }
10502    for order in &table.order_by {
10503        if let Some(expr) = order.expr.as_ref() {
10504            collect_expr_columns(expr, table_name, table_alias, &mut columns);
10505        } else {
10506            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
10507        }
10508    }
10509
10510    columns.into_iter().collect()
10511}
10512
10513fn collect_projection_columns(
10514    projection: &crate::storage::query::ast::Projection,
10515    table_name: &str,
10516    table_alias: Option<&str>,
10517    columns: &mut std::collections::BTreeSet<String>,
10518) {
10519    use crate::storage::query::ast::Projection;
10520    match projection {
10521        Projection::All => {
10522            columns.insert("*".to_string());
10523        }
10524        Projection::Column(column) | Projection::Alias(column, _) => {
10525            if column != "*" {
10526                columns.insert(column.clone());
10527            }
10528        }
10529        Projection::Function(_, args) => {
10530            for arg in args {
10531                collect_projection_columns(arg, table_name, table_alias, columns);
10532            }
10533        }
10534        Projection::Expression(filter, _) => {
10535            collect_filter_columns(filter, table_name, table_alias, columns);
10536        }
10537        Projection::Field(field, _) => {
10538            collect_field_ref_column(field, table_name, table_alias, columns);
10539        }
10540        // Slice 7a (#589): no runtime support yet; recurse into args so
10541        // any column references are still tracked in case a future
10542        // executor needs the column set.
10543        Projection::Window { args, .. } => {
10544            for arg in args {
10545                collect_projection_columns(arg, table_name, table_alias, columns);
10546            }
10547        }
10548    }
10549}
10550
10551fn collect_filter_columns(
10552    filter: &crate::storage::query::ast::Filter,
10553    table_name: &str,
10554    table_alias: Option<&str>,
10555    columns: &mut std::collections::BTreeSet<String>,
10556) {
10557    use crate::storage::query::ast::Filter;
10558    match filter {
10559        Filter::Compare { field, .. }
10560        | Filter::IsNull(field)
10561        | Filter::IsNotNull(field)
10562        | Filter::In { field, .. }
10563        | Filter::Between { field, .. }
10564        | Filter::Like { field, .. }
10565        | Filter::StartsWith { field, .. }
10566        | Filter::EndsWith { field, .. }
10567        | Filter::Contains { field, .. } => {
10568            collect_field_ref_column(field, table_name, table_alias, columns);
10569        }
10570        Filter::CompareFields { left, right, .. } => {
10571            collect_field_ref_column(left, table_name, table_alias, columns);
10572            collect_field_ref_column(right, table_name, table_alias, columns);
10573        }
10574        Filter::CompareExpr { lhs, rhs, .. } => {
10575            collect_expr_columns(lhs, table_name, table_alias, columns);
10576            collect_expr_columns(rhs, table_name, table_alias, columns);
10577        }
10578        Filter::And(left, right) | Filter::Or(left, right) => {
10579            collect_filter_columns(left, table_name, table_alias, columns);
10580            collect_filter_columns(right, table_name, table_alias, columns);
10581        }
10582        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
10583    }
10584}
10585
10586fn collect_expr_columns(
10587    expr: &crate::storage::query::ast::Expr,
10588    table_name: &str,
10589    table_alias: Option<&str>,
10590    columns: &mut std::collections::BTreeSet<String>,
10591) {
10592    use crate::storage::query::ast::Expr;
10593    match expr {
10594        Expr::Column { field, .. } => {
10595            collect_field_ref_column(field, table_name, table_alias, columns);
10596        }
10597        Expr::Literal { .. } | Expr::Parameter { .. } => {}
10598        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
10599            collect_expr_columns(operand, table_name, table_alias, columns);
10600        }
10601        Expr::BinaryOp { lhs, rhs, .. } => {
10602            collect_expr_columns(lhs, table_name, table_alias, columns);
10603            collect_expr_columns(rhs, table_name, table_alias, columns);
10604        }
10605        Expr::FunctionCall { args, .. } => {
10606            for arg in args {
10607                collect_expr_columns(arg, table_name, table_alias, columns);
10608            }
10609        }
10610        Expr::Case {
10611            branches, else_, ..
10612        } => {
10613            for (condition, value) in branches {
10614                collect_expr_columns(condition, table_name, table_alias, columns);
10615                collect_expr_columns(value, table_name, table_alias, columns);
10616            }
10617            if let Some(value) = else_ {
10618                collect_expr_columns(value, table_name, table_alias, columns);
10619            }
10620        }
10621        Expr::IsNull { operand, .. } => {
10622            collect_expr_columns(operand, table_name, table_alias, columns);
10623        }
10624        Expr::InList { target, values, .. } => {
10625            collect_expr_columns(target, table_name, table_alias, columns);
10626            for value in values {
10627                collect_expr_columns(value, table_name, table_alias, columns);
10628            }
10629        }
10630        Expr::Between {
10631            target, low, high, ..
10632        } => {
10633            collect_expr_columns(target, table_name, table_alias, columns);
10634            collect_expr_columns(low, table_name, table_alias, columns);
10635            collect_expr_columns(high, table_name, table_alias, columns);
10636        }
10637        Expr::Subquery { .. } => {}
10638        Expr::WindowFunctionCall { args, window, .. } => {
10639            for arg in args {
10640                collect_expr_columns(arg, table_name, table_alias, columns);
10641            }
10642            for e in &window.partition_by {
10643                collect_expr_columns(e, table_name, table_alias, columns);
10644            }
10645            for o in &window.order_by {
10646                collect_expr_columns(&o.expr, table_name, table_alias, columns);
10647            }
10648        }
10649    }
10650}
10651
10652fn collect_field_ref_column(
10653    field: &crate::storage::query::ast::FieldRef,
10654    table_name: &str,
10655    table_alias: Option<&str>,
10656    columns: &mut std::collections::BTreeSet<String>,
10657) {
10658    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
10659        if column != "*" {
10660            columns.insert(column);
10661        }
10662    }
10663}
10664
10665fn policy_column_name_from_field_ref(
10666    field: &crate::storage::query::ast::FieldRef,
10667    table_name: &str,
10668    table_alias: Option<&str>,
10669) -> Option<String> {
10670    match field {
10671        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
10672            if column == "*" {
10673                return Some("*".to_string());
10674            }
10675            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
10676                Some(column.clone())
10677            } else {
10678                Some(format!("{table}.{column}"))
10679            }
10680        }
10681        _ => None,
10682    }
10683}
10684
10685fn legacy_resource_to_iam(
10686    resource: &crate::auth::privileges::Resource,
10687    tenant: Option<&str>,
10688) -> crate::auth::policies::ResourceRef {
10689    use crate::auth::privileges::Resource;
10690
10691    let (kind, name) = match resource {
10692        Resource::Database => ("database".to_string(), "*".to_string()),
10693        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
10694        Resource::Table { schema, table } => (
10695            "table".to_string(),
10696            match schema {
10697                Some(s) => format!("{s}.{table}"),
10698                None => table.clone(),
10699            },
10700        ),
10701        Resource::Function { schema, name } => (
10702            "function".to_string(),
10703            match schema {
10704                Some(s) => format!("{s}.{name}"),
10705                None => name.clone(),
10706            },
10707        ),
10708    };
10709
10710    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
10711    if let Some(t) = tenant {
10712        out = out.with_tenant(t.to_string());
10713    }
10714    out
10715}
10716
/// Name and alias of one table operand of a join, as built by
/// `table_side_context`.
#[derive(Debug)]
struct JoinTableSide {
    /// Table name of this join side.
    table: String,
    /// Alias of this join side; `table_side_context` falls back to the
    /// table name when the query gives no alias.
    alias: String,
}
10722
10723fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
10724    match expr {
10725        QueryExpr::Table(table) => Some(JoinTableSide {
10726            table: table.table.clone(),
10727            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
10728        }),
10729        _ => None,
10730    }
10731}
10732
10733fn collect_projection_columns_for_table(
10734    projection: &Projection,
10735    table: &str,
10736    alias: Option<&str>,
10737    out: &mut BTreeSet<String>,
10738) {
10739    match projection {
10740        Projection::Column(column) | Projection::Alias(column, _) => {
10741            match split_qualified_column(column) {
10742                Some((qualifier, column))
10743                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
10744                {
10745                    push_policy_column(column, out);
10746                }
10747                Some(_) => {}
10748                None => push_policy_column(column, out),
10749            }
10750        }
10751        Projection::Field(
10752            FieldRef::TableColumn {
10753                table: qualifier,
10754                column,
10755            },
10756            _,
10757        ) => {
10758            if qualifier.is_empty()
10759                || qualifier == table
10760                || alias.is_some_and(|alias| qualifier == alias)
10761            {
10762                push_policy_column(column, out);
10763            }
10764        }
10765        Projection::Field(
10766            FieldRef::NodeProperty {
10767                alias: qualifier,
10768                property,
10769            },
10770            _,
10771        )
10772        | Projection::Field(
10773            FieldRef::EdgeProperty {
10774                alias: qualifier,
10775                property,
10776            },
10777            _,
10778        ) => {
10779            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
10780                push_policy_column(property, out);
10781            }
10782        }
10783        Projection::Function(_, args) => {
10784            for arg in args {
10785                collect_projection_columns_for_table(arg, table, alias, out);
10786            }
10787        }
10788        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
10789        Projection::Window { args, .. } => {
10790            for arg in args {
10791                collect_projection_columns_for_table(arg, table, alias, out);
10792            }
10793        }
10794    }
10795}
10796
10797fn collect_projection_columns_for_join_side(
10798    projection: &Projection,
10799    left: Option<&JoinTableSide>,
10800    right: Option<&JoinTableSide>,
10801    out: &mut HashMap<String, BTreeSet<String>>,
10802) -> RedDBResult<()> {
10803    match projection {
10804        Projection::Column(column) | Projection::Alias(column, _) => {
10805            if let Some((qualifier, column)) = split_qualified_column(column) {
10806                push_qualified_join_column(qualifier, column, left, right, out);
10807            } else {
10808                push_unqualified_join_column(column, left, right, out);
10809            }
10810        }
10811        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
10812            if table.is_empty() {
10813                push_unqualified_join_column(column, left, right, out);
10814            } else if let Some(side) = [left, right]
10815                .into_iter()
10816                .flatten()
10817                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
10818            {
10819                push_join_column(&side.table, column, out);
10820            }
10821        }
10822        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
10823        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
10824            push_qualified_join_column(alias, property, left, right, out);
10825        }
10826        Projection::Function(_, args) => {
10827            for arg in args {
10828                collect_projection_columns_for_join_side(arg, left, right, out)?;
10829            }
10830        }
10831        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
10832        Projection::Window { args, .. } => {
10833            for arg in args {
10834                collect_projection_columns_for_join_side(arg, left, right, out)?;
10835            }
10836        }
10837    }
10838    Ok(())
10839}
10840
/// Splits `qualifier.column` at its single dot.
///
/// Returns `None` when there is no dot, when either side of the dot is
/// empty, or when the part after the first dot contains another dot.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    column.split_once('.').filter(|(qualifier, name)| {
        !qualifier.is_empty() && !name.is_empty() && !name.contains('.')
    })
}
10848
10849fn push_qualified_join_column(
10850    qualifier: &str,
10851    column: &str,
10852    left: Option<&JoinTableSide>,
10853    right: Option<&JoinTableSide>,
10854    out: &mut HashMap<String, BTreeSet<String>>,
10855) {
10856    if let Some(side) = [left, right]
10857        .into_iter()
10858        .flatten()
10859        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
10860    {
10861        push_join_column(&side.table, column, out);
10862    }
10863}
10864
10865fn push_unqualified_join_column(
10866    column: &str,
10867    left: Option<&JoinTableSide>,
10868    right: Option<&JoinTableSide>,
10869    out: &mut HashMap<String, BTreeSet<String>>,
10870) {
10871    for side in [left, right].into_iter().flatten() {
10872        push_join_column(&side.table, column, out);
10873    }
10874}
10875
10876fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
10877    if is_policy_column_name(column) {
10878        out.entry(table.to_string())
10879            .or_default()
10880            .insert(column.to_string());
10881    }
10882}
10883
10884fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
10885    if is_policy_column_name(column) {
10886        out.insert(column.to_string());
10887    }
10888}
10889
/// Returns `true` when `column` is a real column name for policy purposes:
/// non-empty, not the `"*"` wildcard, and not carrying a `LIT:` or `TYPE:`
/// prefix.
fn is_policy_column_name(column: &str) -> bool {
    const MARKER_PREFIXES: [&str; 2] = ["LIT:", "TYPE:"];

    let is_placeholder = column.is_empty() || column == "*";
    let is_marker = MARKER_PREFIXES
        .iter()
        .any(|prefix| column.starts_with(*prefix));
    !(is_placeholder || is_marker)
}
10896
10897fn runtime_iam_context(
10898    role: crate::auth::Role,
10899    tenant: Option<&str>,
10900) -> crate::auth::policies::EvalContext {
10901    crate::auth::policies::EvalContext {
10902        principal_tenant: tenant.map(|t| t.to_string()),
10903        current_tenant: tenant.map(|t| t.to_string()),
10904        peer_ip: None,
10905        mfa_present: false,
10906        now_ms: crate::auth::now_ms(),
10907        principal_is_admin_role: role == crate::auth::Role::Admin,
10908    }
10909}
10910
10911fn explicit_table_projection_columns(
10912    query: &crate::storage::query::ast::TableQuery,
10913) -> Vec<String> {
10914    use crate::storage::query::ast::{FieldRef, Projection};
10915
10916    let mut columns = Vec::new();
10917    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
10918        match projection {
10919            Projection::Column(column) | Projection::Alias(column, _) => {
10920                push_unique(&mut columns, column)
10921            }
10922            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
10923                push_unique(&mut columns, column)
10924            }
10925            // SELECT * and expression/function projections need the
10926            // executor-wide column-policy context mapped in
10927            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
10928            _ => {}
10929        }
10930    }
10931    columns
10932}
10933
10934fn explicit_graph_projection_properties(
10935    query: &crate::storage::query::ast::GraphQuery,
10936) -> Vec<String> {
10937    use crate::storage::query::ast::{FieldRef, Projection};
10938
10939    let mut columns = Vec::new();
10940    for projection in &query.return_ {
10941        match projection {
10942            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
10943            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
10944                push_unique(&mut columns, property.clone())
10945            }
10946            _ => {}
10947        }
10948    }
10949    columns
10950}
10951
/// Appends `column` to `columns` unless an equal string is already there
/// (exact, case-sensitive comparison).
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
10957
10958fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
10959    use crate::storage::query::ast::PolicyPrincipalRef;
10960    match p {
10961        PolicyPrincipalRef::User(u) => match &u.tenant {
10962            Some(t) => format!("user:{t}/{}", u.username),
10963            None => format!("user:{}", u.username),
10964        },
10965        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
10966    }
10967}
10968
10969/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
10970/// shape used by every audit emit + the simulator response.
10971pub(crate) fn decision_to_strings(
10972    d: &crate::auth::policies::Decision,
10973) -> (String, Option<String>, Option<String>) {
10974    use crate::auth::policies::Decision;
10975    match d {
10976        Decision::Allow {
10977            matched_policy_id,
10978            matched_sid,
10979        } => (
10980            "allow".into(),
10981            Some(matched_policy_id.clone()),
10982            matched_sid.clone(),
10983        ),
10984        Decision::Deny {
10985            matched_policy_id,
10986            matched_sid,
10987        } => (
10988            "deny".into(),
10989            Some(matched_policy_id.clone()),
10990            matched_sid.clone(),
10991        ),
10992        Decision::DefaultDeny => ("default_deny".into(), None, None),
10993        Decision::AdminBypass => ("admin_bypass".into(), None, None),
10994    }
10995}
10996
10997fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
10998    let mut scopes = Vec::new();
10999    collect_relation_scopes(query, &mut scopes);
11000    scopes.sort();
11001    scopes.dedup();
11002    scopes
11003}
11004
11005fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
11006    match query {
11007        QueryExpr::Table(table) => {
11008            if !table.table.is_empty() {
11009                scopes.push(table.table.clone());
11010            }
11011            if let Some(alias) = &table.alias {
11012                scopes.push(alias.clone());
11013            }
11014        }
11015        QueryExpr::Join(join) => {
11016            collect_relation_scopes(&join.left, scopes);
11017            collect_relation_scopes(&join.right, scopes);
11018        }
11019        _ => {}
11020    }
11021}
11022
11023fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
11024    let inner_scopes = relation_scopes_for_query(query);
11025    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
11026}
11027
11028fn query_expr_references_outer_scope(
11029    query: &QueryExpr,
11030    outer_scopes: &[String],
11031    inner_scopes: &[String],
11032) -> bool {
11033    match query {
11034        QueryExpr::Table(table) => {
11035            table.select_items.iter().any(|item| match item {
11036                crate::storage::query::ast::SelectItem::Wildcard => false,
11037                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
11038                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11039                }
11040            }) || table
11041                .where_expr
11042                .as_ref()
11043                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
11044                || table.filter.as_ref().is_some_and(|filter| {
11045                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
11046                })
11047                || table.having_expr.as_ref().is_some_and(|expr| {
11048                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11049                })
11050                || table.having.as_ref().is_some_and(|filter| {
11051                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
11052                })
11053                || table
11054                    .group_by_exprs
11055                    .iter()
11056                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
11057                || table.order_by.iter().any(|clause| {
11058                    clause.expr.as_ref().is_some_and(|expr| {
11059                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11060                    })
11061                })
11062        }
11063        QueryExpr::Join(join) => {
11064            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
11065                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
11066                || join.filter.as_ref().is_some_and(|filter| {
11067                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
11068                })
11069                || join.return_items.iter().any(|item| match item {
11070                    crate::storage::query::ast::SelectItem::Wildcard => false,
11071                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
11072                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11073                    }
11074                })
11075        }
11076        _ => false,
11077    }
11078}
11079
11080fn filter_references_outer_scope(
11081    filter: &crate::storage::query::ast::Filter,
11082    outer_scopes: &[String],
11083    inner_scopes: &[String],
11084) -> bool {
11085    use crate::storage::query::ast::Filter;
11086    match filter {
11087        Filter::Compare { field, .. }
11088        | Filter::IsNull(field)
11089        | Filter::IsNotNull(field)
11090        | Filter::In { field, .. }
11091        | Filter::Between { field, .. }
11092        | Filter::Like { field, .. }
11093        | Filter::StartsWith { field, .. }
11094        | Filter::EndsWith { field, .. }
11095        | Filter::Contains { field, .. } => {
11096            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
11097        }
11098        Filter::CompareFields { left, right, .. } => {
11099            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
11100                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
11101        }
11102        Filter::CompareExpr { lhs, rhs, .. } => {
11103            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
11104                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
11105        }
11106        Filter::And(left, right) | Filter::Or(left, right) => {
11107            filter_references_outer_scope(left, outer_scopes, inner_scopes)
11108                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
11109        }
11110        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
11111    }
11112}
11113
11114fn expr_references_outer_scope(
11115    expr: &crate::storage::query::ast::Expr,
11116    outer_scopes: &[String],
11117    inner_scopes: &[String],
11118) -> bool {
11119    use crate::storage::query::ast::Expr;
11120    match expr {
11121        Expr::Column { field, .. } => {
11122            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
11123        }
11124        Expr::BinaryOp { lhs, rhs, .. } => {
11125            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
11126                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
11127        }
11128        Expr::UnaryOp { operand, .. }
11129        | Expr::Cast { inner: operand, .. }
11130        | Expr::IsNull { operand, .. } => {
11131            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
11132        }
11133        Expr::FunctionCall { args, .. } => args
11134            .iter()
11135            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
11136        Expr::Case {
11137            branches, else_, ..
11138        } => {
11139            branches.iter().any(|(cond, value)| {
11140                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
11141                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
11142            }) || else_
11143                .as_ref()
11144                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
11145        }
11146        Expr::InList { target, values, .. } => {
11147            expr_references_outer_scope(target, outer_scopes, inner_scopes)
11148                || values
11149                    .iter()
11150                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
11151        }
11152        Expr::Between {
11153            target, low, high, ..
11154        } => {
11155            expr_references_outer_scope(target, outer_scopes, inner_scopes)
11156                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
11157                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
11158        }
11159        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
11160        Expr::Literal { .. } | Expr::Parameter { .. } => false,
11161        Expr::WindowFunctionCall { args, window, .. } => {
11162            args.iter()
11163                .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes))
11164                || window
11165                    .partition_by
11166                    .iter()
11167                    .any(|e| expr_references_outer_scope(e, outer_scopes, inner_scopes))
11168                || window
11169                    .order_by
11170                    .iter()
11171                    .any(|o| expr_references_outer_scope(&o.expr, outer_scopes, inner_scopes))
11172        }
11173    }
11174}
11175
11176fn field_ref_references_outer_scope(
11177    field: &crate::storage::query::ast::FieldRef,
11178    outer_scopes: &[String],
11179    inner_scopes: &[String],
11180) -> bool {
11181    match field {
11182        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
11183            outer_scopes.iter().any(|scope| scope == table)
11184                && !inner_scopes.iter().any(|scope| scope == table)
11185        }
11186        _ => false,
11187    }
11188}
11189
11190fn first_column_values(
11191    result: crate::storage::query::unified::UnifiedResult,
11192) -> RedDBResult<Vec<Value>> {
11193    if result.columns.len() > 1 {
11194        return Err(RedDBError::Query(
11195            "expression subquery must return exactly one column".to_string(),
11196        ));
11197    }
11198    let fallback_column = result
11199        .records
11200        .first()
11201        .and_then(|record| record.column_names().into_iter().next())
11202        .map(|name| name.to_string());
11203    let column = result.columns.first().cloned().or(fallback_column);
11204    let Some(column) = column else {
11205        return Ok(Vec::new());
11206    };
11207    Ok(result
11208        .records
11209        .iter()
11210        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
11211        .collect())
11212}
11213
/// Parses a timestamp literal into milliseconds since the Unix epoch.
///
/// Two forms are accepted:
/// * a bare unsigned integer, returned verbatim as milliseconds;
/// * `YYYY-MM-DD`, optionally followed by whitespace and further text
///   (e.g. a time-of-day). Only the date is used, so the result is
///   always midnight UTC of that day.
///
/// Returns `None` for anything else, for impossible calendar dates
/// (month 13, day 0, `2023-02-29`, ...) and for dates before
/// 1970-01-01, which the unsigned result cannot represent.
fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
    const MS_PER_DAY: u128 = 86_400_000;
    // A year contributes fewer than 366 days, so this bound keeps the day
    // count computed by `days_from_civil` inside `i64`.
    const MAX_YEAR: i64 = i64::MAX / 366;

    // Bare integer ms.
    if let Ok(ms) = s.parse::<u128>() {
        return Some(ms);
    }

    // Date form: exactly three `-`-separated numeric fields.
    let date = s.split_whitespace().next()?;
    let mut fields = date.split('-');
    let (Some(y), Some(m), Some(d), None) =
        (fields.next(), fields.next(), fields.next(), fields.next())
    else {
        return None;
    };
    let y = y.parse::<i64>().ok()?;
    let m = m.parse::<u32>().ok()?;
    let d = d.parse::<u32>().ok()?;

    if !(1970..=MAX_YEAR).contains(&y) {
        return None;
    }
    let is_leap = y % 4 == 0 && (y % 100 != 0 || y % 400 == 0);
    let month_len = match m {
        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
        4 | 6 | 9 | 11 => 30,
        2 if is_leap => 29,
        2 => 28,
        _ => return None,
    };
    if d == 0 || d > month_len {
        return None;
    }

    let days = days_from_civil(y, m, d);
    // Validation above already implies `days >= 0`; the guard keeps the
    // unsigned conversion honest should that ever change.
    (days >= 0).then(|| u128::from(days.unsigned_abs()) * MS_PER_DAY)
}

/// Days from the Unix epoch (1970-01-01 is day 0) to the proleptic
/// Gregorian date `y-m-d`, using H. Hinnant's `days_from_civil`
/// algorithm. Dates before the epoch yield negative counts.
///
/// `m` is expected in `1..=12` and `d` in `1..=31`. All arithmetic is
/// signed, so a day outside that range offsets the result linearly
/// instead of underflowing. Used by `parse_timestamp_to_ms`.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    // Treat the year as starting on 1 March so the leap day is its last day.
    let y = if m <= 2 { y - 1 } else { y };
    let era = y.div_euclid(400); // 400-year Gregorian cycle index
    let yoe = y.rem_euclid(400); // year within the cycle, [0, 399]
    let mp = i64::from(if m > 2 { m - 3 } else { m + 9 }); // March = 0 ... February = 11
    let doy = (153 * mp + 2) / 5 + i64::from(d) - 1; // day within the shifted year
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // day within the cycle
    era * 146_097 + doe - 719_468
}
11248
11249fn walk_plan_node(
11250    node: &crate::storage::query::planner::CanonicalLogicalNode,
11251    depth: usize,
11252    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
11253) {
11254    use std::sync::Arc;
11255    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
11256    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
11257    rec.set_arc(
11258        Arc::from("source"),
11259        node.source.clone().map(Value::text).unwrap_or(Value::Null),
11260    );
11261    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
11262    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
11263    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
11264    out.push(rec);
11265    for child in &node.children {
11266        walk_plan_node(child, depth + 1, out);
11267    }
11268}