Skip to main content

reddb_server/runtime/
impl_core.rs

1use super::*;
2use crate::application::entity::metadata_to_json;
3use crate::auth::column_policy_gate::ColumnAccessRequest;
4use crate::auth::UserId;
5use crate::replication::cdc::ChangeRecord;
6use crate::replication::logical::{ApplyMode, LogicalChangeApplier};
7use crate::storage::query::ast::TableSource;
8
thread_local! {
    /// Current connection id for the executing statement. Set by the
    /// per-connection wrapper (stdio/gRPC handlers) before dispatching
    /// into `execute_query`; falls back to `0` for embedded callers.
    static CURRENT_CONN_ID: std::cell::Cell<u64> = const { std::cell::Cell::new(0) };

    /// Authenticated user + role for the executing statement (Phase 2.5.2
    /// RLS enforcement). Set by the transport middleware after validating
    /// credentials (password / cert / oauth); unset means "anonymous" /
    /// "embedded" — RLS policies degrade to the role-agnostic subset.
    ///
    /// `None` skips RLS injection entirely; `Some((username, role))`
    /// passes `role` to `matching_rls_policies(table, Some(role), action)`.
    static CURRENT_AUTH_IDENTITY: std::cell::RefCell<Option<(String, crate::auth::Role)>> =
        const { std::cell::RefCell::new(None) };

    /// MVCC snapshot scoped to the currently-executing statement (Phase
    /// 2.3.2d PG parity). `execute_query` captures it on entry and drops
    /// it on exit; every scan consults it via
    /// `entity_visible_under_current_snapshot` to hide tuples whose xmin
    /// hasn't committed or whose xmax already has.
    ///
    /// `None` means "pre-MVCC semantics" — the read path returns every
    /// tuple regardless of xmin/xmax. All embedded callers that bypass
    /// `execute_query` see this default.
    static CURRENT_SNAPSHOT: std::cell::RefCell<Option<SnapshotContext>> =
        const { std::cell::RefCell::new(None) };

    /// Cheap presence flag for `CURRENT_SNAPSHOT`. Scan hot paths
    /// poll this instead of `borrow()`-ing the RefCell on every
    /// row — the common case (autocommit / no MVCC session) reads
    /// one plain thread-local `Cell<bool>` (no atomics, no borrow
    /// bookkeeping) and short-circuits, saving ~10ns × N rows on
    /// aggregate_group / select_range scans.
    static HAS_SNAPSHOT: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };

    /// Session-scoped tenant id for the current connection (Phase 2.5.3
    /// multi-tenancy). Populated by `SET TENANT 'id'` or by transport
    /// middleware after resolving tenant from auth claims. Read by the
    /// `CURRENT_TENANT()` scalar function — RLS policies typically
    /// combine it as `USING (tenant_id = CURRENT_TENANT())` to scope
    /// every query to one tenant.
    ///
    /// `None` means "no tenant bound" — `CURRENT_TENANT()` returns
    /// NULL, and RLS policies that gate on it hide every row.
    static CURRENT_TENANT_ID: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local config resolver. SQL expressions materialize the
    /// `red_config` snapshot lazily on the first `$config.*`/`CONFIG()`
    /// access, keeping ordinary statements on the zero-scan path.
    static CURRENT_CONFIG_RESOLVER: std::cell::RefCell<Option<ConfigResolver>> =
        const { std::cell::RefCell::new(None) };

    /// Statement-local secret resolver. SQL expressions materialize the
    /// vault KV snapshot lazily on first `$secret.*` access, then serve
    /// the rest of the statement from that cached map without going
    /// back to the auth store.
    static CURRENT_SECRET_RESOLVER: std::cell::RefCell<Option<SecretResolver>> =
        const { std::cell::RefCell::new(None) };
}
68
69fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
70    match value {
71        Value::Text(s) => Ok(s.to_string()),
72        Value::Integer(n) => Ok(n.to_string()),
73        Value::UnsignedInteger(n) => Ok(n.to_string()),
74        Value::Float(n) => Ok(n.to_string()),
75        Value::Boolean(b) => Ok(b.to_string()),
76        Value::Null => Err(RedDBError::Query(
77            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
78                .to_string(),
79        )),
80        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
81            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
82                .to_string(),
83        )),
84        _ => Err(RedDBError::Query(format!(
85            "SET SECRET does not support value type {:?} yet",
86            value.data_type()
87        ))),
88    }
89}
90
/// Description of one control-plane event derived from a statement by
/// `query_control_event_specs`.
#[derive(Clone)]
struct QueryControlEventSpec {
    /// Event category (schema DDL, RLS / tenant governance, config
    /// write/delete, user update/disable, ...).
    kind: crate::runtime::control_events::EventKind,
    /// Static action label, e.g. `"create_table"` or `"config:write"`.
    action: &'static str,
    /// Identifier of the affected resource, e.g. `table:<name>` or
    /// `config:<key>`.
    resource: Option<String>,
    /// Extra named attributes attached to the event, each wrapped in a
    /// `Sensitivity` marker.
    fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
}
98
/// Audit summary of a data statement, produced by `query_audit_plan`.
#[derive(Clone)]
struct QueryAuditPlan {
    /// One of `"select"`, `"insert"`, `"update"` or `"delete"`.
    statement_kind: &'static str,
    /// De-duplicated names of the collections the statement touches;
    /// internal `red` / `red.*` / `__red_schema_*` names are excluded.
    collections: Vec<String>,
}
104
105fn query_audit_plan(expr: &QueryExpr) -> Option<QueryAuditPlan> {
106    let mut collections = Vec::new();
107    let statement_kind = match expr {
108        QueryExpr::Table(table) => {
109            push_query_audit_collection(&mut collections, &table.table);
110            "select"
111        }
112        QueryExpr::Join(join) => {
113            collect_query_audit_collections(&join.left, &mut collections);
114            collect_query_audit_collections(&join.right, &mut collections);
115            "select"
116        }
117        QueryExpr::Insert(insert) => {
118            push_query_audit_collection(&mut collections, &insert.table);
119            "insert"
120        }
121        QueryExpr::Update(update) => {
122            push_query_audit_collection(&mut collections, &update.table);
123            "update"
124        }
125        QueryExpr::Delete(delete) => {
126            push_query_audit_collection(&mut collections, &delete.table);
127            "delete"
128        }
129        _ => return None,
130    };
131    if collections.is_empty() {
132        None
133    } else {
134        Some(QueryAuditPlan {
135            statement_kind,
136            collections,
137        })
138    }
139}
140
141fn collect_query_audit_collections(expr: &QueryExpr, collections: &mut Vec<String>) {
142    match expr {
143        QueryExpr::Table(table) => push_query_audit_collection(collections, &table.table),
144        QueryExpr::Join(join) => {
145            collect_query_audit_collections(&join.left, collections);
146            collect_query_audit_collections(&join.right, collections);
147        }
148        _ => {}
149    }
150}
151
/// Append `name` to `collections` unless it is an internal collection
/// (`red`, `red.*`, `__red_schema_*`) or is already present.
fn push_query_audit_collection(collections: &mut Vec<String>, name: &str) {
    let is_internal = name == "red"
        || ["red.", "__red_schema_"]
            .iter()
            .any(|prefix| name.starts_with(prefix));
    let already_listed = collections.iter().any(|known| known == name);
    if is_internal || already_listed {
        return;
    }
    collections.push(name.to_owned());
}
160
161fn query_control_event_specs(expr: &QueryExpr) -> Vec<QueryControlEventSpec> {
162    use crate::runtime::control_events::{EventKind, Sensitivity};
163
164    let mut specs = Vec::new();
165    let mut schema = |action: &'static str, resource: Option<String>| {
166        specs.push(QueryControlEventSpec {
167            kind: EventKind::SchemaDdl,
168            action,
169            resource,
170            fields: Vec::new(),
171        });
172    };
173    match expr {
174        QueryExpr::CreateTable(q) => {
175            schema("create_table", Some(format!("table:{}", q.name)));
176            if let Some(column) = &q.tenant_by {
177                specs.push(QueryControlEventSpec {
178                    kind: EventKind::TenantGovernance,
179                    action: "create_table_tenant_by",
180                    resource: Some(format!("table:{}", q.name)),
181                    fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
182                });
183            }
184        }
185        QueryExpr::CreateCollection(q) => {
186            schema("create_collection", Some(format!("collection:{}", q.name)));
187        }
188        QueryExpr::CreateVector(q) => schema("create_vector", Some(format!("vector:{}", q.name))),
189        QueryExpr::DropTable(q) => schema("drop_table", Some(format!("table:{}", q.name))),
190        QueryExpr::DropGraph(q) => schema("drop_graph", Some(format!("graph:{}", q.name))),
191        QueryExpr::DropVector(q) => schema("drop_vector", Some(format!("vector:{}", q.name))),
192        QueryExpr::DropDocument(q) => {
193            schema("drop_document", Some(format!("document:{}", q.name)));
194        }
195        QueryExpr::DropKv(q) => schema("drop_kv", Some(format!("kv:{}", q.name))),
196        QueryExpr::DropCollection(q) => {
197            schema("drop_collection", Some(format!("collection:{}", q.name)));
198        }
199        QueryExpr::Truncate(q) => schema("truncate", Some(format!("collection:{}", q.name))),
200        QueryExpr::AlterTable(q) => {
201            schema("alter_table", Some(format!("table:{}", q.name)));
202            for op in &q.operations {
203                match op {
204                    crate::storage::query::ast::AlterOperation::EnableRowLevelSecurity => {
205                        specs.push(QueryControlEventSpec {
206                            kind: EventKind::RlsGovernance,
207                            action: "enable_rls",
208                            resource: Some(format!("table:{}", q.name)),
209                            fields: Vec::new(),
210                        });
211                    }
212                    crate::storage::query::ast::AlterOperation::DisableRowLevelSecurity => {
213                        specs.push(QueryControlEventSpec {
214                            kind: EventKind::RlsGovernance,
215                            action: "disable_rls",
216                            resource: Some(format!("table:{}", q.name)),
217                            fields: Vec::new(),
218                        });
219                    }
220                    crate::storage::query::ast::AlterOperation::EnableTenancy { column } => {
221                        specs.push(QueryControlEventSpec {
222                            kind: EventKind::TenantGovernance,
223                            action: "enable_tenancy",
224                            resource: Some(format!("table:{}", q.name)),
225                            fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
226                        });
227                    }
228                    crate::storage::query::ast::AlterOperation::DisableTenancy => {
229                        specs.push(QueryControlEventSpec {
230                            kind: EventKind::TenantGovernance,
231                            action: "disable_tenancy",
232                            resource: Some(format!("table:{}", q.name)),
233                            fields: Vec::new(),
234                        });
235                    }
236                    _ => {}
237                }
238            }
239        }
240        QueryExpr::CreateIndex(q) => {
241            schema(
242                "create_index",
243                Some(format!("index:{}:{}", q.table, q.name)),
244            );
245        }
246        QueryExpr::DropIndex(q) => {
247            schema("drop_index", Some(format!("index:{}:{}", q.table, q.name)));
248        }
249        QueryExpr::CreateTimeSeries(q) => {
250            schema("create_timeseries", Some(format!("timeseries:{}", q.name)));
251        }
252        QueryExpr::DropTimeSeries(q) => {
253            schema("drop_timeseries", Some(format!("timeseries:{}", q.name)));
254        }
255        QueryExpr::CreateQueue(q) => schema("create_queue", Some(format!("queue:{}", q.name))),
256        QueryExpr::AlterQueue(q) => schema("alter_queue", Some(format!("queue:{}", q.name))),
257        QueryExpr::DropQueue(q) => schema("drop_queue", Some(format!("queue:{}", q.name))),
258        QueryExpr::CreateTree(q) => {
259            schema(
260                "create_tree",
261                Some(format!("tree:{}:{}", q.collection, q.name)),
262            );
263        }
264        QueryExpr::DropTree(q) => {
265            schema(
266                "drop_tree",
267                Some(format!("tree:{}:{}", q.collection, q.name)),
268            );
269        }
270        QueryExpr::CreateSchema(q) => schema("create_schema", Some(format!("schema:{}", q.name))),
271        QueryExpr::DropSchema(q) => schema("drop_schema", Some(format!("schema:{}", q.name))),
272        QueryExpr::CreateSequence(q) => {
273            schema("create_sequence", Some(format!("sequence:{}", q.name)));
274        }
275        QueryExpr::DropSequence(q) => schema("drop_sequence", Some(format!("sequence:{}", q.name))),
276        QueryExpr::CreateView(q) => schema("create_view", Some(format!("view:{}", q.name))),
277        QueryExpr::DropView(q) => schema("drop_view", Some(format!("view:{}", q.name))),
278        QueryExpr::RefreshMaterializedView(q) => {
279            schema(
280                "refresh_materialized_view",
281                Some(format!("view:{}", q.name)),
282            );
283        }
284        QueryExpr::CreatePolicy(q) => {
285            specs.push(QueryControlEventSpec {
286                kind: EventKind::RlsGovernance,
287                action: "create_policy",
288                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
289                fields: vec![(
290                    "target_kind".to_string(),
291                    Sensitivity::raw(q.target_kind.as_ident()),
292                )],
293            });
294        }
295        QueryExpr::DropPolicy(q) => {
296            specs.push(QueryControlEventSpec {
297                kind: EventKind::RlsGovernance,
298                action: "drop_policy",
299                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
300                fields: Vec::new(),
301            });
302        }
303        QueryExpr::SetTenant(value) => {
304            let mut fields = Vec::new();
305            if let Some(value) = value {
306                fields.push(("tenant".to_string(), Sensitivity::raw(value)));
307            }
308            specs.push(QueryControlEventSpec {
309                kind: EventKind::TenantGovernance,
310                action: "set_tenant",
311                resource: Some("tenant:session".to_string()),
312                fields,
313            });
314        }
315        QueryExpr::SetConfig { key, .. } => {
316            specs.push(QueryControlEventSpec {
317                kind: EventKind::ConfigWrite,
318                action: "config:write",
319                resource: Some(format!("config:{key}")),
320                fields: vec![("key".to_string(), Sensitivity::raw(key))],
321            });
322        }
323        QueryExpr::ConfigCommand(cmd) => match cmd {
324            crate::storage::query::ast::ConfigCommand::Put {
325                collection, key, ..
326            }
327            | crate::storage::query::ast::ConfigCommand::Rotate {
328                collection, key, ..
329            } => {
330                let target = format!("{collection}/{key}");
331                specs.push(QueryControlEventSpec {
332                    kind: EventKind::ConfigWrite,
333                    action: "config:write",
334                    resource: Some(format!("config:{target}")),
335                    fields: vec![
336                        ("collection".to_string(), Sensitivity::raw(collection)),
337                        ("key".to_string(), Sensitivity::raw(key)),
338                    ],
339                });
340            }
341            crate::storage::query::ast::ConfigCommand::Delete { collection, key } => {
342                let target = format!("{collection}/{key}");
343                specs.push(QueryControlEventSpec {
344                    kind: EventKind::ConfigDelete,
345                    action: "config:write",
346                    resource: Some(format!("config:{target}")),
347                    fields: vec![
348                        ("collection".to_string(), Sensitivity::raw(collection)),
349                        ("key".to_string(), Sensitivity::raw(key)),
350                    ],
351                });
352            }
353            _ => {}
354        },
355        QueryExpr::AlterUser(stmt) => {
356            let disables = stmt.attributes.iter().any(|attr| {
357                matches!(
358                    attr,
359                    crate::storage::query::ast::AlterUserAttribute::Disable
360                )
361            });
362            specs.push(QueryControlEventSpec {
363                kind: if disables {
364                    EventKind::UserDisable
365                } else {
366                    EventKind::UserUpdate
367                },
368                action: "alter_user",
369                resource: Some(format!("user:{}", stmt.username)),
370                fields: Vec::new(),
371            });
372        }
373        _ => {}
374    }
375    specs
376}
377
378fn control_event_outcome_for_error(err: &RedDBError) -> crate::runtime::control_events::Outcome {
379    match err {
380        RedDBError::ReadOnly(_) => crate::runtime::control_events::Outcome::Denied,
381        RedDBError::Query(msg)
382            if msg.contains("permission denied")
383                || msg.contains("cannot issue")
384                || msg.contains("lacks") =>
385        {
386            crate::runtime::control_events::Outcome::Denied
387        }
388        _ => crate::runtime::control_events::Outcome::Error,
389    }
390}
391
392/// Convert the rows produced by a materialized-view body into
393/// `UnifiedEntity` table rows targeting the backing collection.
394/// Issue #595 slice 9c — feeds `UnifiedStore::refresh_collection`.
395///
396/// Graph fragments and vector hits are ignored: a materialized view
397/// is a relational result set (SELECT-shaped); slices 11+ may extend
398/// this once we have a richer view body shape. Each row materialises
399/// the union of its schema-bound columns + overflow.
400fn view_records_to_entities(
401    table: &str,
402    records: &[crate::storage::query::unified::UnifiedRecord],
403) -> Vec<crate::storage::UnifiedEntity> {
404    use std::collections::HashMap;
405    let table_arc: std::sync::Arc<str> = std::sync::Arc::from(table);
406    let mut out = Vec::with_capacity(records.len());
407    for record in records {
408        let mut named: HashMap<String, crate::storage::schema::Value> = HashMap::new();
409        for (name, value) in record.iter_fields() {
410            named.insert(name.to_string(), value.clone());
411        }
412        let entity = crate::storage::UnifiedEntity::new(
413            crate::storage::EntityId::new(0),
414            crate::storage::EntityKind::TableRow {
415                table: std::sync::Arc::clone(&table_arc),
416                row_id: 0,
417            },
418            crate::storage::EntityData::Row(crate::storage::RowData {
419                columns: Vec::new(),
420                named: Some(named),
421                schema: None,
422            }),
423        );
424        out.push(entity);
425    }
426    out
427}
428
429fn system_keyed_collection_contract(
430    name: &str,
431    model: crate::catalog::CollectionModel,
432) -> crate::physical::CollectionContract {
433    let now = crate::utils::now_unix_millis() as u128;
434    crate::physical::CollectionContract {
435        name: name.to_string(),
436        declared_model: model,
437        schema_mode: crate::catalog::SchemaMode::Dynamic,
438        origin: crate::physical::ContractOrigin::Implicit,
439        version: 1,
440        created_at_unix_ms: now,
441        updated_at_unix_ms: now,
442        default_ttl_ms: None,
443        vector_dimension: None,
444        vector_metric: None,
445        context_index_fields: Vec::new(),
446        declared_columns: Vec::new(),
447        table_def: None,
448        timestamps_enabled: false,
449        context_index_enabled: false,
450        metrics_raw_retention_ms: None,
451        metrics_rollup_policies: Vec::new(),
452        metrics_tenant_identity: None,
453        metrics_namespace: None,
454        append_only: false,
455        subscriptions: Vec::new(),
456        session_key: None,
457        session_gap_ms: None,
458        retention_duration_ms: None,
459    }
460}
461
/// Snapshot + manager pair used for read-path visibility checks.
///
/// The manager is needed in addition to the snapshot because `aborted`
/// state mutates after the snapshot is captured — a ROLLBACK by a
/// committed-at-capture-time writer must still hide its tuples. Keeping
/// the Arc around is O(pointer) and the RwLock reads on `is_aborted`
/// are cheap (HashSet lookup under a parking_lot read guard).
///
/// `own_xids` (Phase 2.3.2e) lists the xids belonging to the current
/// connection's transaction — the parent xid plus open and released
/// savepoint sub-xids. The visibility rule promotes rows stamped with
/// these xids to "always visible (unless aborted)" so the writer sees
/// its own nested-savepoint writes even though their xids exceed
/// `snapshot.xid`.
#[derive(Clone)]
pub struct SnapshotContext {
    /// The snapshot captured for the executing statement.
    pub snapshot: crate::storage::transaction::snapshot::Snapshot,
    /// Shared snapshot manager, consulted for abort state that can
    /// change after `snapshot` was captured.
    pub manager: Arc<crate::storage::transaction::snapshot::SnapshotManager>,
    /// Xids owned by the current connection's transaction (parent plus
    /// savepoint sub-xids).
    pub own_xids: std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
    /// NOTE(review): semantics are not visible in this part of the file —
    /// presumably tells index-backed reads to fall back to another path
    /// under this snapshot; confirm against the readers of this flag.
    pub requires_index_fallback: bool,
}
483
484/// Install a connection id on the current thread for the duration of a
485/// statement. Transaction state (`RuntimeInner::tx_contexts`) is keyed
486/// by this id so different connections can hold independent BEGINs.
487///
488/// Pub so transports (PG wire, gRPC, HTTP per-request spawners) and
489/// tests can emulate per-connection isolation. Call it once when
490/// binding the connection's worker thread; pair with
491/// `clear_current_connection_id` on teardown.
492pub fn set_current_connection_id(id: u64) {
493    CURRENT_CONN_ID.with(|c| c.set(id));
494}
495
496/// Reset the thread's connection id back to `0` (autocommit).
497pub fn clear_current_connection_id() {
498    CURRENT_CONN_ID.with(|c| c.set(0));
499}
500
501/// Read the connection id set by `set_current_connection_id`. Returns
502/// `0` when no wrapper installed one — auto-commit path.
503pub fn current_connection_id() -> u64 {
504    CURRENT_CONN_ID.with(|c| c.get())
505}
506
507/// Install the authenticated identity for the current thread (Phase 2.5.2
508/// RLS enforcement). Transport layers call this right after resolving
509/// auth so the query dispatch can fold RLS policies into the filter.
510pub fn set_current_auth_identity(username: String, role: crate::auth::Role) {
511    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = Some((username, role)));
512}
513
514/// Clear the thread-local auth identity. Transports call this after the
515/// statement completes so pooled threads don't leak identities across
516/// requests.
517pub fn clear_current_auth_identity() {
518    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = None);
519}
520
521/// Read the current-thread auth identity. `None` when no transport
522/// installed one (embedded mode / anonymous access).
523pub(crate) fn current_auth_identity() -> Option<(String, crate::auth::Role)> {
524    CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone())
525}
526
527/// Public probe of the thread-local auth identity for callers outside
528/// the `runtime` module (e.g. the AI credential resolver, which audits
529/// who triggered a secret read on behalf of a query).
530pub fn current_auth_identity_for_audit() -> Option<(String, crate::auth::Role)> {
531    current_auth_identity()
532}
533
534/// Install the session tenant id for the current thread (Phase 2.5.3
535/// multi-tenancy). Called by `SET TENANT 'id'` dispatch and by
536/// transport middleware that resolves tenant from auth claims (e.g.
537/// JWT `tenant` claim, HTTP header, subdomain).
538pub fn set_current_tenant(tenant_id: String) {
539    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = Some(tenant_id));
540}
541
542/// Clear the current-thread tenant — `CURRENT_TENANT()` will then
543/// return NULL and any RLS policy gated on it will hide every row.
544pub fn clear_current_tenant() {
545    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = None);
546}
547
548/// Read the current-thread tenant id, applying overrides in priority order:
549///   1. `WITHIN TENANT '<id>' …` per-statement override (highest)
550///   2. `SET LOCAL TENANT '<id>'` transaction-local override (consulted
551///      only when the current connection has an open transaction)
552///   3. `SET TENANT '<id>'` session-level thread-local
553///   4. `None` (deny-default for RLS).
554///
555/// The transaction-local layer is read through the runtime; an embedded
556/// helper crate that has no `RedDBRuntime` access still gets correct
557/// behaviour for layers 1, 3, and 4.
558pub fn current_tenant() -> Option<String> {
559    let inherited = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
560    if let Some(over) = current_scope_override() {
561        if over.tenant.is_active() {
562            return over.tenant.resolve(inherited);
563        }
564    }
565    if let Some(tx_local) = current_tx_local_tenant() {
566        return tx_local;
567    }
568    inherited
569}
570
thread_local! {
    /// Snapshot of the active connection's `tx_local_tenants` entry for
    /// the current `execute_query` call. Outer `Some(_)` means "a
    /// transaction-local tenant override is active for this call";
    /// inner is the override's value (`Some(s)` overrides to `s`,
    /// `None` overrides to NULL/cleared). Refreshed at the top of every
    /// `execute_query` invocation and cleared by the RAII guard on
    /// return so pooled connections cannot leak the override past the
    /// statement that owns it.
    ///
    /// Installed through `TxLocalTenantGuard::install` and read through
    /// `current_tx_local_tenant`.
    static TX_LOCAL_TENANT: std::cell::RefCell<Option<Option<String>>> =
        const { std::cell::RefCell::new(None) };
}
583
584fn current_tx_local_tenant() -> Option<Option<String>> {
585    TX_LOCAL_TENANT.with(|cell| cell.borrow().clone())
586}
587
588/// Recognise `SET LOCAL TENANT '<id>'` / `SET LOCAL TENANT NULL` —
589/// returns `Ok(Some(Some(id)))` for an explicit value, `Ok(Some(None))`
590/// for an explicit NULL clear, `Ok(None)` when the input is not a
591/// `SET LOCAL TENANT` statement at all, and `Err` when the prefix
592/// matches but the value is malformed.
593fn parse_set_local_tenant(query: &str) -> RedDBResult<Option<Option<String>>> {
594    let mut tokens = query.split_ascii_whitespace();
595    let Some(w1) = tokens.next() else {
596        return Ok(None);
597    };
598    if !w1.eq_ignore_ascii_case("SET") {
599        return Ok(None);
600    }
601    let Some(w2) = tokens.next() else {
602        return Ok(None);
603    };
604    if !w2.eq_ignore_ascii_case("LOCAL") {
605        return Ok(None);
606    }
607    let Some(w3) = tokens.next() else {
608        return Ok(None);
609    };
610    if !w3.eq_ignore_ascii_case("TENANT") {
611        return Ok(None);
612    }
613    let rest: String = tokens.collect::<Vec<_>>().join(" ");
614    let rest = rest.trim().trim_end_matches(';').trim();
615    let value_str = rest.strip_prefix('=').map(|s| s.trim()).unwrap_or(rest);
616    if value_str.is_empty() {
617        return Err(RedDBError::Query(
618            "SET LOCAL TENANT expects a string literal or NULL".to_string(),
619        ));
620    }
621    if value_str.eq_ignore_ascii_case("NULL") {
622        return Ok(Some(None));
623    }
624    if value_str.starts_with('\'') && value_str.ends_with('\'') && value_str.len() >= 2 {
625        let inner = &value_str[1..value_str.len() - 1];
626        return Ok(Some(Some(inner.to_string())));
627    }
628    Err(RedDBError::Query(format!(
629        "SET LOCAL TENANT expects a string literal or NULL, got `{value_str}`"
630    )))
631}
632
633pub(crate) struct TxLocalTenantGuard;
634
635impl TxLocalTenantGuard {
636    pub fn install(value: Option<Option<String>>) -> Self {
637        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = value);
638        Self
639    }
640}
641
642impl Drop for TxLocalTenantGuard {
643    fn drop(&mut self) {
644        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = None);
645    }
646}
647
thread_local! {
    /// Stack of `WITHIN ... <stmt>` overrides active on the current
    /// thread. Every entry corresponds to one in-flight `execute_query`
    /// call that started with a `WITHIN` prefix; the entry is pushed
    /// before dispatch and popped before the call returns. The stack
    /// shape supports nested invocations (e.g. a view body that itself
    /// re-enters execute_query).
    ///
    /// Readers only ever look at the top (most recently pushed) entry —
    /// see `current_scope_override`.
    static SCOPE_OVERRIDES: std::cell::RefCell<Vec<crate::runtime::within_clause::ScopeOverride>> =
        const { std::cell::RefCell::new(Vec::new()) };
}
658
659pub(crate) fn push_scope_override(over: crate::runtime::within_clause::ScopeOverride) {
660    SCOPE_OVERRIDES.with(|cell| cell.borrow_mut().push(over));
661}
662
663pub(crate) fn pop_scope_override() {
664    SCOPE_OVERRIDES.with(|cell| {
665        cell.borrow_mut().pop();
666    });
667}
668
669pub(crate) fn current_scope_override() -> Option<crate::runtime::within_clause::ScopeOverride> {
670    SCOPE_OVERRIDES.with(|cell| cell.borrow().last().cloned())
671}
672
673/// Cheap probe: is any `WITHIN …` scope override active on this
674/// thread? The fast-path needs to know without paying for the full
675/// `.last().cloned()` allocation — just peek at stack length.
676pub(crate) fn has_scope_override_active() -> bool {
677    SCOPE_OVERRIDES.with(|cell| !cell.borrow().is_empty())
678}
679
680/// RAII guard pairing `push_scope_override` with the matching pop, so
681/// the stack stays balanced even when the inner `execute_query` returns
682/// early via `?`.
683pub(crate) struct ScopeOverrideGuard;
684
685impl ScopeOverrideGuard {
686    pub fn install(over: crate::runtime::within_clause::ScopeOverride) -> Self {
687        push_scope_override(over);
688        Self
689    }
690}
691
692impl Drop for ScopeOverrideGuard {
693    fn drop(&mut self) {
694        pop_scope_override();
695    }
696}
697
698/// Read the current-thread auth identity, honouring per-statement
699/// `WITHIN ... USER '<u>' AS ROLE '<r>'` overrides. The override only
700/// supplies projected strings — it never grants additional privilege —
701/// so callers that need to make authorisation decisions must read from
702/// the underlying `current_auth_identity()` directly.
703pub(crate) fn current_user_projected() -> Option<String> {
704    let inherited = current_auth_identity().map(|(u, _)| u);
705    if let Some(over) = current_scope_override() {
706        if over.user.is_active() {
707            return over.user.resolve(inherited);
708        }
709    }
710    inherited
711}
712
713pub(crate) fn current_role_projected() -> Option<String> {
714    let inherited = current_auth_identity().map(|(_, r)| format!("{r:?}").to_lowercase());
715    if let Some(over) = current_scope_override() {
716        if over.role.is_active() {
717            return over.role.resolve(inherited);
718        }
719    }
720    inherited
721}
722
723pub(crate) fn current_secret_value(path: &str) -> Option<String> {
724    let key = path.to_ascii_lowercase();
725    CURRENT_SECRET_RESOLVER.with(|cell| {
726        let mut resolver = cell.borrow_mut();
727        let resolver = resolver.as_mut()?;
728        if resolver.values.is_none() {
729            resolver.values = resolver
730                .store
731                .as_ref()
732                .map(|store| store.vault_kv_snapshot());
733        }
734        let values = resolver.values.as_ref()?;
735        values.get(&key).cloned().or_else(|| {
736            key.strip_prefix("red.vault/").and_then(|rest| {
737                values
738                    .get(rest)
739                    .cloned()
740                    .or_else(|| values.get(&format!("red.secret.{rest}")).cloned())
741            })
742        })
743    })
744}
745
746struct SecretResolver {
747    store: Option<Arc<crate::auth::store::AuthStore>>,
748    values: Option<HashMap<String, String>>,
749}
750
751pub(super) struct SecretStoreGuard {
752    previous: Option<SecretResolver>,
753}
754
755impl SecretStoreGuard {
756    pub(super) fn install(store: Option<Arc<crate::auth::store::AuthStore>>) -> Self {
757        let previous = CURRENT_SECRET_RESOLVER.with(|cell| {
758            cell.replace(Some(SecretResolver {
759                store,
760                values: None,
761            }))
762        });
763        Self { previous }
764    }
765}
766
767impl Drop for SecretStoreGuard {
768    fn drop(&mut self) {
769        let previous = self.previous.take();
770        CURRENT_SECRET_RESOLVER.with(|cell| {
771            cell.replace(previous);
772        });
773    }
774}
775
776pub(crate) fn current_config_value(path: &str) -> Option<Value> {
777    let key = path.to_ascii_lowercase();
778    CURRENT_CONFIG_RESOLVER.with(|cell| {
779        let mut resolver = cell.borrow_mut();
780        let resolver = resolver.as_mut()?;
781        if resolver.values.is_none() {
782            resolver.values = Some(latest_config_snapshot(&resolver.db));
783        }
784        let values = resolver.values.as_ref()?;
785        values.get(&key).cloned().or_else(|| {
786            key.strip_prefix("red.config/")
787                .and_then(|rest| values.get(&format!("red.config.{rest}")).cloned())
788        })
789    })
790}
791
792fn update_current_config_value(path: &str, value: Value) {
793    let key = path.to_ascii_lowercase();
794    CURRENT_CONFIG_RESOLVER.with(|cell| {
795        if let Some(resolver) = cell.borrow_mut().as_mut() {
796            if let Some(values) = resolver.values.as_mut() {
797                values.insert(key, value);
798            }
799        }
800    });
801}
802
803fn update_current_secret_value(path: &str, value: Option<String>) {
804    let key = path.to_ascii_lowercase();
805    CURRENT_SECRET_RESOLVER.with(|cell| {
806        if let Some(resolver) = cell.borrow_mut().as_mut() {
807            let Some(values) = resolver.values.as_mut() else {
808                return;
809            };
810            match value {
811                Some(value) => {
812                    values.insert(key, value);
813                }
814                None => {
815                    values.remove(&key);
816                }
817            }
818        }
819    });
820}
821
822fn latest_config_snapshot(db: &RedDB) -> HashMap<String, Value> {
823    let mut latest: HashMap<String, (u64, Value)> = HashMap::new();
824
825    if let Some(manager) = db.store().get_collection("red_config") {
826        manager.for_each_entity(|entity| {
827            let Some(row) = entity.data.as_row() else {
828                return true;
829            };
830            let Some(Value::Text(key)) = row.get_field("key") else {
831                return true;
832            };
833            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
834            let id = entity.id.raw();
835            let key = key.to_ascii_lowercase();
836            insert_latest_config_value(&mut latest, key.clone(), id, value.clone());
837            if let Some(rest) = key.strip_prefix("red.config.") {
838                insert_latest_config_value(&mut latest, format!("red.config/{rest}"), id, value);
839            }
840            true
841        });
842    }
843
844    if let Some(manager) = db.store().get_collection("red.config") {
845        manager.for_each_entity(|entity| {
846            let Some(row) = entity.data.as_row() else {
847                return true;
848            };
849            if matches!(row.get_field("tombstone"), Some(Value::Boolean(true))) {
850                return true;
851            }
852            let Some(Value::Text(key)) = row.get_field("key") else {
853                return true;
854            };
855            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
856            insert_latest_config_value(
857                &mut latest,
858                format!("red.config/{}", key.to_ascii_lowercase()),
859                entity.id.raw(),
860                value,
861            );
862            true
863        });
864    }
865
866    latest
867        .into_iter()
868        .map(|(key, (_, value))| (key, value))
869        .collect()
870}
871
872fn insert_latest_config_value(
873    latest: &mut HashMap<String, (u64, Value)>,
874    key: String,
875    id: u64,
876    value: Value,
877) {
878    match latest.get(&key) {
879        Some((prev_id, _)) if *prev_id > id => {}
880        _ => {
881            latest.insert(key, (id, value));
882        }
883    }
884}
885
886struct ConfigResolver {
887    db: Arc<RedDB>,
888    values: Option<HashMap<String, Value>>,
889}
890
891pub(super) struct ConfigSnapshotGuard {
892    previous: Option<ConfigResolver>,
893}
894
895impl ConfigSnapshotGuard {
896    pub(super) fn install(db: Arc<RedDB>) -> Self {
897        let previous = CURRENT_CONFIG_RESOLVER
898            .with(|cell| cell.replace(Some(ConfigResolver { db, values: None })));
899        Self { previous }
900    }
901}
902
903impl Drop for ConfigSnapshotGuard {
904    fn drop(&mut self) {
905        let previous = self.previous.take();
906        CURRENT_CONFIG_RESOLVER.with(|cell| {
907            cell.replace(previous);
908        });
909    }
910}
911
912/// Install the MVCC snapshot used by the current thread for the duration
913/// of one statement. Paired with `clear_current_snapshot()` — callers
914/// should prefer the `CurrentSnapshotGuard` RAII wrapper so early returns
915/// still clean up.
916pub fn set_current_snapshot(ctx: SnapshotContext) {
917    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = Some(ctx));
918    HAS_SNAPSHOT.with(|c| c.set(true));
919}
920
921pub fn clear_current_snapshot() {
922    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = None);
923    HAS_SNAPSHOT.with(|c| c.set(false));
924}
925
926/// Drop-guard that restores the previous snapshot on scope exit. Safe to
927/// nest — each statement saves the caller's snapshot and puts it back
928/// instead of blindly clearing, so a top-level `execute_query` called
929/// from inside another statement dispatch (e.g. vector source subqueries)
930/// doesn't strip visibility from the outer scan.
931pub(crate) struct CurrentSnapshotGuard {
932    previous: Option<SnapshotContext>,
933}
934
935impl CurrentSnapshotGuard {
936    pub(crate) fn install(ctx: SnapshotContext) -> Self {
937        let previous = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
938        set_current_snapshot(ctx);
939        Self { previous }
940    }
941}
942
943impl Drop for CurrentSnapshotGuard {
944    fn drop(&mut self) {
945        let prev = self.previous.take();
946        let has = prev.is_some();
947        CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = prev);
948        HAS_SNAPSHOT.with(|c| c.set(has));
949    }
950}
951
952/// Is this entity visible under the current thread's MVCC snapshot?
953///
954/// Returns `true` (no filtering) when no snapshot is installed — that
955/// path is used by embedded callers and by operations that intentionally
956/// bypass MVCC (VACUUM, snapshot export, admin introspection).
957///
958/// When a snapshot is installed the result is
959///   `snapshot.sees(xmin, xmax) && !mgr.is_aborted(xmin) && !xmax_half_abort`
960/// where `xmax_half_abort` re-grants visibility for tuples whose
961/// deleting transaction rolled back.
962#[inline]
963pub fn entity_visible_under_current_snapshot(
964    entity: &crate::storage::unified::entity::UnifiedEntity,
965) -> bool {
966    // Fast path — one `Cell<bool>` read, no RefCell borrow. Autocommit
967    // reads (no active MVCC transaction) still hide superseded physical
968    // versions while avoiding a full snapshot-context lookup.
969    // This runs on every row of every scan; the slow path only fires
970    // inside an explicit transaction.
971    if !HAS_SNAPSHOT.with(|c| c.get()) {
972        return entity.xmax == 0;
973    }
974    CURRENT_SNAPSHOT.with(|cell| {
975        let guard = cell.borrow();
976        let Some(ctx) = guard.as_ref() else {
977            return true;
978        };
979        visibility_check(ctx, entity.xmin, entity.xmax)
980    })
981}
982
983/// Direct visibility check from raw `(xmin, xmax)` — bypasses the
984/// entity borrow for callers that already decomposed the tuple (e.g.
985/// pre-materialized scan caches). Same semantics as
986/// `entity_visible_under_current_snapshot`.
987#[inline]
988pub(crate) fn xids_visible_under_current_snapshot(xmin: u64, xmax: u64) -> bool {
989    if !HAS_SNAPSHOT.with(|c| c.get()) {
990        return true;
991    }
992    CURRENT_SNAPSHOT.with(|cell| {
993        let guard = cell.borrow();
994        let Some(ctx) = guard.as_ref() else {
995            return true;
996        };
997        visibility_check(ctx, xmin, xmax)
998    })
999}
1000
1001/// Clone the current thread's snapshot context. Parallel scan paths
1002/// (`query_all_zoned` with `std::thread::scope`) call this on the main
1003/// thread *before* spawning workers so the captured `SnapshotContext`
1004/// can be moved into every worker closure. Worker threads do not
1005/// inherit thread-locals, so calling `entity_visible_under_current_snapshot`
1006/// from inside a spawned closure would silently skip the filter.
1007pub fn capture_current_snapshot() -> Option<SnapshotContext> {
1008    CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone())
1009}
1010
1011/// Whether the active read snapshot may need historical tuple versions
1012/// that the current secondary indexes cannot prove. Index paths can still
1013/// recheck visible candidates, but only a heap scan can discover versions
1014/// whose indexed value was changed or deleted after this snapshot.
1015pub(crate) fn current_snapshot_requires_index_fallback() -> bool {
1016    if !HAS_SNAPSHOT.with(|c| c.get()) {
1017        return false;
1018    }
1019    CURRENT_SNAPSHOT.with(|cell| {
1020        cell.borrow()
1021            .as_ref()
1022            .is_some_and(|ctx| ctx.requires_index_fallback)
1023    })
1024}
1025
1026/// Frozen MVCC + identity context for callers that need to reinstall
1027/// the same view across thread-local boundaries — long-lived cursors,
1028/// background batchers, anything that detaches from the dispatch path
1029/// and re-enters later.
1030///
1031/// The bundle bakes in the three thread-locals every read path
1032/// consults: `SnapshotContext` (MVCC visibility), the auth identity
1033/// (RLS policy gate), and the tenant id (RLS scope). A FETCH that
1034/// reinstalls the bundle sees exactly the same rows as the DECLARE
1035/// would have, regardless of writes that landed in between.
1036///
1037/// Cheap to clone — `SnapshotContext` is a clone of three
1038/// `Arc`-backed fields, identity is a `(String, Role)`, tenant is a
1039/// `String`. None of these contend with the read path.
1040#[derive(Clone, Default)]
1041pub struct SnapshotBundle {
1042    pub snapshot: Option<SnapshotContext>,
1043    pub auth: Option<(String, crate::auth::Role)>,
1044    pub tenant: Option<String>,
1045}
1046
1047/// Capture the three read-path thread-locals into a `SnapshotBundle`.
1048/// Pairs with `with_snapshot_bundle` for re-entry.
1049pub fn snapshot_bundle() -> SnapshotBundle {
1050    SnapshotBundle {
1051        snapshot: capture_current_snapshot(),
1052        auth: current_auth_identity(),
1053        tenant: CURRENT_TENANT_ID.with(|cell| cell.borrow().clone()),
1054    }
1055}
1056
1057/// Reinstall a captured `SnapshotBundle` for the duration of `f`.
1058/// Restores the caller's previous thread-locals on exit (panic-safe via
1059/// the explicit guard struct so a panic in `f` cannot leak the
1060/// installed identity into the worker's next request).
1061pub fn with_snapshot_bundle<R>(bundle: &SnapshotBundle, f: impl FnOnce() -> R) -> R {
1062    struct Guard {
1063        prev_snapshot: Option<SnapshotContext>,
1064        prev_auth: Option<(String, crate::auth::Role)>,
1065        prev_tenant: Option<String>,
1066    }
1067    impl Drop for Guard {
1068        fn drop(&mut self) {
1069            let snap = self.prev_snapshot.take();
1070            let has = snap.is_some();
1071            CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = snap);
1072            HAS_SNAPSHOT.with(|c| c.set(has));
1073            CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = self.prev_auth.take());
1074            CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = self.prev_tenant.take());
1075        }
1076    }
1077
1078    let _guard = {
1079        let prev_snapshot = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
1080        let prev_auth = CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone());
1081        let prev_tenant = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
1082
1083        match bundle.snapshot.clone() {
1084            Some(ctx) => set_current_snapshot(ctx),
1085            None => clear_current_snapshot(),
1086        }
1087        CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = bundle.auth.clone());
1088        CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = bundle.tenant.clone());
1089
1090        Guard {
1091            prev_snapshot,
1092            prev_auth,
1093            prev_tenant,
1094        }
1095    };
1096    f()
1097}
1098
1099/// Apply the same visibility rules used by the thread-local helpers
1100/// against a caller-provided context. Intended for parallel workers
1101/// that captured the snapshot with `capture_current_snapshot()`.
1102#[inline]
1103pub fn entity_visible_with_context(
1104    ctx: Option<&SnapshotContext>,
1105    entity: &crate::storage::unified::entity::UnifiedEntity,
1106) -> bool {
1107    match ctx {
1108        Some(ctx) => visibility_check(ctx, entity.xmin, entity.xmax),
1109        None => true,
1110    }
1111}
1112
1113fn table_row_index_fields(
1114    entity: &crate::storage::unified::entity::UnifiedEntity,
1115) -> Vec<(String, crate::storage::schema::Value)> {
1116    let crate::storage::EntityData::Row(row) = &entity.data else {
1117        return Vec::new();
1118    };
1119    if let Some(named) = &row.named {
1120        return named
1121            .iter()
1122            .map(|(name, value)| (name.clone(), value.clone()))
1123            .collect();
1124    }
1125    if let Some(schema) = &row.schema {
1126        return schema
1127            .iter()
1128            .zip(row.columns.iter())
1129            .map(|(name, value)| (name.clone(), value.clone()))
1130            .collect();
1131    }
1132    Vec::new()
1133}
1134
1135#[inline]
1136fn visibility_check(ctx: &SnapshotContext, xmin: u64, xmax: u64) -> bool {
1137    // Writer aborted → tuple never existed from any future reader's view.
1138    // Checked *before* the own-xids fast path so an aborted own-sub-xid
1139    // (rolled-back savepoint) stays hidden from the parent.
1140    if xmin != 0 && ctx.manager.is_aborted(xmin) {
1141        return false;
1142    }
1143    // Deleter aborted → treat xmax as unset; fall back to xmin-only check.
1144    let effective_xmax = if xmax != 0 && ctx.manager.is_aborted(xmax) {
1145        0
1146    } else {
1147        xmax
1148    };
1149    // Phase 2.3.2e: own-tx writes are always visible to the connection
1150    // that stamped them, even when xmin/xmax exceed `snapshot.xid` (as
1151    // happens for sub-xids allocated by SAVEPOINT after BEGIN).
1152    let own_xmin = xmin != 0 && ctx.own_xids.contains(&xmin);
1153    let own_xmax = effective_xmax != 0 && ctx.own_xids.contains(&effective_xmax);
1154    if own_xmax {
1155        // This connection deleted the row via this xid — hide it from self.
1156        return false;
1157    }
1158    if own_xmin {
1159        return true;
1160    }
1161    ctx.snapshot.sees(xmin, effective_xmax)
1162}
1163
1164fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
1165    runtime
1166        .inner
1167        .pool
1168        .lock()
1169        .unwrap_or_else(|poisoned| poisoned.into_inner())
1170}
1171
1172fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
1173    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
1174        return;
1175    }
1176    scopes.insert(name.to_string());
1177}
1178
1179fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
1180    match query.source.as_ref() {
1181        Some(crate::storage::query::ast::TableSource::Name(name)) => {
1182            cache_scope_insert(scopes, name)
1183        }
1184        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
1185            collect_query_expr_result_cache_scopes(scopes, subquery);
1186        }
1187        None => cache_scope_insert(scopes, &query.table),
1188    }
1189}
1190
1191fn collect_vector_source_scopes(
1192    scopes: &mut HashSet<String>,
1193    source: &crate::storage::query::ast::VectorSource,
1194) {
1195    match source {
1196        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
1197            cache_scope_insert(scopes, collection);
1198        }
1199        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
1200            collect_query_expr_result_cache_scopes(scopes, subquery);
1201        }
1202        crate::storage::query::ast::VectorSource::Literal(_)
1203        | crate::storage::query::ast::VectorSource::Text(_) => {}
1204    }
1205}
1206
1207fn collect_path_selector_scopes(
1208    scopes: &mut HashSet<String>,
1209    selector: &crate::storage::query::ast::NodeSelector,
1210) {
1211    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
1212        cache_scope_insert(scopes, table);
1213    }
1214}
1215
1216fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
1217    match expr {
1218        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
1219        QueryExpr::Join(query) => {
1220            collect_query_expr_result_cache_scopes(scopes, &query.left);
1221            collect_query_expr_result_cache_scopes(scopes, &query.right);
1222        }
1223        QueryExpr::Path(query) => {
1224            collect_path_selector_scopes(scopes, &query.from);
1225            collect_path_selector_scopes(scopes, &query.to);
1226        }
1227        QueryExpr::Vector(query) => {
1228            cache_scope_insert(scopes, &query.collection);
1229            collect_vector_source_scopes(scopes, &query.query_vector);
1230        }
1231        QueryExpr::Hybrid(query) => {
1232            collect_query_expr_result_cache_scopes(scopes, &query.structured);
1233            cache_scope_insert(scopes, &query.vector.collection);
1234            collect_vector_source_scopes(scopes, &query.vector.query_vector);
1235        }
1236        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
1237        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
1238        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
1239        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
1240        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
1241        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
1242        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
1243        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
1244        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
1245        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
1246        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
1247        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
1248        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
1249        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
1250        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
1251        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
1252        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1253        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1254        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
1255        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
1256        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
1257        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
1258        QueryExpr::QueueCommand(query) => match query {
1259            QueueCommand::Push { queue, .. }
1260            | QueueCommand::Pop { queue, .. }
1261            | QueueCommand::Peek { queue, .. }
1262            | QueueCommand::Len { queue }
1263            | QueueCommand::Purge { queue }
1264            | QueueCommand::GroupCreate { queue, .. }
1265            | QueueCommand::GroupRead { queue, .. }
1266            | QueueCommand::Pending { queue, .. }
1267            | QueueCommand::Claim { queue, .. }
1268            | QueueCommand::Ack { queue, .. }
1269            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
1270            QueueCommand::Move {
1271                source,
1272                destination,
1273                ..
1274            } => {
1275                cache_scope_insert(scopes, source);
1276                cache_scope_insert(scopes, destination);
1277            }
1278        },
1279        QueryExpr::EventsBackfill(query) => {
1280            cache_scope_insert(scopes, &query.collection);
1281            cache_scope_insert(scopes, &query.target_queue);
1282        }
1283        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
1284        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
1285        QueryExpr::TreeCommand(query) => match query {
1286            TreeCommand::Insert { collection, .. }
1287            | TreeCommand::Move { collection, .. }
1288            | TreeCommand::Delete { collection, .. }
1289            | TreeCommand::Validate { collection, .. }
1290            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
1291        },
1292        QueryExpr::SearchCommand(query) => match query {
1293            SearchCommand::Similar { collection, .. }
1294            | SearchCommand::Hybrid { collection, .. }
1295            | SearchCommand::SpatialRadius { collection, .. }
1296            | SearchCommand::SpatialBbox { collection, .. }
1297            | SearchCommand::SpatialNearest { collection, .. } => {
1298                cache_scope_insert(scopes, collection);
1299            }
1300            SearchCommand::Text { collection, .. }
1301            | SearchCommand::Multimodal { collection, .. }
1302            | SearchCommand::Index { collection, .. }
1303            | SearchCommand::Context { collection, .. } => {
1304                if let Some(collection) = collection.as_deref() {
1305                    cache_scope_insert(scopes, collection);
1306                }
1307            }
1308        },
1309        QueryExpr::Ask(query) => {
1310            if let Some(collection) = query.collection.as_deref() {
1311                cache_scope_insert(scopes, collection);
1312            }
1313        }
1314        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
1315        QueryExpr::MaintenanceCommand(cmd) => match cmd {
1316            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
1317            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
1318                if let Some(t) = target {
1319                    cache_scope_insert(scopes, t);
1320                }
1321            }
1322        },
1323        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
1324        QueryExpr::CreateView(cmd) => {
1325            cache_scope_insert(scopes, &cmd.name);
1326            // Invalidating the view should also invalidate its dependencies.
1327            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
1328        }
1329        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
1330        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
1331        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1332        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1333        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
1334        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1335        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1336        QueryExpr::Graph(_)
1337        | QueryExpr::GraphCommand(_)
1338        | QueryExpr::ProbabilisticCommand(_)
1339        | QueryExpr::SetConfig { .. }
1340        | QueryExpr::ShowConfig { .. }
1341        | QueryExpr::SetSecret { .. }
1342        | QueryExpr::DeleteSecret { .. }
1343        | QueryExpr::ShowSecrets { .. }
1344        | QueryExpr::SetTenant(_)
1345        | QueryExpr::ShowTenant
1346        | QueryExpr::TransactionControl(_)
1347        | QueryExpr::CreateSchema(_)
1348        | QueryExpr::DropSchema(_)
1349        | QueryExpr::CreateSequence(_)
1350        | QueryExpr::DropSequence(_)
1351        | QueryExpr::Grant(_)
1352        | QueryExpr::Revoke(_)
1353        | QueryExpr::AlterUser(_)
1354        | QueryExpr::CreateIamPolicy { .. }
1355        | QueryExpr::DropIamPolicy { .. }
1356        | QueryExpr::AttachPolicy { .. }
1357        | QueryExpr::DetachPolicy { .. }
1358        | QueryExpr::ShowPolicies { .. }
1359        | QueryExpr::ShowEffectivePermissions { .. }
1360        | QueryExpr::SimulatePolicy { .. }
1361        | QueryExpr::CreateMigration(_)
1362        | QueryExpr::ApplyMigration(_)
1363        | QueryExpr::RollbackMigration(_)
1364        | QueryExpr::ExplainMigration(_)
1365        | QueryExpr::EventsBackfillStatus { .. } => {}
1366        QueryExpr::KvCommand(cmd) => {
1367            use crate::storage::query::ast::KvCommand;
1368            match cmd {
1369                KvCommand::Put { collection, .. }
1370                | KvCommand::InvalidateTags { collection, .. }
1371                | KvCommand::Get { collection, .. }
1372                | KvCommand::Unseal { collection, .. }
1373                | KvCommand::Rotate { collection, .. }
1374                | KvCommand::History { collection, .. }
1375                | KvCommand::List { collection, .. }
1376                | KvCommand::Purge { collection, .. }
1377                | KvCommand::Watch { collection, .. }
1378                | KvCommand::Delete { collection, .. }
1379                | KvCommand::Incr { collection, .. }
1380                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
1381            }
1382        }
1383        QueryExpr::ConfigCommand(cmd) => {
1384            use crate::storage::query::ast::ConfigCommand;
1385            match cmd {
1386                ConfigCommand::Put { collection, .. }
1387                | ConfigCommand::Get { collection, .. }
1388                | ConfigCommand::Resolve { collection, .. }
1389                | ConfigCommand::Rotate { collection, .. }
1390                | ConfigCommand::Delete { collection, .. }
1391                | ConfigCommand::History { collection, .. }
1392                | ConfigCommand::List { collection, .. }
1393                | ConfigCommand::Watch { collection, .. }
1394                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
1395                    cache_scope_insert(scopes, collection)
1396                }
1397            }
1398        }
1399    }
1400}
1401
1402/// Combine matching RLS policies for a table + action into a single
1403/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1404///
1405/// Returns `None` when RLS is disabled or no policy admits the caller's
1406/// role — callers use that to short-circuit the mutation (for DELETE /
1407/// UPDATE we simply skip the operation, which PG expresses as "no rows
1408/// match the policy + predicate combination").
1409pub(crate) fn rls_policy_filter(
1410    runtime: &RedDBRuntime,
1411    table: &str,
1412    action: crate::storage::query::ast::PolicyAction,
1413) -> Option<crate::storage::query::ast::Filter> {
1414    rls_policy_filter_for_kind(
1415        runtime,
1416        table,
1417        action,
1418        crate::storage::query::ast::PolicyTargetKind::Table,
1419    )
1420}
1421
1422/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1423/// Graph / vector / queue / timeseries scans pass the concrete kind;
1424/// policies targeting other kinds are ignored. Legacy Table-scoped
1425/// policies still apply cross-kind — callers register auto-tenancy
1426/// policies as Table today.
1427pub(crate) fn rls_policy_filter_for_kind(
1428    runtime: &RedDBRuntime,
1429    table: &str,
1430    action: crate::storage::query::ast::PolicyAction,
1431    kind: crate::storage::query::ast::PolicyTargetKind,
1432) -> Option<crate::storage::query::ast::Filter> {
1433    use crate::storage::query::ast::Filter;
1434
1435    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1436        return None;
1437    }
1438    let role = current_auth_identity().map(|(_, role)| role);
1439    let role_str = role.map(|r| r.as_str().to_string());
1440    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1441    if policies.is_empty() {
1442        return None;
1443    }
1444    policies
1445        .into_iter()
1446        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1447}
1448
1449/// Returns true when the table has RLS enforcement enabled. Convenience
1450/// shortcut so DML paths can gate the AND-combine work without reaching
1451/// into `runtime.inner.rls_enabled_tables` directly.
1452pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1453    runtime.inner.rls_enabled_tables.read().contains(table)
1454}
1455
1456/// Per-entity gate used by the graph materialiser for `GraphNode`
1457/// entities. RLS is checked against the source collection with
1458/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1459/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1460/// (for back-compat with auto-tenancy declarations). Cached per
1461/// collection so big graphs only resolve the policy chain once.
1462fn node_passes_rls(
1463    runtime: &RedDBRuntime,
1464    collection: &str,
1465    role: Option<&str>,
1466    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1467    entity: &crate::storage::unified::entity::UnifiedEntity,
1468) -> bool {
1469    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1470
1471    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1472        return true;
1473    }
1474    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1475        let policies = runtime.matching_rls_policies_for_kind(
1476            collection,
1477            role,
1478            PolicyAction::Select,
1479            PolicyTargetKind::Nodes,
1480        );
1481        if policies.is_empty() {
1482            None
1483        } else {
1484            policies
1485                .into_iter()
1486                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1487        }
1488    });
1489    let Some(filter) = filter else {
1490        return false;
1491    };
1492    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1493        Some(&runtime.inner.db),
1494        entity,
1495        filter,
1496        collection,
1497        collection,
1498    )
1499}
1500
1501/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1502/// `kind = Edges`.
1503fn edge_passes_rls(
1504    runtime: &RedDBRuntime,
1505    collection: &str,
1506    role: Option<&str>,
1507    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1508    entity: &crate::storage::unified::entity::UnifiedEntity,
1509) -> bool {
1510    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1511
1512    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1513        return true;
1514    }
1515    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1516        let policies = runtime.matching_rls_policies_for_kind(
1517            collection,
1518            role,
1519            PolicyAction::Select,
1520            PolicyTargetKind::Edges,
1521        );
1522        if policies.is_empty() {
1523            None
1524        } else {
1525            policies
1526                .into_iter()
1527                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1528        }
1529    });
1530    let Some(filter) = filter else {
1531        return false;
1532    };
1533    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1534        Some(&runtime.inner.db),
1535        entity,
1536        filter,
1537        collection,
1538        collection,
1539    )
1540}
1541
1542/// RLS policy injection (Phase 2.5.2 PG parity).
1543///
1544/// Fetch every matching policy for the current thread-local role and
1545/// fold them into the query's filter. Semantics mirror PostgreSQL:
1546///
1547/// * Multiple policies on the same table combine with **OR** — a row is
1548///   visible if *any* policy admits it.
1549/// * The combined policy predicate is **AND**-ed into the caller's
1550///   existing `WHERE` clause so explicit predicates continue to trim
1551///   the policy-allowed set.
1552/// * No matching policies + RLS enabled = zero rows (PG's
1553///   restrictive-default). Callers get `None` and return an empty
1554///   `UnifiedResult` without ever dispatching the scan.
1555///
1556/// This runs only when `RuntimeInner::rls_enabled_tables` already
1557/// contains the table name — callers gate the hot path upfront to
1558/// avoid the lock acquisition on tables without RLS.
1559///
1560/// Returns `None` when no policy admits the current role; returns
1561/// `Some(mutated_table)` with policy filters folded in otherwise.
1562fn inject_rls_filters(
1563    runtime: &RedDBRuntime,
1564    frame: &dyn super::statement_frame::ReadFrame,
1565    mut table: crate::storage::query::ast::TableQuery,
1566) -> Option<crate::storage::query::ast::TableQuery> {
1567    use crate::storage::query::ast::{Filter, PolicyAction};
1568
1569    // `None` role falls through to policies with no `TO role` clause.
1570    let role = frame.identity().map(|(_, role)| role);
1571    let role_str = role.map(|r| r.as_str().to_string());
1572    let policies =
1573        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1574
1575    if policies.is_empty() {
1576        // RLS enabled + no policy match = deny everything. Signal the
1577        // caller to short-circuit with an empty result set.
1578        return None;
1579    }
1580
1581    // Combine policy predicates with OR (PG's permissive default).
1582    let combined = policies
1583        .into_iter()
1584        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1585        .expect("policies non-empty");
1586
1587    // AND into the caller's existing predicate. The predicate may live
1588    // in `where_expr` rather than `filter`: `resolve_table_expr_subqueries`
1589    // nulls `filter` whenever `where_expr` is present (the case for a
1590    // view body rewritten into `SELECT … WHERE …`). Folding only into
1591    // `filter` here would silently drop that `where_expr` predicate at
1592    // eval time because `effective_table_filter` prefers `filter` —
1593    // e.g. `WITHIN TENANT … SELECT * FROM <view>` would apply the
1594    // tenant policy but lose the view's own WHERE (#635).
1595    use crate::storage::query::sql_lowering::{expr_to_filter, filter_to_expr};
1596    let had_where_expr = table.where_expr.is_some();
1597    let existing = table
1598        .filter
1599        .take()
1600        .or_else(|| table.where_expr.as_ref().map(expr_to_filter));
1601    let new_filter = match existing {
1602        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1603        None => combined,
1604    };
1605    // Keep `where_expr` in lock-step with the merged `filter` so
1606    // whichever the executor consults sees the full predicate.
1607    if had_where_expr {
1608        table.where_expr = Some(filter_to_expr(&new_filter));
1609    }
1610    table.filter = Some(new_filter);
1611    Some(table)
1612}
1613
1614/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1615/// predicate into the join's outer filter. Walking the merged record
1616/// at the join layer (rather than mutating the per-side scan filter)
1617/// keeps the planner's strategy choice and per-side index selection
1618/// undisturbed — the policy predicate uses the qualified `t.col` form
1619/// that resolves cleanly against the merged record's keys.
1620///
1621/// Returns `None` when any leaf has RLS enabled and no policy admits
1622/// the caller — the join short-circuits to an empty result.
1623fn inject_rls_into_join(
1624    runtime: &RedDBRuntime,
1625    frame: &dyn super::statement_frame::ReadFrame,
1626    mut join: crate::storage::query::ast::JoinQuery,
1627) -> Option<crate::storage::query::ast::JoinQuery> {
1628    use crate::storage::query::ast::Filter;
1629
1630    let mut policy_filters: Vec<Filter> = Vec::new();
1631    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1632        return None;
1633    }
1634    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1635        return None;
1636    }
1637
1638    if policy_filters.is_empty() {
1639        return Some(join);
1640    }
1641
1642    let combined = policy_filters
1643        .into_iter()
1644        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1645        .expect("policy_filters non-empty");
1646
1647    join.filter = Some(match join.filter.take() {
1648        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1649        None => combined,
1650    });
1651
1652    Some(join)
1653}
1654
1655/// For each `Table` leaf reachable through nested joins, append the
1656/// RLS-policy filter (combined with OR across that side's matching
1657/// policies) into `out`. Returns `false` when a side has RLS enabled
1658/// but no policy admits the caller — the join must short-circuit.
1659fn collect_join_side_policy(
1660    runtime: &RedDBRuntime,
1661    frame: &dyn super::statement_frame::ReadFrame,
1662    expr: &crate::storage::query::ast::QueryExpr,
1663    out: &mut Vec<crate::storage::query::ast::Filter>,
1664) -> bool {
1665    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1666    match expr {
1667        QueryExpr::Table(t) => {
1668            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1669                return true;
1670            }
1671            let role = frame.identity().map(|(_, role)| role);
1672            let role_str = role.map(|r| r.as_str().to_string());
1673            let policies =
1674                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1675            if policies.is_empty() {
1676                return false;
1677            }
1678            let combined = policies
1679                .into_iter()
1680                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1681                .expect("policies non-empty");
1682            out.push(combined);
1683            true
1684        }
1685        QueryExpr::Join(inner) => {
1686            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1687                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1688        }
1689        _ => true,
1690    }
1691}
1692
1693/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
1694///
1695/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
1696/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
1697/// materialises all rows. Projections are best-effort — when the query
1698/// lists explicit columns we keep only those; a `SELECT *` keeps every
1699/// wrapper-emitted field verbatim.
1700///
1701/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
1702/// the runtime will pass the compiled filter down instead of post-filtering.
1703fn apply_foreign_table_filters(
1704    records: Vec<crate::storage::query::unified::UnifiedRecord>,
1705    query: &crate::storage::query::ast::TableQuery,
1706) -> crate::storage::query::unified::UnifiedResult {
1707    use crate::storage::query::sql_lowering::{
1708        effective_table_filter, effective_table_projections,
1709    };
1710    use crate::storage::query::unified::UnifiedResult;
1711
1712    let filter = effective_table_filter(query);
1713    let projections = effective_table_projections(query);
1714
1715    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
1716    // match native-collection queries (same operators, same NULL handling).
1717    let mut filtered: Vec<_> = records
1718        .into_iter()
1719        .filter(|record| match &filter {
1720            Some(f) => {
1721                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
1722            }
1723            None => true,
1724        })
1725        .collect();
1726
1727    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
1728    if let Some(offset) = query.offset {
1729        let offset = offset as usize;
1730        if offset >= filtered.len() {
1731            filtered.clear();
1732        } else {
1733            filtered.drain(0..offset);
1734        }
1735    }
1736    if let Some(limit) = query.limit {
1737        filtered.truncate(limit as usize);
1738    }
1739
1740    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
1741    // the wrapper's column set; an explicit list trims to those names.
1742    let columns: Vec<String> = if projections.is_empty() {
1743        filtered
1744            .first()
1745            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
1746            .unwrap_or_default()
1747    } else {
1748        projections
1749            .iter()
1750            .map(super::join_filter::projection_name)
1751            .collect()
1752    };
1753
1754    let mut result = UnifiedResult::empty();
1755    result.columns = columns;
1756    result.records = filtered;
1757    result
1758}
1759
1760/// Collect every concrete table reference inside a `QueryExpr`.
1761///
1762/// Used by view bookkeeping (dependency tracking for materialised
1763/// invalidation) and any other rewriter that needs to know the base
1764/// tables a query pulls from. Does not descend into projections/filters;
1765/// only the `FROM` side.
1766pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1767    let mut scopes: HashSet<String> = HashSet::new();
1768    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1769    scopes.into_iter().collect()
1770}
1771
1772fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1773    let mut scopes = HashSet::new();
1774    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1775    scopes
1776}
1777
// Key string naming the result-cache backend setting.
// NOTE(review): presumably a runtime config key; its reader is not visible here.
const RESULT_CACHE_BACKEND_KEY: &str = "runtime.result_cache.backend";
// Backend name used by default.
// NOTE(review): presumably maps to `RuntimeResultCacheBackend::Legacy` — confirm at the parse site.
const RESULT_CACHE_DEFAULT_BACKEND: &str = "legacy";
// Namespace string for result-cache blobs.
// NOTE(review): presumably used when the blob-cache backend is selected — confirm.
const RESULT_CACHE_BLOB_NAMESPACE: &str = "runtime.result_cache";
// Time-to-live value in seconds.
// NOTE(review): presumably the lifetime of a cached result — confirm at the use site.
const RESULT_CACHE_TTL_SECS: u64 = 30;
// Upper bound on the number of map entries; `trim_result_cache` evicts
// oldest-first until the map is no larger than this.
const RESULT_CACHE_MAX_ENTRIES: usize = 1000;
// 8-byte magic written first by `encode_result_cache_payload` and required
// as a prefix by `decode_result_cache_payload`.
const RESULT_CACHE_PAYLOAD_MAGIC: &[u8; 8] = b"RDRC0001";
1784
/// Which backend serves the runtime result cache.
///
/// NOTE(review): the selection logic and the exact behaviour of each
/// variant are not visible in this part of the file; the per-variant
/// notes below are inferred from the names and should be confirmed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RuntimeResultCacheBackend {
    /// Presumably the original in-process cache (cf. the `"legacy"` default name).
    Legacy,
    /// Presumably the blob-cache-backed implementation.
    BlobCache,
    /// Presumably runs both backends side by side for comparison — TODO confirm.
    Shadow,
}
1791
1792fn trim_result_cache(
1793    map: &mut HashMap<String, RuntimeResultCacheEntry>,
1794    order: &mut std::collections::VecDeque<String>,
1795) {
1796    while map.len() > RESULT_CACHE_MAX_ENTRIES {
1797        if let Some(oldest) = order.pop_front() {
1798            map.remove(&oldest);
1799        } else {
1800            break;
1801        }
1802    }
1803}
1804
1805fn result_cache_fingerprint(result: &RuntimeQueryResult) -> String {
1806    format!(
1807        "{:?}|{}|{}|{}|{}|{:?}",
1808        result.result,
1809        result.query,
1810        result.statement,
1811        result.engine,
1812        result.affected_rows,
1813        result.statement_type
1814    )
1815}
1816
1817fn mode_to_byte(mode: crate::storage::query::modes::QueryMode) -> u8 {
1818    match mode {
1819        crate::storage::query::modes::QueryMode::Sql => 0,
1820        crate::storage::query::modes::QueryMode::Gremlin => 1,
1821        crate::storage::query::modes::QueryMode::Cypher => 2,
1822        crate::storage::query::modes::QueryMode::Sparql => 3,
1823        crate::storage::query::modes::QueryMode::Path => 4,
1824        crate::storage::query::modes::QueryMode::Natural => 5,
1825        crate::storage::query::modes::QueryMode::Unknown => 255,
1826    }
1827}
1828
1829fn mode_from_byte(byte: u8) -> Option<crate::storage::query::modes::QueryMode> {
1830    match byte {
1831        0 => Some(crate::storage::query::modes::QueryMode::Sql),
1832        1 => Some(crate::storage::query::modes::QueryMode::Gremlin),
1833        2 => Some(crate::storage::query::modes::QueryMode::Cypher),
1834        3 => Some(crate::storage::query::modes::QueryMode::Sparql),
1835        4 => Some(crate::storage::query::modes::QueryMode::Path),
1836        5 => Some(crate::storage::query::modes::QueryMode::Natural),
1837        255 => Some(crate::storage::query::modes::QueryMode::Unknown),
1838        _ => None,
1839    }
1840}
1841
/// Map a runtime string onto the matching `'static` literal.
///
/// The result-cache payload stores statement / engine / statement-type
/// tags as text, while the in-memory result carries them as
/// `&'static str`. This lookup recovers the static literal for a known
/// tag and returns `None` for anything else. Matching is exact and
/// case-sensitive.
fn result_cache_static_str(value: &str) -> Option<&'static str> {
    const KNOWN_TAGS: &[&str] = &[
        "select",
        "materialized-graph",
        "runtime-red-schema",
        "runtime-fdw",
        "runtime-table-rls",
        "runtime-table",
        "runtime-join-rls",
        "runtime-join",
        "runtime-vector",
        "runtime-hybrid",
        "runtime-secret",
        "runtime-config",
        "runtime-tenant",
        "runtime-explain",
        "runtime-tree",
        "runtime-kv",
        "runtime-queue",
    ];

    KNOWN_TAGS.iter().copied().find(|known| *known == value)
}
1864
/// Append `value` to `out` as a little-endian `u32`.
///
/// Returns `None`, leaving `out` untouched, when `value` does not fit in
/// 32 bits.
fn write_u32(out: &mut Vec<u8>, value: usize) -> Option<()> {
    match u32::try_from(value) {
        Ok(narrowed) => {
            out.extend_from_slice(&narrowed.to_le_bytes());
            Some(())
        }
        Err(_) => None,
    }
}
1870
1871fn write_string(out: &mut Vec<u8>, value: &str) -> Option<()> {
1872    write_u32(out, value.len())?;
1873    out.extend_from_slice(value.as_bytes());
1874    Some(())
1875}
1876
1877fn write_bytes(out: &mut Vec<u8>, value: &[u8]) -> Option<()> {
1878    write_u32(out, value.len())?;
1879    out.extend_from_slice(value);
1880    Some(())
1881}
1882
/// Pop one byte off the front of `input`.
///
/// Advances the cursor by one on success; returns `None` and leaves the
/// cursor untouched when `input` is empty.
fn read_u8(input: &mut &[u8]) -> Option<u8> {
    let data = *input;
    let first = *data.first()?;
    *input = &data[1..];
    Some(first)
}
1888
/// Read a little-endian `u32` from the front of `input`, widened to
/// `usize`.
///
/// Advances the cursor by four bytes on success; returns `None` without
/// moving the cursor when fewer than four bytes remain.
fn read_u32(input: &mut &[u8]) -> Option<usize> {
    let data = *input;
    if data.len() < 4 {
        return None;
    }
    let (head, tail) = data.split_at(4);
    let raw: [u8; 4] = head.try_into().ok()?;
    *input = tail;
    Some(u32::from_le_bytes(raw) as usize)
}
1897
/// Read a little-endian `u64` from the front of `input`.
///
/// Advances the cursor by eight bytes on success; returns `None` without
/// moving the cursor when fewer than eight bytes remain.
fn read_u64(input: &mut &[u8]) -> Option<u64> {
    let data = *input;
    if data.len() < 8 {
        return None;
    }
    let (head, tail) = data.split_at(8);
    let raw: [u8; 8] = head.try_into().ok()?;
    *input = tail;
    Some(u64::from_le_bytes(raw))
}
1906
1907fn read_string(input: &mut &[u8]) -> Option<String> {
1908    let len = read_u32(input)?;
1909    if input.len() < len {
1910        return None;
1911    }
1912    let value = String::from_utf8(input[..len].to_vec()).ok()?;
1913    *input = &input[len..];
1914    Some(value)
1915}
1916
1917fn read_bytes<'a>(input: &mut &'a [u8]) -> Option<&'a [u8]> {
1918    let len = read_u32(input)?;
1919    if input.len() < len {
1920        return None;
1921    }
1922    let value = &input[..len];
1923    *input = &input[len..];
1924    Some(value)
1925}
1926
1927fn encode_result_cache_payload(entry: &RuntimeResultCacheEntry) -> Option<Vec<u8>> {
1928    let result = &entry.result;
1929    if result.result.pre_serialized_json.is_some()
1930        || result_cache_static_str(result.statement).is_none()
1931        || result_cache_static_str(result.engine).is_none()
1932        || result_cache_static_str(result.statement_type).is_none()
1933        || result.result.records.iter().any(|record| {
1934            !record.nodes.is_empty()
1935                || !record.edges.is_empty()
1936                || !record.paths.is_empty()
1937                || !record.vector_results.is_empty()
1938        })
1939    {
1940        return None;
1941    }
1942
1943    let mut out = Vec::new();
1944    out.extend_from_slice(RESULT_CACHE_PAYLOAD_MAGIC);
1945    write_string(&mut out, &result.query)?;
1946    out.push(mode_to_byte(result.mode));
1947    write_string(&mut out, result.statement)?;
1948    write_string(&mut out, result.engine)?;
1949    out.extend_from_slice(&result.affected_rows.to_le_bytes());
1950    write_string(&mut out, result.statement_type)?;
1951
1952    write_u32(&mut out, result.result.columns.len())?;
1953    for column in &result.result.columns {
1954        write_string(&mut out, column)?;
1955    }
1956    out.extend_from_slice(&result.result.stats.nodes_scanned.to_le_bytes());
1957    out.extend_from_slice(&result.result.stats.edges_scanned.to_le_bytes());
1958    out.extend_from_slice(&result.result.stats.rows_scanned.to_le_bytes());
1959    out.extend_from_slice(&result.result.stats.exec_time_us.to_le_bytes());
1960
1961    write_u32(&mut out, result.result.records.len())?;
1962    for record in &result.result.records {
1963        let fields = record.iter_fields().collect::<Vec<_>>();
1964        write_u32(&mut out, fields.len())?;
1965        for (name, value) in fields {
1966            write_string(&mut out, name)?;
1967            let mut encoded = Vec::new();
1968            crate::storage::schema::value_codec::encode(value, &mut encoded);
1969            write_bytes(&mut out, &encoded)?;
1970        }
1971    }
1972
1973    write_u32(&mut out, entry.scopes.len())?;
1974    for scope in &entry.scopes {
1975        write_string(&mut out, scope)?;
1976    }
1977    Some(out)
1978}
1979
1980fn decode_result_cache_payload(mut input: &[u8]) -> Option<(RuntimeQueryResult, HashSet<String>)> {
1981    if input.len() < RESULT_CACHE_PAYLOAD_MAGIC.len()
1982        || &input[..RESULT_CACHE_PAYLOAD_MAGIC.len()] != RESULT_CACHE_PAYLOAD_MAGIC
1983    {
1984        return None;
1985    }
1986    input = &input[RESULT_CACHE_PAYLOAD_MAGIC.len()..];
1987
1988    let query = read_string(&mut input)?;
1989    let mode = mode_from_byte(read_u8(&mut input)?)?;
1990    let statement = result_cache_static_str(&read_string(&mut input)?)?;
1991    let engine = result_cache_static_str(&read_string(&mut input)?)?;
1992    let affected_rows = read_u64(&mut input)?;
1993    let statement_type = result_cache_static_str(&read_string(&mut input)?)?;
1994
1995    let mut columns = Vec::new();
1996    for _ in 0..read_u32(&mut input)? {
1997        columns.push(read_string(&mut input)?);
1998    }
1999    let stats = crate::storage::query::unified::QueryStats {
2000        nodes_scanned: read_u64(&mut input)?,
2001        edges_scanned: read_u64(&mut input)?,
2002        rows_scanned: read_u64(&mut input)?,
2003        exec_time_us: read_u64(&mut input)?,
2004    };
2005
2006    let mut records = Vec::new();
2007    for _ in 0..read_u32(&mut input)? {
2008        let mut record = crate::storage::query::unified::UnifiedRecord::new();
2009        for _ in 0..read_u32(&mut input)? {
2010            let name = read_string(&mut input)?;
2011            let bytes = read_bytes(&mut input)?;
2012            let (value, used) = crate::storage::schema::value_codec::decode(bytes).ok()?;
2013            if used != bytes.len() {
2014                return None;
2015            }
2016            record.set_owned(name, value);
2017        }
2018        records.push(record);
2019    }
2020
2021    let mut scopes = HashSet::new();
2022    for _ in 0..read_u32(&mut input)? {
2023        scopes.insert(read_string(&mut input)?);
2024    }
2025    if !input.is_empty() {
2026        return None;
2027    }
2028
2029    Some((
2030        RuntimeQueryResult {
2031            query,
2032            mode,
2033            statement,
2034            engine,
2035            result: crate::storage::query::unified::UnifiedResult {
2036                columns,
2037                records,
2038                stats,
2039                pre_serialized_json: None,
2040            },
2041            affected_rows,
2042            statement_type,
2043        },
2044        scopes,
2045    ))
2046}
2047
/// If `sql` starts with the keyword `EXPLAIN` (ASCII case-insensitive)
/// followed by a statement, return that inner statement with leading
/// whitespace removed; otherwise return `None`.
///
/// Two follow-up tokens are deliberately left to the normal SQL parser
/// and also yield `None`:
///
/// * `ALTER` — `EXPLAIN ALTER FOR CREATE TABLE ...` is a separate
///   schema-diff command.
/// * `ASK` — `EXPLAIN ASK` is an executable read path: it runs retrieval
///   and provider selection, then short-circuits before the LLM call.
///
/// A bare `EXPLAIN` with nothing after it also returns `None`. Trailing
/// whitespace of the inner statement is preserved.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    // Without any whitespace there is no room for an inner statement.
    let (keyword, tail) = sql.trim_start().split_once(char::is_whitespace)?;
    if !keyword.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    let inner = tail.trim_start();
    // Peek the first token of the inner statement; `None` here means the
    // remainder was empty.
    let next_token = inner.split_whitespace().next()?;
    let deferred = ["ALTER", "ASK"]
        .iter()
        .any(|kw| next_token.eq_ignore_ascii_case(kw));

    if deferred {
        None
    } else {
        Some(inner)
    }
}
2086
2087/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
2088/// CTE-aware parse in `execute_query` without paying for a full
2089/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
2090/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
2091pub(super) fn has_with_prefix(sql: &str) -> bool {
2092    let trimmed = sql.trim_start();
2093    let head_end = trimmed
2094        .find(|c: char| c.is_whitespace() || c == '(')
2095        .unwrap_or(trimmed.len());
2096    trimmed[..head_end].eq_ignore_ascii_case("WITH")
2097}
2098
2099/// If the query is a plain SELECT whose top-level `TableQuery`
2100/// carries an `AS OF` clause, return a typed spec that the runtime
2101/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
2102/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
2103/// back to the connection's regular MVCC snapshot. A cheap textual
2104/// prefilter skips the parse entirely when the source doesn't
2105/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
2106fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
2107    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
2108}
2109
2110/// Same as `peek_top_level_as_of` but also returns the table name
2111/// targeted by the AS OF clause (when the FROM clause names a
2112/// concrete table). `None` for the table slot means scalar SELECT
2113/// or a subquery source — callers treat those as "no enforcement".
2114pub(super) fn peek_top_level_as_of_with_table(
2115    sql: &str,
2116) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
2117    if !sql
2118        .as_bytes()
2119        .windows(5)
2120        .any(|w| w.eq_ignore_ascii_case(b"as of"))
2121    {
2122        return None;
2123    }
2124    let parsed = crate::storage::query::parser::parse(sql).ok()?;
2125    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
2126        return None;
2127    };
2128    let clause = table.as_of?;
2129    let table_name = if table.table.is_empty() || table.table == "any" {
2130        None
2131    } else {
2132        Some(table.table.clone())
2133    };
2134    let spec = match clause {
2135        crate::storage::query::ast::AsOfClause::Commit(h) => {
2136            crate::application::vcs::AsOfSpec::Commit(h)
2137        }
2138        crate::storage::query::ast::AsOfClause::Branch(b) => {
2139            crate::application::vcs::AsOfSpec::Branch(b)
2140        }
2141        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
2142        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
2143            crate::application::vcs::AsOfSpec::TimestampMs(ts)
2144        }
2145        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
2146            crate::application::vcs::AsOfSpec::Snapshot(x)
2147        }
2148    };
2149    Some((spec, table_name))
2150}
2151
2152pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
2153    // Lowercase the bytes up to the first null/newline into a small
2154    // stack buffer for cheap contains() checks. Most SQL fits in the
2155    // buffer; longer queries fall back to owned lowercase.
2156    const VOLATILE_TOKENS: &[&str] = &[
2157        "pg_advisory_lock",
2158        "pg_try_advisory_lock",
2159        "pg_advisory_unlock",
2160        "random()",
2161        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
2162        // omitted for now — they ARE volatile but today's tests rely
2163        // on caching them. Revisit once a tighter volatility story
2164        // lands.
2165    ];
2166    let lowered = sql.to_ascii_lowercase();
2167    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
2168}
2169
2170pub(super) fn query_is_ask_statement(sql: &str) -> bool {
2171    let trimmed = sql.trim_start();
2172    let head_end = trimmed
2173        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
2174        .unwrap_or(trimmed.len());
2175    trimmed[..head_end].eq_ignore_ascii_case("ASK")
2176}
2177
2178/// Pick the `(global_mode, collection_mode)` pair for an expression,
2179/// or `None` for variants that opt out of intent-locking entirely
2180/// (admin statements like `SHOW CONFIG`, transaction control, tenant
2181/// toggles).
2182///
2183/// Phase-1 contract:
2184/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
2185/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
2186/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
2187pub(super) fn intent_lock_modes_for(
2188    expr: &QueryExpr,
2189) -> Option<(
2190    crate::storage::transaction::lock::LockMode,
2191    crate::storage::transaction::lock::LockMode,
2192)> {
2193    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
2194
2195    match expr {
2196        // Reads — IS / IS.
2197        QueryExpr::Table(_)
2198        | QueryExpr::Join(_)
2199        | QueryExpr::Vector(_)
2200        | QueryExpr::Hybrid(_)
2201        | QueryExpr::Graph(_)
2202        | QueryExpr::Path(_)
2203        | QueryExpr::Ask(_)
2204        | QueryExpr::SearchCommand(_)
2205        | QueryExpr::GraphCommand(_)
2206        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
2207
2208        // Writes — IX / IX. Non-tabular mutations (vector insert,
2209        // graph node insert, queue push, timeseries point insert)
2210        // don't carry their own dispatch arm here; they ride through
2211        // the Insert variant or a command variant covered by the
2212        // read-side arm above. P1.T4 expands only the TableQuery-ish
2213        // writes; non-tabular kinds inherit when their DML variants
2214        // land in later phases.
2215        QueryExpr::Insert(_)
2216        | QueryExpr::Update(_)
2217        | QueryExpr::Delete(_)
2218        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
2219            Some((IntentExclusive, IntentExclusive))
2220        }
2221        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
2222
2223        // DDL — IX / X. A DDL against collection `c` blocks all
2224        // other writers + readers on `c` but leaves other collections
2225        // running (because Global stays IX, not X).
2226        QueryExpr::CreateTable(_)
2227        | QueryExpr::CreateCollection(_)
2228        | QueryExpr::CreateVector(_)
2229        | QueryExpr::DropTable(_)
2230        | QueryExpr::DropGraph(_)
2231        | QueryExpr::DropVector(_)
2232        | QueryExpr::DropDocument(_)
2233        | QueryExpr::DropKv(_)
2234        | QueryExpr::DropCollection(_)
2235        | QueryExpr::Truncate(_)
2236        | QueryExpr::AlterTable(_)
2237        | QueryExpr::CreateIndex(_)
2238        | QueryExpr::DropIndex(_)
2239        | QueryExpr::CreateTimeSeries(_)
2240        | QueryExpr::DropTimeSeries(_)
2241        | QueryExpr::CreateQueue(_)
2242        | QueryExpr::AlterQueue(_)
2243        | QueryExpr::DropQueue(_)
2244        | QueryExpr::CreateTree(_)
2245        | QueryExpr::DropTree(_)
2246        | QueryExpr::CreatePolicy(_)
2247        | QueryExpr::DropPolicy(_)
2248        | QueryExpr::CreateView(_)
2249        | QueryExpr::DropView(_)
2250        | QueryExpr::RefreshMaterializedView(_)
2251        | QueryExpr::CreateSchema(_)
2252        | QueryExpr::DropSchema(_)
2253        | QueryExpr::CreateSequence(_)
2254        | QueryExpr::DropSequence(_)
2255        | QueryExpr::CreateServer(_)
2256        | QueryExpr::DropServer(_)
2257        | QueryExpr::CreateForeignTable(_)
2258        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
2259
2260        // Admin / control — skip intent locks. `SET TENANT`,
2261        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
2262        // `VACUUM`, etc. don't touch collection data the same way
2263        // and the existing transaction layer already serialises the
2264        // pieces that matter.
2265        _ => None,
2266    }
2267}
2268
2269/// Best-effort collection inventory for an expression. Used to pick
2270/// `Collection(...)` resources for the intent-lock guard. Overshoots
2271/// are fine (take an extra IS, benign); undershoots leak writes past
2272/// DDL X locks, so err on the side of listing more names.
2273pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
2274    let mut out = Vec::new();
2275    walk_collections(expr, &mut out);
2276    out.sort();
2277    out.dedup();
2278    out
2279}
2280
2281fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
2282    match expr {
2283        QueryExpr::Table(t) => out.push(t.table.clone()),
2284        QueryExpr::Join(j) => {
2285            walk_collections(&j.left, out);
2286            walk_collections(&j.right, out);
2287        }
2288        QueryExpr::Insert(i) => out.push(i.table.clone()),
2289        QueryExpr::Update(u) => out.push(u.table.clone()),
2290        QueryExpr::Delete(d) => out.push(d.table.clone()),
2291        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
2292
2293        // DDL — include the target collection so DDL takes
2294        // `(Collection, X)` and blocks concurrent readers / writers
2295        // on the same collection. Other collections stay live
2296        // because Global is still IX.
2297        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
2298        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
2299        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
2300        QueryExpr::DropTable(q) => out.push(q.name.clone()),
2301        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
2302        QueryExpr::DropVector(q) => out.push(q.name.clone()),
2303        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
2304        QueryExpr::DropKv(q) => out.push(q.name.clone()),
2305        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
2306        QueryExpr::Truncate(q) => out.push(q.name.clone()),
2307        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
2308        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
2309        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
2310        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
2311        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
2312        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
2313        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
2314        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
2315        QueryExpr::QueueCommand(QueueCommand::Move {
2316            source,
2317            destination,
2318            ..
2319        }) => {
2320            out.push(source.clone());
2321            out.push(destination.clone());
2322        }
2323        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
2324        QueryExpr::CreateView(q) => out.push(q.name.clone()),
2325        QueryExpr::DropView(q) => out.push(q.name.clone()),
2326        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
2327
2328        // Vector / Hybrid / Graph / Path / commands reference
2329        // collections through fields whose shape varies; without a
2330        // uniform accessor we fall back to the global lock only —
2331        // benign because every runtime path still holds the global
2332        // mode.
2333        _ => {}
2334    }
2335}
2336
2337impl RedDBRuntime {
2338    pub fn in_memory() -> RedDBResult<Self> {
2339        Self::with_options(RedDBOptions::in_memory())
2340    }
2341
2342    /// Handle to the intent-lock manager for tests + introspection.
2343    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
2344    /// rather than touching the manager directly.
2345    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
2346        self.inner.lock_manager.clone()
2347    }
2348
2349    /// Process-local governance registry for managed policy/config guardrails.
2350    pub fn config_registry(&self) -> std::sync::Arc<crate::auth::registry::ConfigRegistry> {
2351        self.inner.config_registry.clone()
2352    }
2353
2354    pub fn query_audit(&self) -> std::sync::Arc<crate::runtime::query_audit::QueryAuditStream> {
2355        self.inner.query_audit.clone()
2356    }
2357
2358    pub fn control_events_require_persistence(&self) -> bool {
2359        self.inner.control_event_config.require_persistence()
2360    }
2361
2362    pub fn control_event_config(&self) -> crate::runtime::control_events::ControlEventConfig {
2363        self.inner.control_event_config
2364    }
2365
2366    pub fn control_event_ledger(
2367        &self,
2368    ) -> Arc<dyn crate::runtime::control_events::ControlEventLedger> {
2369        self.inner.control_event_ledger.read().clone()
2370    }
2371
2372    #[doc(hidden)]
2373    pub fn replace_control_event_ledger_for_tests(
2374        &self,
2375        ledger: Arc<dyn crate::runtime::control_events::ControlEventLedger>,
2376    ) {
2377        *self.inner.control_event_ledger.write() = ledger;
2378    }
2379
2380    #[inline(never)]
2381    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
2382        Self::with_pool(options, ConnectionPoolConfig::default())
2383    }
2384
2385    pub fn with_pool(
2386        options: RedDBOptions,
2387        pool_config: ConnectionPoolConfig,
2388    ) -> RedDBResult<Self> {
2389        // PLAN.md Phase 9.1 — capture wall-clock before storage
2390        // open so the cold-start phase markers can be backfilled
2391        // once Lifecycle is constructed below. Storage open
2392        // encapsulates auto-restore + WAL replay; we treat the
2393        // whole window as one combined "restore" + "wal_replay"
2394        // phase split at the same boundary because the storage
2395        // layer doesn't yet emit a finer signal.
2396        let boot_open_start_ms = std::time::SystemTime::now()
2397            .duration_since(std::time::UNIX_EPOCH)
2398            .map(|d| d.as_millis() as u64)
2399            .unwrap_or(0);
2400        let db = Arc::new(
2401            RedDB::open_with_options(&options)
2402                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2403        );
2404        let result_blob_cache = crate::storage::cache::BlobCache::open_with_l2(
2405            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2406                options
2407                    .resolved_path("data.rdb")
2408                    .with_extension("result-cache.l2"),
2409            ),
2410        )
2411        .map_err(|err| {
2412            RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}"))
2413        })?;
2414        let storage_ready_ms = std::time::SystemTime::now()
2415            .duration_since(std::time::UNIX_EPOCH)
2416            .map(|d| d.as_millis() as u64)
2417            .unwrap_or(0);
2418
2419        let runtime = Self {
2420            inner: Arc::new(RuntimeInner {
2421                db: db.clone(),
2422                layout: PhysicalLayout::from_options(&options),
2423                indices: IndexCatalog::register_default_vector_graph(
2424                    options.has_capability(crate::api::Capability::Table),
2425                    options.has_capability(crate::api::Capability::Graph),
2426                ),
2427                pool_config,
2428                pool: Mutex::new(PoolState::default()),
2429                started_at_unix_ms: SystemTime::now()
2430                    .duration_since(UNIX_EPOCH)
2431                    .unwrap_or_default()
2432                    .as_millis(),
2433                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2434                index_store: super::index_store::IndexStore::new(),
2435                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2436                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2437                query_cache: parking_lot::RwLock::new(
2438                    crate::storage::query::planner::cache::PlanCache::new(1000),
2439                ),
2440                result_cache: parking_lot::RwLock::new((
2441                    HashMap::new(),
2442                    std::collections::VecDeque::new(),
2443                )),
2444                result_blob_cache,
2445                result_blob_entries: parking_lot::RwLock::new((
2446                    HashMap::new(),
2447                    std::collections::VecDeque::new(),
2448                )),
2449                ask_answer_cache_entries: parking_lot::RwLock::new((
2450                    HashSet::new(),
2451                    std::collections::VecDeque::new(),
2452                )),
2453                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2454                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2455                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2456                rmw_locks: RmwLockTable::new(),
2457                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2458                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2459                config_registry: Arc::new(crate::auth::registry::ConfigRegistry::new()),
2460                ec_worker: crate::ec::worker::EcWorker::new(),
2461                auth_store: parking_lot::RwLock::new(None),
2462                oauth_validator: parking_lot::RwLock::new(None),
2463                views: parking_lot::RwLock::new(HashMap::new()),
2464                materialized_views: parking_lot::RwLock::new(
2465                    crate::storage::cache::result::MaterializedViewCache::new(),
2466                ),
2467                retention_sweeper: parking_lot::RwLock::new(
2468                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
2469                ),
2470                snapshot_manager: Arc::new(
2471                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2472                ),
2473                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2474                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2475                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2476                lock_manager: Arc::new({
2477                    // Sourced from the matrix: Tier B key
2478                    // `concurrency.locking.deadlock_timeout_ms`
2479                    // (default 5000). Env var wins at boot so
2480                    // operators can tune without touching red_config.
2481                    let env = crate::runtime::config_overlay::collect_env_overrides();
2482                    let timeout_ms = env
2483                        .get("concurrency.locking.deadlock_timeout_ms")
2484                        .and_then(|raw| raw.parse::<u64>().ok())
2485                        .unwrap_or_else(|| {
2486                            match crate::runtime::config_matrix::default_for(
2487                                "concurrency.locking.deadlock_timeout_ms",
2488                            ) {
2489                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2490                                _ => 5000,
2491                            }
2492                        });
2493                    let cfg = crate::storage::transaction::lock::LockConfig {
2494                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2495                        ..Default::default()
2496                    };
2497                    crate::storage::transaction::lock::LockManager::new(cfg)
2498                }),
2499                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2500                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2501                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2502                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2503                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2504                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2505                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2506                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2507                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2508                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2509                    &options,
2510                )),
2511                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2512                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2513                audit_log: {
2514                    // Default audit-log path for the in-memory case
2515                    // sits in the system temp dir; persistent runs
2516                    // place it next to data.rdb.
2517                    //
2518                    // gh-471 iter 2: route through the resolved
2519                    // `LogDestination`. Performance/Max tiers emit a
2520                    // `File(...)` under `<dbname>.rdb.red/logs/`;
2521                    // lower tiers / ephemeral runs report `Stderr`
2522                    // and we keep the legacy file-next-to-data sink.
2523                    let data_path = options
2524                        .data_path
2525                        .clone()
2526                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2527                    let (audit_dest, _) = crate::api::tier_wiring::current_log_destinations();
2528                    Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
2529                        &audit_dest,
2530                        &data_path,
2531                    ))
2532                },
2533                control_event_ledger: parking_lot::RwLock::new(Arc::new(
2534                    crate::runtime::control_events::RuntimeLedger::new(db.store()),
2535                )),
2536                control_event_config: options.control_events,
2537                query_audit: Arc::new(crate::runtime::query_audit::QueryAuditStream::new(
2538                    db.store(),
2539                    options.query_audit.clone(),
2540                )),
2541                lease_lifecycle: std::sync::OnceLock::new(),
2542                replica_apply_metrics: crate::replication::logical::ReplicaApplyMetrics::default(),
2543                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2544                schema_vocabulary: parking_lot::RwLock::new(
2545                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2546                ),
2547                slow_query_logger: {
2548                    // Issue #205 — slow-query sink lives in the same
2549                    // directory the audit log uses, so backup/restore
2550                    // ships them together. Threshold + sample-pct
2551                    // default conservatively (1 s, 100% sampling) so
2552                    // emitted lines are rare and complete. Operators
2553                    // tune via env / config matrix in a follow-up.
2554                    //
2555                    // gh-471 iter 2: same routing as the audit log —
2556                    // `LogDestination::File(...)` for Performance/Max
2557                    // lands under `<dbname>.rdb.red/logs/slow.log`;
2558                    // lower tiers fall back to `red-slow.log` in the
2559                    // data directory.
2560                    let fallback_dir = options
2561                        .data_path
2562                        .as_ref()
2563                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2564                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2565                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2566                        .ok()
2567                        .and_then(|s| s.parse::<u64>().ok())
2568                        .unwrap_or(1000);
2569                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2570                        .ok()
2571                        .and_then(|s| s.parse::<u8>().ok())
2572                        .unwrap_or(100);
2573                    let (_, slow_dest) = crate::api::tier_wiring::current_log_destinations();
2574                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
2575                        &slow_dest,
2576                        &fallback_dir,
2577                        threshold_ms,
2578                        sample_pct,
2579                    )
2580                },
2581                kv_stats: crate::runtime::KvStatsCounters::default(),
2582                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
2583                metrics_tenant_activity_stats:
2584                    crate::runtime::MetricsTenantActivityCounters::default(),
2585                queue_telemetry: Arc::new(
2586                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
2587                ),
2588                kv_tag_index: crate::runtime::KvTagIndex::default(),
2589                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
2590                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
2591            }),
2592        };
2593
2594        // Issue #205 — install the process-wide OperatorEvent sink so
2595        // emit sites buried in storage / replication / signal handlers
2596        // can record without threading an `&AuditLogger` through every
2597        // call stack. First registration wins; subsequent in-memory
2598        // runtimes (test harnesses) fall through to tracing+eprintln.
2599        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2600            &runtime.inner.audit_log,
2601        ));
2602
2603        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2604        // from the wall-clock captured before storage open. The
2605        // entire `RedDB::open_with_options` call covers both
2606        // auto-restore (when configured) and WAL replay. We
2607        // record both phases against the same boundary today;
2608        // a follow-up will split them once the storage layer
2609        // surfaces a finer-grained event.
2610        runtime
2611            .inner
2612            .lifecycle
2613            .set_restore_started_at_ms(boot_open_start_ms);
2614        runtime
2615            .inner
2616            .lifecycle
2617            .set_restore_ready_at_ms(storage_ready_ms);
2618        runtime
2619            .inner
2620            .lifecycle
2621            .set_wal_replay_started_at_ms(boot_open_start_ms);
2622        runtime
2623            .inner
2624            .lifecycle
2625            .set_wal_replay_ready_at_ms(storage_ready_ms);
2626
2627        let restored_cdc_lsn = runtime
2628            .inner
2629            .db
2630            .replication
2631            .as_ref()
2632            .map(|repl| {
2633                repl.logical_wal_spool
2634                    .as_ref()
2635                    .map(|spool| spool.current_lsn())
2636                    .unwrap_or(0)
2637            })
2638            .unwrap_or(0)
2639            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2640        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2641        runtime.rehydrate_snapshot_xid_floor();
2642        runtime.bootstrap_system_keyed_collections()?;
2643        runtime.rehydrate_declared_column_schemas();
2644        runtime.load_probabilistic_state()?;
2645
2646        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2647        // tables declared via `TENANT BY (col)` survive restart. Each
2648        // entry re-registers the auto-policy and flips RLS on again.
2649        runtime.rehydrate_tenant_tables();
2650        // Issue #593 slice 9a — replay persisted materialized-view
2651        // descriptors so `CREATE MATERIALIZED VIEW v AS …` survives a
2652        // restart. Runs after the system-keyed collections bootstrap
2653        // and before the API opens.
2654        runtime.rehydrate_materialized_view_descriptors();
2655        if let Some(repl) = &runtime.inner.db.replication {
2656            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2657        }
2658
2659        // Save system info to red_config on boot
2660        {
2661            let sys = SystemInfo::collect();
2662            runtime.inner.db.store().set_config_tree(
2663                "red.system",
2664                &crate::serde_json::json!({
2665                    "pid": sys.pid,
2666                    "cpu_cores": sys.cpu_cores,
2667                    "total_memory_bytes": sys.total_memory_bytes,
2668                    "available_memory_bytes": sys.available_memory_bytes,
2669                    "os": sys.os,
2670                    "arch": sys.arch,
2671                    "hostname": sys.hostname,
2672                    "started_at": SystemTime::now()
2673                        .duration_since(UNIX_EPOCH)
2674                        .unwrap_or_default()
2675                        .as_millis() as u64
2676                }),
2677            );
2678
2679            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2680            let store = runtime.inner.db.store();
2681            if store
2682                .get_collection("red_config")
2683                .map(|m| m.query_all(|_| true).len())
2684                .unwrap_or(0)
2685                <= 10
2686            {
2687                store.set_config_tree("red.ai", &crate::json!({
2688                    "default": crate::json!({
2689                        "provider": "openai",
2690                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2691                    }),
2692                    "max_embedding_inputs": 256,
2693                    "max_prompt_batch": 256,
2694                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2695                }));
2696                store.set_config_tree(
2697                    "red.server",
2698                    &crate::json!({
2699                        "max_scan_limit": 1000,
2700                        "max_body_size": 1048576,
2701                        "read_timeout_ms": 5000,
2702                        "write_timeout_ms": 5000
2703                    }),
2704                );
2705                store.set_config_tree(
2706                    "red.storage",
2707                    &crate::json!({
2708                        "page_size": 4096,
2709                        "page_cache_capacity": 100000,
2710                        "auto_checkpoint_pages": 1000,
2711                        "snapshot_retention": 16,
2712                        "verify_checksums": true,
2713                        "segment": crate::json!({
2714                            "max_entities": 100000,
2715                            "max_bytes": 268435456_u64,
2716                            "compression_level": 6
2717                        }),
2718                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2719                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2720                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2721                    }),
2722                );
2723                store.set_config_tree(
2724                    "red.search",
2725                    &crate::json!({
2726                        "rag": crate::json!({
2727                            "max_chunks_per_source": 10,
2728                            "max_total_chunks": 25,
2729                            "similarity_threshold": 0.8,
2730                            "graph_depth": 2,
2731                            "min_relevance": 0.3
2732                        }),
2733                        "fusion": crate::json!({
2734                            "vector_weight": 0.5,
2735                            "graph_weight": 0.3,
2736                            "table_weight": 0.2,
2737                            "dedup_threshold": 0.85
2738                        })
2739                    }),
2740                );
2741                store.set_config_tree(
2742                    "red.auth",
2743                    &crate::json!({
2744                        "enabled": false,
2745                        "session_ttl_secs": 3600,
2746                        "require_auth": false
2747                    }),
2748                );
2749                store.set_config_tree(
2750                    "red.query",
2751                    &crate::json!({
2752                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2753                        "max_recursion_depth": 1000
2754                    }),
2755                );
2756                store.set_config_tree(
2757                    "red.indexes",
2758                    &crate::json!({
2759                        "auto_select": true,
2760                        "bloom_filter": crate::json!({
2761                            "enabled": true,
2762                            "false_positive_rate": 0.01,
2763                            "prune_on_scan": true
2764                        }),
2765                        "hash": crate::json!({ "enabled": true }),
2766                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2767                        "spatial": crate::json!({ "enabled": true })
2768                    }),
2769                );
2770                store.set_config_tree(
2771                    "red.memtable",
2772                    &crate::json!({
2773                        "enabled": true,
2774                        "max_bytes": 67108864_u64,
2775                        "flush_threshold": 0.75
2776                    }),
2777                );
2778                store.set_config_tree(
2779                    "red.probabilistic",
2780                    &crate::json!({
2781                        "hll_registers": 16384,
2782                        "sketch_default_width": 1000,
2783                        "sketch_default_depth": 5,
2784                        "filter_default_capacity": 100000
2785                    }),
2786                );
2787                store.set_config_tree(
2788                    "red.timeseries",
2789                    &crate::json!({
2790                        "default_chunk_size": 1024,
2791                        "compression": crate::json!({
2792                            "timestamps": "delta_of_delta",
2793                            "values": "gorilla_xor"
2794                        }),
2795                        "default_retention_days": 0
2796                    }),
2797                );
2798                store.set_config_tree(
2799                    "red.queue",
2800                    &crate::json!({
2801                        "default_max_size": 0,
2802                        "default_max_attempts": 3,
2803                        "visibility_timeout_ms": 30000,
2804                        "consumer_idle_timeout_ms": 60000
2805                    }),
2806                );
2807                store.set_config_tree(
2808                    "red.backup",
2809                    &crate::json!({
2810                        "enabled": false,
2811                        "interval_secs": 3600,
2812                        "retention_count": 24,
2813                        "upload": false,
2814                        "backend": "local"
2815                    }),
2816                );
2817                store.set_config_tree(
2818                    "red.wal",
2819                    &crate::json!({
2820                        "archive": crate::json!({
2821                            "enabled": false,
2822                            "retention_hours": 168,
2823                            "prefix": "wal/"
2824                        })
2825                    }),
2826                );
2827                store.set_config_tree(
2828                    "red.cdc",
2829                    &crate::json!({
2830                        "enabled": true,
2831                        "buffer_size": 100000
2832                    }),
2833                );
2834                store.set_config_tree(
2835                    "red.config.secret",
2836                    &crate::json!({
2837                        "auto_encrypt": true,
2838                        "auto_decrypt": true
2839                    }),
2840                );
2841            }
2842
2843            // Perf-parity config matrix: heal the Tier A (critical)
2844            // keys unconditionally on every boot. Idempotent — only
2845            // writes the default when the key is missing. Keeps
2846            // `SHOW CONFIG` showing every guarantee the operator has
2847            // (durability.mode, concurrency.locking.enabled, …) even
2848            // on long-running datadirs that predate the matrix.
2849            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2850
2851            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2852            // `storage.btree.lehman_yao` value from the matrix (env
2853            // > file > red_config > default) and publish it to the
2854            // storage layer's atomic so the B-tree read / split
2855            // paths can branch without re-reading the config on
2856            // every hot-path call.
2857            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2858            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2859            if lehman_yao {
2860                tracing::info!(
2861                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2862                );
2863            }
2864
2865            // Config file overlay — mounted `/etc/reddb/config.json`
2866            // (override path via REDDB_CONFIG_FILE). Writes keys with
2867            // write-if-absent semantics so a later user `SET CONFIG`
2868            // always wins. Missing file = silent no-op.
2869            let overlay_path = crate::runtime::config_overlay::config_file_path();
2870            let _ =
2871                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2872        }
2873
2874        // VCS ("Git for Data") — create the `red_*` metadata
2875        // collections on first boot. Idempotent: `get_or_create_collection`
2876        // is a no-op if the collection already exists.
2877        {
2878            let store = runtime.inner.db.store();
2879            for name in crate::application::vcs_collections::ALL {
2880                let _ = store.get_or_create_collection(*name);
2881            }
2882            // Seed VCS config namespace with sensible defaults on first
2883            // boot, matching the pattern used by red.ai / red.storage.
2884            store.set_config_tree(
2885                crate::application::vcs_collections::CONFIG_NAMESPACE,
2886                &crate::json!({
2887                    "default_branch": "main",
2888                    "author": crate::json!({
2889                        "name": "reddb",
2890                        "email": "reddb@localhost"
2891                    }),
2892                    "protected_branches": crate::json!(["main"]),
2893                    "closure": crate::json!({
2894                        "enabled": true,
2895                        "lazy": true
2896                    }),
2897                    "merge": crate::json!({
2898                        "default_strategy": "auto",
2899                        "fast_forward": true
2900                    })
2901                }),
2902            );
2903        }
2904
2905        // Migrations — create the `red_migrations` / `red_migration_deps`
2906        // system collections on first boot. Idempotent.
2907        {
2908            let store = runtime.inner.db.store();
2909            for name in crate::application::migration_collections::ALL {
2910                let _ = store.get_or_create_collection(*name);
2911            }
2912        }
2913
2914        // Start background maintenance thread (context index refresh +
2915        // session purge). Held by a WEAK reference to `RuntimeInner`
2916        // so dropping the last `RedDBRuntime` handle actually releases
2917        // the underlying Arc<Pager> (and its file lock). Polling at
2918        // 200ms means shutdown latency is bounded; the real 60-second
2919        // work cadence is tracked independently via a `last_work`
2920        // timestamp.
2921        //
2922        // The previous version captured `rt = runtime.clone()` by
2923        // strong reference and ran an unterminated `loop`, which held
2924        // Arc<RuntimeInner> forever — reopening a persistent database
2925        // in the same process failed with "Database is locked" because
2926        // the pager could never drop. See the regression test
2927        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2928        {
2929            let weak = Arc::downgrade(&runtime.inner);
2930            std::thread::Builder::new()
2931                .name("reddb-maintenance".into())
2932                .spawn(move || {
2933                    let tick = std::time::Duration::from_millis(200);
2934                    let work_interval = std::time::Duration::from_secs(60);
2935                    let mut last_work = std::time::Instant::now();
2936                    loop {
2937                        std::thread::sleep(tick);
2938                        let Some(inner) = weak.upgrade() else {
2939                            // All strong references dropped — the
2940                            // runtime is gone, exit cleanly.
2941                            break;
2942                        };
2943                        if last_work.elapsed() >= work_interval {
2944                            let _stats = inner.db.store().context_index().stats();
2945                            last_work = std::time::Instant::now();
2946                        }
2947                    }
2948                })
2949                .ok();
2950        }
2951
2952        // Start backup scheduler if enabled via red_config
2953        {
2954            let store = runtime.inner.db.store();
2955            let mut backup_enabled = false;
2956            let mut backup_interval = 3600u64;
2957
2958            if let Some(manager) = store.get_collection("red_config") {
2959                manager.for_each_entity(|entity| {
2960                    if let Some(row) = entity.data.as_row() {
2961                        let key = row.get_field("key").and_then(|v| match v {
2962                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2963                            _ => None,
2964                        });
2965                        let val = row.get_field("value");
2966                        if key == Some("red.config.backup.enabled") {
2967                            backup_enabled = match val {
2968                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2969                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2970                                _ => false,
2971                            };
2972                        } else if key == Some("red.config.backup.interval_secs") {
2973                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2974                                backup_interval = *n as u64;
2975                            }
2976                        }
2977                    }
2978                    true
2979                });
2980            }
2981
2982            if backup_enabled {
2983                runtime.inner.backup_scheduler.set_interval(backup_interval);
2984                let rt = runtime.clone();
2985                runtime
2986                    .inner
2987                    .backup_scheduler
2988                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
2989            }
2990        }
2991
2992        // Load EC registry from red_config and start worker
2993        {
2994            runtime
2995                .inner
2996                .ec_registry
2997                .load_from_config_store(runtime.inner.db.store().as_ref());
2998            if !runtime.inner.ec_registry.async_configs().is_empty() {
2999                runtime.inner.ec_worker.start(
3000                    Arc::clone(&runtime.inner.ec_registry),
3001                    Arc::clone(&runtime.inner.db.store()),
3002                );
3003            }
3004        }
3005
3006        if let crate::replication::ReplicationRole::Replica { primary_addr } =
3007            runtime.inner.db.options().replication.role.clone()
3008        {
3009            let rt = runtime.clone();
3010            std::thread::Builder::new()
3011                .name("reddb-replica".into())
3012                .spawn(move || rt.run_replica_loop(primary_addr))
3013                .ok();
3014        }
3015
3016        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
3017        // boot stage above has completed (WAL replay, restore-from-
3018        // remote, replica-loop spawn). Health probes flip from 503 to
3019        // 200 here; shutdown begins from this state.
3020        runtime.inner.lifecycle.mark_ready();
3021
3022        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
3023        // Low-priority background ticker that drains the cache's
3024        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
3025        // so the thread exits cleanly when the runtime drops (≤50ms
3026        // latency between drop and exit). Materialized views without
3027        // a `REFRESH EVERY` clause stay on the manual-refresh path
3028        // and are skipped by `claim_due_at`, so the loop is a no-op
3029        // when no scheduled views exist.
3030        {
3031            let weak_inner = Arc::downgrade(&runtime.inner);
3032            std::thread::Builder::new()
3033                .name("reddb-mv-scheduler".into())
3034                .spawn(move || loop {
3035                    std::thread::sleep(std::time::Duration::from_millis(50));
3036                    let Some(inner) = weak_inner.upgrade() else {
3037                        break;
3038                    };
3039                    let rt = RedDBRuntime { inner };
3040                    rt.refresh_due_materialized_views();
3041                })
3042                .ok();
3043        }
3044
3045        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
3046        // Low-priority ticker that physically reclaims rows whose
3047        // timestamp has fallen beyond the retention window. Holds a
3048        // `Weak<RuntimeInner>` so the thread exits within one tick of
3049        // the runtime drop (graceful shutdown leaves storage consistent
3050        // because each tick goes through the standard DELETE path —
3051        // there is no half-finished mutation state to clean up). The
3052        // tick interval is intentionally longer than the MV scheduler
3053        // (500ms) because retention is order-of-seconds at minimum.
3054        {
3055            let weak_inner = Arc::downgrade(&runtime.inner);
3056            std::thread::Builder::new()
3057                .name("reddb-retention-sweeper".into())
3058                .spawn(move || loop {
3059                    std::thread::sleep(std::time::Duration::from_millis(500));
3060                    let Some(inner) = weak_inner.upgrade() else {
3061                        break;
3062                    };
3063                    let rt = RedDBRuntime { inner };
3064                    rt.sweep_retention_tick(
3065                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
3066                    );
3067                })
3068                .ok();
3069        }
3070
3071        Ok(runtime)
3072    }
3073
3074    fn rehydrate_snapshot_xid_floor(&self) {
3075        let store = self.inner.db.store();
3076        for collection in store.list_collections() {
3077            let Some(manager) = store.get_collection(&collection) else {
3078                continue;
3079            };
3080            for entity in manager.query_all(|_| true) {
3081                self.inner
3082                    .snapshot_manager
3083                    .observe_committed_xid(entity.xmin);
3084                self.inner
3085                    .snapshot_manager
3086                    .observe_committed_xid(entity.xmax);
3087            }
3088        }
3089    }
3090
3091    /// Provision an empty Table-shaped collection that backs a
3092    /// `CREATE MATERIALIZED VIEW v` (issue #594 slice 9b of #575).
3093    /// `SELECT FROM v` reads this collection directly; the rewriter is
3094    /// configured to skip materialized views so the body is no longer
3095    /// substituted. REFRESH still writes to the cache slot — wiring it
3096    /// into this backing collection is the job of slice 9c.
3097    ///
3098    /// Idempotent: re-running for the same name leaves the existing
3099    /// collection in place (mirrors `CREATE TABLE IF NOT EXISTS`
3100    /// semantics). This keeps `CREATE OR REPLACE MATERIALIZED VIEW v`
3101    /// cheap — the body change does not invalidate already-buffered
3102    /// rows. Until 9c lands the backing is always empty anyway.
3103    pub(crate) fn ensure_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3104        let store = self.inner.db.store();
3105        let mut changed = false;
3106        if store.get_collection(name).is_none() {
3107            store.get_or_create_collection(name);
3108            changed = true;
3109        }
3110        if self.inner.db.collection_contract(name).is_none() {
3111            self.inner
3112                .db
3113                .save_collection_contract(system_keyed_collection_contract(
3114                    name,
3115                    crate::catalog::CollectionModel::Table,
3116                ))
3117                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3118            changed = true;
3119        }
3120        if changed {
3121            self.inner
3122                .db
3123                .persist_metadata()
3124                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3125        }
3126        Ok(())
3127    }
3128
3129    /// Inverse of [`ensure_materialized_view_backing`] — drops the
3130    /// backing collection on `DROP MATERIALIZED VIEW v`. No-op when
3131    /// the collection was never created (e.g. a `DROP MATERIALIZED
3132    /// VIEW IF EXISTS v` against an unknown name).
3133    pub(crate) fn drop_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3134        let store = self.inner.db.store();
3135        if store.get_collection(name).is_none() {
3136            return Ok(());
3137        }
3138        store
3139            .drop_collection(name)
3140            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3141        // The contract may have been dropped already (DROP TABLE path)
3142        // — ignore "not found" errors by checking presence first.
3143        if self.inner.db.collection_contract(name).is_some() {
3144            self.inner
3145                .db
3146                .remove_collection_contract(name)
3147                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3148        }
3149        self.invalidate_result_cache();
3150        self.inner
3151            .db
3152            .persist_metadata()
3153            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3154        Ok(())
3155    }
3156
3157    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
3158        let mut changed = false;
3159        for (name, model) in [
3160            ("red.config", crate::catalog::CollectionModel::Config),
3161            ("red.vault", crate::catalog::CollectionModel::Vault),
3162            // Issue #593 — materialized-view catalog. One row per
3163            // `CREATE MATERIALIZED VIEW`; rehydrated at boot before
3164            // the API opens.
3165            (
3166                crate::runtime::continuous_materialized_view::CATALOG_COLLECTION,
3167                crate::catalog::CollectionModel::Config,
3168            ),
3169        ] {
3170            if self.inner.db.store().get_collection(name).is_none() {
3171                self.inner.db.store().get_or_create_collection(name);
3172                changed = true;
3173            }
3174            if self.inner.db.collection_contract(name).is_none() {
3175                self.inner
3176                    .db
3177                    .save_collection_contract(system_keyed_collection_contract(name, model))
3178                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
3179                changed = true;
3180            }
3181        }
3182        if changed {
3183            self.inner
3184                .db
3185                .persist_metadata()
3186                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3187        }
3188        Ok(())
3189    }
3190
3191    pub fn db(&self) -> Arc<RedDB> {
3192        Arc::clone(&self.inner.db)
3193    }
3194
3195    /// Direct access to the runtime's secondary-index store.
3196    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
3197    /// wire bulk) that need to push new rows through the per-index
3198    /// maintenance hook after `store.bulk_insert` returns.
3199    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
3200        &self.inner.index_store
3201    }
3202
3203    /// Apply a DDL event to the schema-vocabulary reverse index
3204    /// (issue #120). Called by DDL execution paths after the catalog
3205    /// mutation has succeeded so the index never holds entries for
3206    /// half-applied DDL.
3207    pub(crate) fn schema_vocabulary_apply(
3208        &self,
3209        event: crate::runtime::schema_vocabulary::DdlEvent,
3210    ) {
3211        self.inner.schema_vocabulary.write().on_ddl(event);
3212    }
3213
3214    /// Lookup `token` in the schema-vocabulary reverse index. Returns
3215    /// an owned `Vec<VocabHit>` because the underlying read lock
3216    /// cannot be borrowed across the call boundary; the slice from
3217    /// `SchemaVocabulary::lookup` is cloned per hit.
3218    pub fn schema_vocabulary_lookup(
3219        &self,
3220        token: &str,
3221    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
3222        self.inner.schema_vocabulary.read().lookup(token).to_vec()
3223    }
3224
3225    /// Inject an AuthStore into the runtime. Called by server boot
3226    /// after the vault has been bootstrapped, so that `Value::Secret`
3227    /// auto-encrypt/decrypt can reach the vault AES key.
3228    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
3229        *self.inner.auth_store.write() = Some(store);
3230    }
3231
3232    /// Snapshot the current AuthStore (if any). Used by the wire listener
3233    /// to validate bearer tokens issued via HTTP `/auth/login`.
3234    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
3235        self.inner.auth_store.read().clone()
3236    }
3237
3238    /// Read a vault KV secret from the configured AuthStore, if present.
3239    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
3240        self.inner
3241            .auth_store
3242            .read()
3243            .as_ref()
3244            .and_then(|store| store.vault_kv_get(key))
3245    }
3246
3247    /// Write a vault KV secret and fail if the encrypted vault write is
3248    /// unavailable or cannot be made durable.
3249    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
3250        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
3251            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
3252        })?;
3253        store
3254            .vault_kv_try_set(key, value)
3255            .map_err(|err| RedDBError::Query(err.to_string()))
3256    }
3257
3258    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
3259    /// wire transports try OAuth JWT validation before falling back to
3260    /// the local AuthStore lookup. Pass `None` to disable.
3261    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
3262        *self.inner.oauth_validator.write() = validator;
3263    }
3264
3265    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
3266    /// Hot path: called per HTTP request when an Authorization header
3267    /// is present, so we hand back a cheap Arc clone.
3268    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
3269        self.inner.oauth_validator.read().clone()
3270    }
3271
3272    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
3273    /// store is wired and a key has been generated. Used by the
3274    /// `Value::Secret` encrypt/decrypt pipeline.
3275    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
3276        let guard = self.inner.auth_store.read();
3277        guard.as_ref().and_then(|s| s.vault_secret_key())
3278    }
3279
3280    /// Resolve a boolean flag from `red_config`. Defaults to `default`
3281    /// when the key is missing or not coercible. If the same key has
3282    /// been written multiple times (SET CONFIG appends new rows), the
3283    /// most recent entity wins. Env-var overrides
3284    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
3285    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
3286        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3287            if let Some(crate::storage::schema::Value::Boolean(b)) =
3288                crate::runtime::config_overlay::coerce_env_value(key, raw)
3289            {
3290                return b;
3291            }
3292        }
3293        let store = self.inner.db.store();
3294        let Some(manager) = store.get_collection("red_config") else {
3295            return default;
3296        };
3297        let mut result = default;
3298        let mut latest_id: u64 = 0;
3299        manager.for_each_entity(|entity| {
3300            if let Some(row) = entity.data.as_row() {
3301                let entry_key = row.get_field("key").and_then(|v| match v {
3302                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3303                    _ => None,
3304                });
3305                if entry_key == Some(key) {
3306                    let id = entity.id.raw();
3307                    if id >= latest_id {
3308                        latest_id = id;
3309                        result = match row.get_field("value") {
3310                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
3311                            Some(crate::storage::schema::Value::Text(s)) => {
3312                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
3313                            }
3314                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
3315                            _ => default,
3316                        };
3317                    }
3318                }
3319            }
3320            true
3321        });
3322        result
3323    }
3324
3325    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
3326        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3327            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
3328                crate::runtime::config_overlay::coerce_env_value(key, raw)
3329            {
3330                return n;
3331            }
3332        }
3333        let store = self.inner.db.store();
3334        let Some(manager) = store.get_collection("red_config") else {
3335            return default;
3336        };
3337        let mut result = default;
3338        let mut latest_id: u64 = 0;
3339        manager.for_each_entity(|entity| {
3340            if let Some(row) = entity.data.as_row() {
3341                let entry_key = row.get_field("key").and_then(|v| match v {
3342                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3343                    _ => None,
3344                });
3345                if entry_key == Some(key) {
3346                    let id = entity.id.raw();
3347                    if id >= latest_id {
3348                        latest_id = id;
3349                        result = match row.get_field("value") {
3350                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
3351                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
3352                            Some(crate::storage::schema::Value::Text(s)) => {
3353                                s.parse::<u64>().unwrap_or(default)
3354                            }
3355                            _ => default,
3356                        };
3357                    }
3358                }
3359            }
3360            true
3361        });
3362        result
3363    }
3364
3365    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
3366        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3367            if let Ok(n) = raw.parse::<f64>() {
3368                return n;
3369            }
3370        }
3371        let store = self.inner.db.store();
3372        let Some(manager) = store.get_collection("red_config") else {
3373            return default;
3374        };
3375        let mut result = default;
3376        let mut latest_id: u64 = 0;
3377        manager.for_each_entity(|entity| {
3378            if let Some(row) = entity.data.as_row() {
3379                let entry_key = row.get_field("key").and_then(|v| match v {
3380                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3381                    _ => None,
3382                });
3383                if entry_key == Some(key) {
3384                    let id = entity.id.raw();
3385                    if id >= latest_id {
3386                        latest_id = id;
3387                        result = match row.get_field("value") {
3388                            Some(crate::storage::schema::Value::Float(n)) => *n,
3389                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
3390                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
3391                            Some(crate::storage::schema::Value::Text(s)) => {
3392                                s.parse::<f64>().unwrap_or(default)
3393                            }
3394                            _ => default,
3395                        };
3396                    }
3397                }
3398            }
3399            true
3400        });
3401        result
3402    }
3403
3404    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
3405        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3406            return raw.clone();
3407        }
3408        let store = self.inner.db.store();
3409        let Some(manager) = store.get_collection("red_config") else {
3410            return default.to_string();
3411        };
3412        let mut result = default.to_string();
3413        let mut latest_id: u64 = 0;
3414        manager.for_each_entity(|entity| {
3415            if let Some(row) = entity.data.as_row() {
3416                let entry_key = row.get_field("key").and_then(|v| match v {
3417                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3418                    _ => None,
3419                });
3420                if entry_key == Some(key) {
3421                    let id = entity.id.raw();
3422                    if id >= latest_id {
3423                        latest_id = id;
3424                        if let Some(crate::storage::schema::Value::Text(value)) =
3425                            row.get_field("value")
3426                        {
3427                            result = value.to_string();
3428                        }
3429                    }
3430                }
3431            }
3432            true
3433        });
3434        result
3435    }
3436
3437    fn latest_metadata_for(
3438        &self,
3439        collection: &str,
3440        entity_id: u64,
3441    ) -> Option<crate::serde_json::Value> {
3442        self.inner
3443            .db
3444            .store()
3445            .get_metadata(collection, EntityId::new(entity_id))
3446            .map(|metadata| metadata_to_json(&metadata))
3447    }
3448
3449    fn persist_replica_lsn(&self, lsn: u64) {
3450        self.inner.db.store().set_config_tree(
3451            "red.replication",
3452            &crate::json!({
3453                "last_applied_lsn": lsn
3454            }),
3455        );
3456    }
3457
3458    fn persist_replication_health(
3459        &self,
3460        state: &str,
3461        last_error: &str,
3462        primary_lsn: Option<u64>,
3463        oldest_available_lsn: Option<u64>,
3464    ) {
3465        self.inner.db.store().set_config_tree(
3466            "red.replication",
3467            &crate::json!({
3468                "state": state,
3469                "last_error": last_error,
3470                "last_seen_primary_lsn": primary_lsn.unwrap_or(0),
3471                "last_seen_oldest_lsn": oldest_available_lsn.unwrap_or(0),
3472                "updated_at_unix_ms": SystemTime::now()
3473                    .duration_since(UNIX_EPOCH)
3474                    .unwrap_or_default()
3475                    .as_millis() as u64
3476            }),
3477        );
3478    }
3479
3480    /// Whether `SECRET('...')` literals should be encrypted with the
3481    /// vault AES key on INSERT. Default `true`.
3482    pub(crate) fn secret_auto_encrypt(&self) -> bool {
3483        self.config_bool("red.config.secret.auto_encrypt", true)
3484    }
3485
3486    /// Whether `Value::Secret` columns should be decrypted back to
3487    /// plaintext on SELECT when the vault is unsealed. Default `true`.
3488    /// Turning this off keeps secrets masked as `***` even while the
3489    /// vault is open — useful for audit trails or read-only exports.
3490    pub(crate) fn secret_auto_decrypt(&self) -> bool {
3491        self.config_bool("red.config.secret.auto_decrypt", true)
3492    }
3493
3494    /// Walk every record in `result` and swap `Value::Secret(bytes)`
3495    /// for the decrypted plaintext when the runtime has the vault
3496    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
3497    /// key is missing, the vault is sealed, or auto_decrypt is off,
3498    /// secrets are left as `Value::Secret` which every formatter
3499    /// (Display, JSON) already masks as `***`.
3500    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
3501        if !self.secret_auto_decrypt() {
3502            return;
3503        }
3504        let Some(key) = self.secret_aes_key() else {
3505            return;
3506        };
3507        for record in result.result.records.iter_mut() {
3508            for value in record.values_mut() {
3509                if let Value::Secret(ref bytes) = value {
3510                    if let Some(plain) =
3511                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
3512                    {
3513                        if let Ok(text) = String::from_utf8(plain) {
3514                            *value = Value::text(text);
3515                        }
3516                    }
3517                }
3518            }
3519        }
3520    }
3521
3522    /// Emit a CDC change event and replicate to WAL buffer.
3523    /// Create a `MutationEngine` bound to this runtime.
3524    ///
3525    /// The engine is cheap to construct (no allocation) and should be
3526    /// dropped after `apply` returns. Use this from application-layer
3527    /// `create_row` / `create_rows_batch` instead of calling
3528    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
3529    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
3530        crate::runtime::mutation::MutationEngine::new(self)
3531    }
3532
3533    /// Public-mutation gate snapshot (PLAN.md W1).
3534    ///
3535    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
3536    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
3537    /// maintenance, serverless lifecycle) call `check_write` before
3538    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
3539    /// instance running as a replica or with `options.read_only =
3540    /// true`. The replica internal logical-WAL apply path reaches into
3541    /// the store directly and never calls this method, so legitimate
3542    /// replica catch-up still works.
3543    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
3544        self.inner.write_gate.check(kind)
3545    }
3546
3547    /// Read-only handle to the gate, useful for transports that want
3548    /// to surface the policy in health/status output without taking on
3549    /// a dependency on the concrete enum.
3550    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
3551        &self.inner.write_gate
3552    }
3553
3554    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
3555    /// admin/shutdown, and signal handlers consult this single
3556    /// state machine.
3557    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
3558        &self.inner.lifecycle
3559    }
3560
3561    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
3562    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
3563        &self.inner.resource_limits
3564    }
3565
3566    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
3567    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
3568        &self.inner.audit_log
3569    }
3570
3571    /// Shared `Arc` to the audit logger — used by collaborators (the
3572    /// lease lifecycle, future request-context plumbing) that need to
3573    /// keep the logger alive past the runtime's stack frame.
3574    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
3575        Arc::clone(&self.inner.audit_log)
3576    }
3577
3578    pub(crate) fn emit_control_event(
3579        &self,
3580        kind: crate::runtime::control_events::EventKind,
3581        outcome: crate::runtime::control_events::Outcome,
3582        action: &'static str,
3583        resource: Option<String>,
3584        reason: Option<String>,
3585        extra_fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
3586    ) -> RedDBResult<()> {
3587        use crate::runtime::control_events::{
3588            ActorRef, ControlEvent, ControlEventCtx, ControlEventLedger, Sensitivity,
3589        };
3590
3591        let tenant = current_tenant();
3592        let principal = current_auth_identity();
3593        let actor_user = principal
3594            .as_ref()
3595            .map(|(principal, _)| UserId::from_parts(tenant.as_deref(), principal));
3596        let actor = actor_user
3597            .as_ref()
3598            .map(ActorRef::User)
3599            .unwrap_or(ActorRef::Anonymous);
3600        let ctx = ControlEventCtx {
3601            actor,
3602            scope: tenant
3603                .as_ref()
3604                .map(|scope| std::borrow::Cow::Borrowed(scope.as_str())),
3605            request_id: Some(std::borrow::Cow::Owned(format!(
3606                "conn-{}",
3607                current_connection_id()
3608            ))),
3609            trace_id: None,
3610        };
3611        let mut fields = std::collections::HashMap::new();
3612        fields.insert(
3613            "connection_id".to_string(),
3614            Sensitivity::raw(current_connection_id().to_string()),
3615        );
3616        if let Some((_, role)) = principal {
3617            fields.insert("actor_role".to_string(), Sensitivity::raw(role.as_str()));
3618        }
3619        for (key, value) in extra_fields {
3620            fields.insert(key, value);
3621        }
3622        let event = ControlEvent {
3623            kind,
3624            outcome,
3625            action: std::borrow::Cow::Borrowed(action),
3626            resource,
3627            reason,
3628            matched_policy_id: None,
3629            fields,
3630        };
3631        let ledger = self.inner.control_event_ledger.read();
3632        match ledger.emit(&ctx, event) {
3633            Ok(_) => Ok(()),
3634            Err(err) if self.inner.control_event_config.require_persistence() => {
3635                Err(RedDBError::Internal(err.to_string()))
3636            }
3637            Err(_) => Ok(()),
3638        }
3639    }
3640
3641    fn policy_mutation_control_ctx<'a>(
3642        &self,
3643        actor: &'a crate::auth::UserId,
3644        tenant: Option<&'a str>,
3645    ) -> crate::runtime::control_events::ControlEventCtx<'a> {
3646        crate::runtime::control_events::ControlEventCtx {
3647            actor: crate::runtime::control_events::ActorRef::User(actor),
3648            scope: tenant.map(std::borrow::Cow::Borrowed),
3649            request_id: Some(std::borrow::Cow::Owned(format!(
3650                "conn-{}",
3651                current_connection_id()
3652            ))),
3653            trace_id: None,
3654        }
3655    }
3656
3657    fn emit_query_audit(
3658        &self,
3659        query: &str,
3660        plan: &QueryAuditPlan,
3661        duration_ms: u64,
3662        result: &RuntimeQueryResult,
3663    ) {
3664        if !self.inner.query_audit.has_rules() {
3665            return;
3666        }
3667        let actor = current_auth_identity().map(|(principal, _)| principal);
3668        let tenant = current_tenant();
3669        let row_count = if result.statement_type == "select" {
3670            result.result.records.len() as u64
3671        } else {
3672            result.affected_rows
3673        };
3674        self.inner
3675            .query_audit
3676            .emit(crate::runtime::query_audit::QueryAuditEvent {
3677                actor,
3678                tenant,
3679                statement_kind: plan.statement_kind,
3680                touched_collections: plan.collections.clone(),
3681                duration_ms,
3682                row_count,
3683                request_id: Some(crate::crypto::uuid::Uuid::new_v7().to_string()),
3684                query_hash: Some(blake3::hash(query.as_bytes()).to_hex().to_string()),
3685            });
3686    }
3687
3688    /// Slice 10 of issue #527 — shared queue telemetry counters
3689    /// (delivered/acked/nacked). Cloned by `queue_delivery.rs` on
3690    /// each transition.
3691    pub(crate) fn queue_telemetry(
3692        &self,
3693    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
3694        &self.inner.queue_telemetry
3695    }
3696
3697    /// Snapshots of the queue telemetry counters in label-deterministic
3698    /// order for `/metrics` rendering and the integration test.
3699    pub fn queue_telemetry_snapshot(
3700        &self,
3701    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3702        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3703            delivered: self.inner.queue_telemetry.delivered_snapshot(),
3704            acked: self.inner.queue_telemetry.acked_snapshot(),
3705            nacked: self.inner.queue_telemetry.nacked_snapshot(),
3706        }
3707    }
3708
3709    /// Slice 10 of issue #527 — render-time scan of pending entries
3710    /// per (queue, group) for the `queue_pending_gauge` exposition.
3711    /// Walks `red_queue_meta` live so the gauge cannot drift from
3712    /// the source of truth.
3713    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
3714        let store = self.inner.db.store();
3715        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
3716            .into_iter()
3717            .collect()
3718    }
3719
3720    /// Shared `Arc` to the write gate. Same rationale as
3721    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
3722    /// thread) need a clone-cheap handle they can move into a
3723    /// background thread.
3724    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
3725        Arc::clone(&self.inner.write_gate)
3726    }
3727
3728    /// Serverless writer-lease state machine. `None` when the operator
3729    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
3730    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3731        self.inner.lease_lifecycle.get()
3732    }
3733
3734    /// Install the lease lifecycle. Idempotent; subsequent calls
3735    /// return the previously stored value untouched.
3736    pub fn set_lease_lifecycle(
3737        &self,
3738        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
3739    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3740        self.inner.lease_lifecycle.set(lifecycle)
3741    }
3742
3743    /// Reject the call when the requested batch size exceeds
3744    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
3745    /// shaped so the HTTP layer can map it to 413 Payload Too
3746    /// Large (PLAN.md Phase 4.1).
3747    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
3748        if self.inner.resource_limits.batch_size_exceeded(requested) {
3749            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
3750            return Err(RedDBError::QuotaExceeded(format!(
3751                "max_batch_size:{requested}:{max}"
3752            )));
3753        }
3754        Ok(())
3755    }
3756
3757    /// Reject the call when the local DB file exceeds
3758    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
3759    /// the cost is a single `stat()` syscall, negligible against the
3760    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
3761    /// for HTTP 507 Insufficient Storage.
3762    pub fn check_db_size(&self) -> RedDBResult<()> {
3763        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
3764            return Ok(());
3765        };
3766        if limit == 0 {
3767            return Ok(());
3768        }
3769        let Some(path) = self.inner.db.path() else {
3770            return Ok(());
3771        };
3772        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
3773        if current > limit {
3774            return Err(RedDBError::QuotaExceeded(format!(
3775                "max_db_size_bytes:{current}:{limit}"
3776            )));
3777        }
3778        Ok(())
3779    }
3780
3781    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
3782    ///
3783    /// Steps, in order, all idempotent across re-entrant calls:
3784    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
3785    ///      observe `Stopped` after first finishes).
3786    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
3787    ///      every acked write is durable on disk.
3788    ///   3. If `backup_on_shutdown == true` and a remote backend is
3789    ///      configured, run a synchronous `trigger_backup()` so the
3790    ///      remote head reflects the final state.
3791    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
3792    ///      return the cached report without re-running anything.
3793    ///
3794    /// On any error, the runtime is still marked `Stopped` so the
3795    /// process can exit; the caller logs the error context but does
3796    /// not retry the same shutdown — the operator can inspect the
3797    /// report fields to see which step failed.
3798    pub fn graceful_shutdown(
3799        &self,
3800        backup_on_shutdown: bool,
3801    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
3802        if !self.inner.lifecycle.begin_shutdown() {
3803            // Someone else already shut down (or is in flight). Return
3804            // the cached report so the HTTP caller and SIGTERM handler
3805            // get the same idempotent answer.
3806            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
3807        }
3808
3809        let started_ms = std::time::SystemTime::now()
3810            .duration_since(std::time::UNIX_EPOCH)
3811            .map(|d| d.as_millis() as u64)
3812            .unwrap_or(0);
3813        let mut report = crate::runtime::lifecycle::ShutdownReport {
3814            started_at_ms: started_ms,
3815            ..Default::default()
3816        };
3817
3818        // Flush WAL + run any pending checkpoint. Local fsync is
3819        // unconditional — even a lease-lost replica needs its WAL on
3820        // disk before exit so a future restore has the latest tail.
3821        // The remote upload is gated separately so a lost-lease writer
3822        // doesn't clobber the new holder's state on its way out.
3823        let flush_res = self.inner.db.flush_local_only();
3824        report.flushed_wal = flush_res.is_ok();
3825        report.final_checkpoint = flush_res.is_ok();
3826        if let Err(err) = &flush_res {
3827            tracing::error!(
3828                target: "reddb::lifecycle",
3829                error = %err,
3830                "graceful_shutdown: local flush failed"
3831            );
3832        } else if let Err(lease_err) =
3833            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
3834        {
3835            tracing::warn!(
3836                target: "reddb::serverless::lease",
3837                error = %lease_err,
3838                "graceful_shutdown: remote upload skipped — lease not held"
3839            );
3840        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
3841            tracing::error!(
3842                target: "reddb::lifecycle",
3843                error = %err,
3844                "graceful_shutdown: remote upload failed"
3845            );
3846        }
3847
3848        // Optional final backup. Skipped silently when no remote
3849        // backend is configured — `trigger_backup()` returns Err
3850        // anyway in that case, but logging it as a shutdown failure
3851        // would be misleading on a standalone (no-backend) runtime.
3852        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
3853            // The trigger_backup gate now reads `WriteKind::Backup`,
3854            // which a replica/read_only instance refuses. That's
3855            // intentional — replicas don't drive backups; only the
3856            // primary does. We still want shutdown to flush its WAL
3857            // even if the backup branch is gated off.
3858            match self.trigger_backup() {
3859                Ok(result) => {
3860                    report.backup_uploaded = result.uploaded;
3861                }
3862                Err(err) => {
3863                    tracing::warn!(
3864                        target: "reddb::lifecycle",
3865                        error = %err,
3866                        "graceful_shutdown: final backup skipped"
3867                    );
3868                }
3869            }
3870        }
3871
3872        let completed_ms = std::time::SystemTime::now()
3873            .duration_since(std::time::UNIX_EPOCH)
3874            .map(|d| d.as_millis() as u64)
3875            .unwrap_or(started_ms);
3876        report.completed_at_ms = completed_ms;
3877        report.duration_ms = completed_ms.saturating_sub(started_ms);
3878
3879        self.inner.lifecycle.finish_shutdown(report.clone());
3880        Ok(report)
3881    }
3882
3883    /// Emit a CDC record without invalidating the result cache.
3884    ///
3885    /// Used by `MutationEngine::append_batch` which calls
3886    /// `invalidate_result_cache` once for the whole batch before this
3887    /// loop, avoiding N write-lock acquisitions.
3888    pub(crate) fn cdc_emit_no_cache_invalidate(
3889        &self,
3890        operation: crate::replication::cdc::ChangeOperation,
3891        collection: &str,
3892        entity_id: u64,
3893        entity_kind: &str,
3894    ) -> u64 {
3895        let lsn = self
3896            .inner
3897            .cdc
3898            .emit(operation, collection, entity_id, entity_kind);
3899
3900        // Append to logical WAL replication buffer (if primary mode)
3901        if let Some(ref primary) = self.inner.db.replication {
3902            let store = self.inner.db.store();
3903            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3904                None
3905            } else {
3906                store.get(collection, EntityId::new(entity_id))
3907            };
3908            let record = ChangeRecord {
3909                lsn,
3910                timestamp: SystemTime::now()
3911                    .duration_since(UNIX_EPOCH)
3912                    .unwrap_or_default()
3913                    .as_millis() as u64,
3914                operation,
3915                collection: collection.to_string(),
3916                entity_id,
3917                entity_kind: entity_kind.to_string(),
3918                entity_bytes: entity
3919                    .as_ref()
3920                    .map(|e| UnifiedStore::serialize_entity(e, store.format_version())),
3921                metadata: self.latest_metadata_for(collection, entity_id),
3922                refresh_records: None,
3923            };
3924            let encoded = record.encode();
3925            primary.wal_buffer.append(record.lsn, encoded.clone());
3926            if let Some(spool) = &primary.logical_wal_spool {
3927                let _ = spool.append(record.lsn, &encoded);
3928            }
3929        }
3930        lsn
3931    }
3932
3933    pub(crate) fn cdc_emit_insert_batch_no_cache_invalidate(
3934        &self,
3935        collection: &str,
3936        ids: &[EntityId],
3937        entity_kind: &str,
3938    ) -> Vec<u64> {
3939        if ids.is_empty() {
3940            return Vec::new();
3941        }
3942
3943        // Without logical replication, CDC only needs the in-memory event
3944        // ring. Reserve all LSNs and push the batch under one mutex instead
3945        // of taking the ring lock once per inserted row.
3946        if self.inner.db.replication.is_none() {
3947            return self.inner.cdc.emit_batch_same_collection(
3948                crate::replication::cdc::ChangeOperation::Insert,
3949                collection,
3950                entity_kind,
3951                ids.iter().map(|id| id.raw()),
3952            );
3953        }
3954
3955        // Replication needs one logical-WAL record per entity with the
3956        // serialized entity bytes, so keep the existing per-row path.
3957        ids.iter()
3958            .map(|id| {
3959                self.cdc_emit_no_cache_invalidate(
3960                    crate::replication::cdc::ChangeOperation::Insert,
3961                    collection,
3962                    id.raw(),
3963                    entity_kind,
3964                )
3965            })
3966            .collect()
3967    }
3968
3969    pub fn cdc_emit(
3970        &self,
3971        operation: crate::replication::cdc::ChangeOperation,
3972        collection: &str,
3973        entity_id: u64,
3974        entity_kind: &str,
3975    ) -> u64 {
3976        let lsn = self
3977            .inner
3978            .cdc
3979            .emit(operation, collection, entity_id, entity_kind);
3980        // Perf: prior to this we called `invalidate_result_cache()`
3981        // which wipes EVERY cached query, across every table, under
3982        // a write lock — turning each INSERT into a serialisation
3983        // point for all readers. Swap to the per-table variant so
3984        // unrelated query caches survive.
3985        self.invalidate_result_cache_for_table(collection);
3986
3987        // Append to logical WAL replication buffer (if primary mode)
3988        if let Some(ref primary) = self.inner.db.replication {
3989            let store = self.inner.db.store();
3990            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
3991                None
3992            } else {
3993                store.get(collection, EntityId::new(entity_id))
3994            };
3995            let record = ChangeRecord {
3996                lsn,
3997                timestamp: SystemTime::now()
3998                    .duration_since(UNIX_EPOCH)
3999                    .unwrap_or_default()
4000                    .as_millis() as u64,
4001                operation,
4002                collection: collection.to_string(),
4003                entity_id,
4004                entity_kind: entity_kind.to_string(),
4005                entity_bytes: entity
4006                    .as_ref()
4007                    .map(|entity| UnifiedStore::serialize_entity(entity, store.format_version())),
4008                metadata: self.latest_metadata_for(collection, entity_id),
4009                refresh_records: None,
4010            };
4011            let encoded = record.encode();
4012            primary.wal_buffer.append(record.lsn, encoded.clone());
4013            if let Some(spool) = &primary.logical_wal_spool {
4014                let _ = spool.append(record.lsn, &encoded);
4015            }
4016        }
4017        lsn
4018    }
4019
4020    pub(crate) fn cdc_emit_kv(
4021        &self,
4022        operation: crate::replication::cdc::ChangeOperation,
4023        collection: &str,
4024        key: &str,
4025        entity_id: u64,
4026        before: Option<crate::json::Value>,
4027        after: Option<crate::json::Value>,
4028    ) -> u64 {
4029        let lsn = self
4030            .inner
4031            .cdc
4032            .emit_kv(operation, collection, key, entity_id, before, after);
4033        self.inner.kv_stats.incr_watch_events_emitted();
4034        self.invalidate_result_cache_for_table(collection);
4035        lsn
4036    }
4037
4038    pub(crate) fn record_kv_watch_event(
4039        &self,
4040        operation: crate::replication::cdc::ChangeOperation,
4041        collection: &str,
4042        key: &str,
4043        entity_id: u64,
4044        before: Option<crate::json::Value>,
4045        after: Option<crate::json::Value>,
4046    ) {
4047        if self.current_xid().is_some() {
4048            let conn_id = current_connection_id();
4049            let event = crate::replication::cdc::KvWatchEvent {
4050                collection: collection.to_string(),
4051                key: key.to_string(),
4052                op: operation,
4053                before,
4054                after,
4055                lsn: 0,
4056                committed_at: 0,
4057                dropped_event_count: 0,
4058            };
4059            self.inner
4060                .pending_kv_watch_events
4061                .write()
4062                .entry(conn_id)
4063                .or_default()
4064                .push(event);
4065            return;
4066        }
4067
4068        self.cdc_emit_kv(operation, collection, key, entity_id, before, after);
4069    }
4070
4071    pub(crate) fn cdc_emit_prebuilt(
4072        &self,
4073        operation: crate::replication::cdc::ChangeOperation,
4074        collection: &str,
4075        entity: &UnifiedEntity,
4076        entity_kind: &str,
4077        metadata: Option<&crate::storage::Metadata>,
4078        invalidate_cache: bool,
4079    ) -> u64 {
4080        self.cdc_emit_prebuilt_with_columns(
4081            operation,
4082            collection,
4083            entity,
4084            entity_kind,
4085            metadata,
4086            invalidate_cache,
4087            None,
4088        )
4089    }
4090
4091    /// `cdc_emit_prebuilt` plus the list of column names whose values
4092    /// changed on this update. Callers that have already computed a
4093    /// `RowDamageVector` pass it here so downstream CDC consumers can
4094    /// filter events by touched column without re-diffing.
4095    /// `changed_columns` is only meaningful for `Update` operations —
4096    /// insert and delete events ignore it.
4097    pub(crate) fn cdc_emit_prebuilt_with_columns(
4098        &self,
4099        operation: crate::replication::cdc::ChangeOperation,
4100        collection: &str,
4101        entity: &UnifiedEntity,
4102        entity_kind: &str,
4103        metadata: Option<&crate::storage::Metadata>,
4104        invalidate_cache: bool,
4105        changed_columns: Option<Vec<String>>,
4106    ) -> u64 {
4107        if invalidate_cache {
4108            self.invalidate_result_cache();
4109        }
4110
4111        let public_id = entity.logical_id().raw();
4112        let lsn = self.inner.cdc.emit_with_columns(
4113            operation,
4114            collection,
4115            public_id,
4116            entity_kind,
4117            changed_columns,
4118        );
4119
4120        if let Some(ref primary) = self.inner.db.replication {
4121            let store = self.inner.db.store();
4122            let record = ChangeRecord {
4123                lsn,
4124                timestamp: SystemTime::now()
4125                    .duration_since(UNIX_EPOCH)
4126                    .unwrap_or_default()
4127                    .as_millis() as u64,
4128                operation,
4129                collection: collection.to_string(),
4130                entity_id: entity.id.raw(),
4131                entity_kind: entity_kind.to_string(),
4132                entity_bytes: Some(UnifiedStore::serialize_entity(
4133                    entity,
4134                    store.format_version(),
4135                )),
4136                metadata: metadata
4137                    .map(metadata_to_json)
4138                    .or_else(|| self.latest_metadata_for(collection, entity.id.raw())),
4139                refresh_records: None,
4140            };
4141            let encoded = record.encode();
4142            primary.wal_buffer.append(record.lsn, encoded.clone());
4143            if let Some(spool) = &primary.logical_wal_spool {
4144                let _ = spool.append(record.lsn, &encoded);
4145            }
4146        }
4147
4148        lsn
4149    }
4150
4151    pub(crate) fn cdc_emit_prebuilt_batch<'a, I>(
4152        &self,
4153        operation: crate::replication::cdc::ChangeOperation,
4154        entity_kind: &str,
4155        items: I,
4156        invalidate_cache: bool,
4157    ) where
4158        I: IntoIterator<
4159            Item = (
4160                &'a str,
4161                &'a UnifiedEntity,
4162                Option<&'a crate::storage::Metadata>,
4163            ),
4164        >,
4165    {
4166        let items: Vec<(&str, &UnifiedEntity, Option<&crate::storage::Metadata>)> =
4167            items.into_iter().collect();
4168        if items.is_empty() {
4169            return;
4170        }
4171
4172        if invalidate_cache {
4173            self.invalidate_result_cache();
4174        }
4175
4176        for (collection, entity, metadata) in items {
4177            self.cdc_emit_prebuilt(operation, collection, entity, entity_kind, metadata, false);
4178        }
4179    }
4180
4181    fn run_replica_loop(&self, primary_addr: String) {
4182        let endpoint = if primary_addr.starts_with("http") {
4183            primary_addr
4184        } else {
4185            format!("http://{primary_addr}")
4186        };
4187        let poll_ms = self.inner.db.options().replication.poll_interval_ms;
4188        let max_count = self.inner.db.options().replication.max_batch_size;
4189        let mut since_lsn = self.config_u64("red.replication.last_applied_lsn", 0);
4190
4191        let runtime = match tokio::runtime::Builder::new_current_thread()
4192            .enable_all()
4193            .build()
4194        {
4195            Ok(runtime) => runtime,
4196            Err(_) => return,
4197        };
4198
4199        runtime.block_on(async move {
4200            use crate::grpc::proto::red_db_client::RedDbClient;
4201            use crate::grpc::proto::JsonPayloadRequest;
4202
4203            let mut client = loop {
4204                match RedDbClient::connect(endpoint.clone()).await {
4205                    Ok(client) => {
4206                        self.persist_replication_health("connecting", "", None, None);
4207                        break client;
4208                    }
4209                    Err(_) => {
4210                        self.persist_replication_health(
4211                            "connecting",
4212                            "waiting for primary connection",
4213                            None,
4214                            None,
4215                        );
4216                        std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)))
4217                    }
4218                }
4219            };
4220
4221            // PLAN.md Phase 11.5 — stateful applier guards LSN
4222            // monotonicity across pulls. Seed with the persisted
4223            // `last_applied_lsn` so reboots don't lose the chain
4224            // pointer.
4225            let applier = crate::replication::logical::LogicalChangeApplier::new(since_lsn);
4226
4227            loop {
4228                let payload = crate::json!({
4229                    "since_lsn": since_lsn,
4230                    "max_count": max_count
4231                });
4232                let request = tonic::Request::new(JsonPayloadRequest {
4233                    payload_json: crate::json::to_string(&payload)
4234                        .unwrap_or_else(|_| "{}".to_string()),
4235                });
4236
4237                if let Ok(response) = client.pull_wal_records(request).await {
4238                    if let Ok(value) =
4239                        crate::json::from_str::<crate::json::Value>(&response.into_inner().payload)
4240                    {
4241                        let current_lsn =
4242                            value.get("current_lsn").and_then(crate::json::Value::as_u64);
4243                        let oldest_available_lsn = value
4244                            .get("oldest_available_lsn")
4245                            .and_then(crate::json::Value::as_u64);
4246                        if since_lsn > 0
4247                            && oldest_available_lsn
4248                                .map(|oldest| oldest > since_lsn.saturating_add(1))
4249                                .unwrap_or(false)
4250                        {
4251                            self.persist_replication_health(
4252                                "stalled_gap",
4253                                "replica is behind the oldest logical WAL available on primary; re-bootstrap required",
4254                                current_lsn,
4255                                oldest_available_lsn,
4256                            );
4257                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
4258                            continue;
4259                        }
4260                        if let Some(records) =
4261                            value.get("records").and_then(crate::json::Value::as_array)
4262                        {
4263                            for record in records {
4264                                let Some(data_hex) =
4265                                    record.get("data").and_then(crate::json::Value::as_str)
4266                                else {
4267                                    continue;
4268                                };
4269                                let Ok(data) = hex::decode(data_hex) else {
4270                                    self.inner.replica_apply_metrics.record(
4271                                        crate::replication::logical::ApplyErrorKind::Decode,
4272                                    );
4273                                    self.persist_replication_health(
4274                                        "apply_error",
4275                                        "failed to decode WAL record hex payload",
4276                                        current_lsn,
4277                                        oldest_available_lsn,
4278                                    );
4279                                    continue;
4280                                };
4281                                let Ok(change) = ChangeRecord::decode(&data) else {
4282                                    self.inner.replica_apply_metrics.record(
4283                                        crate::replication::logical::ApplyErrorKind::Decode,
4284                                    );
4285                                    self.persist_replication_health(
4286                                        "apply_error",
4287                                        "failed to decode logical WAL record",
4288                                        current_lsn,
4289                                        oldest_available_lsn,
4290                                    );
4291                                    continue;
4292                                };
4293                                match applier.apply(
4294                                    self.inner.db.as_ref(),
4295                                    &change,
4296                                    ApplyMode::Replica,
4297                                ) {
4298                                    Ok(crate::replication::logical::ApplyOutcome::Applied) => {
4299                                        self.invalidate_result_cache_for_table(&change.collection);
4300                                        since_lsn = since_lsn.max(change.lsn);
4301                                        self.persist_replica_lsn(since_lsn);
4302                                    }
4303                                    Ok(_) => {
4304                                        // Idempotent / Skipped: no advance, no error.
4305                                    }
4306                                    Err(err) => {
4307                                        self.inner.replica_apply_metrics.record(err.kind());
4308                                        // Issue #205 — emit operator-grade event
4309                                        // for the two replication-fatal kinds. `Gap`
4310                                        // / `Apply` / `Decode` already persist via
4311                                        // `persist_replication_health`; the
4312                                        // OperatorEvent variants only cover the
4313                                        // two "stream is broken" / "follower
4314                                        // diverged" conditions an operator must act
4315                                        // on out-of-band.
4316                                        match &err {
4317                                            crate::replication::logical::LogicalApplyError::Divergence { lsn, expected: _, got: _ } => {
4318                                                crate::telemetry::operator_event::OperatorEvent::Divergence {
4319                                                    peer: "primary".to_string(),
4320                                                    leader_lsn: *lsn,
4321                                                    follower_lsn: since_lsn,
4322                                                }
4323                                                .emit_global();
4324                                            }
4325                                            crate::replication::logical::LogicalApplyError::Gap { last, next } => {
4326                                                crate::telemetry::operator_event::OperatorEvent::ReplicationBroken {
4327                                                    peer: "primary".to_string(),
4328                                                    reason: format!("stalled gap last={last} next={next}"),
4329                                                }
4330                                                .emit_global();
4331                                            }
4332                                            _ => {}
4333                                        }
4334                                        let kind = match &err {
4335                                            crate::replication::logical::LogicalApplyError::Gap { .. } => "stalled_gap",
4336                                            crate::replication::logical::LogicalApplyError::Divergence { .. } => "divergence",
4337                                            _ => "apply_error",
4338                                        };
4339                                        self.persist_replication_health(
4340                                            kind,
4341                                            &format!("replica apply rejected: {err}"),
4342                                            current_lsn,
4343                                            oldest_available_lsn,
4344                                        );
4345                                        // Stop applying this batch. The
4346                                        // outer loop will retry on next
4347                                        // pull, which on a real Gap will
4348                                        // not magically heal — operator
4349                                        // must rebootstrap. For
4350                                        // Divergence, we explicitly do
4351                                        // not advance; this keeps the
4352                                        // replica visibly unhealthy
4353                                        // instead of silently swallowing
4354                                        // corruption.
4355                                        break;
4356                                    }
4357                                }
4358                            }
4359                        }
4360                        self.persist_replication_health(
4361                            "healthy",
4362                            "",
4363                            current_lsn,
4364                            oldest_available_lsn,
4365                        );
4366                    } else {
4367                        self.persist_replication_health(
4368                            "apply_error",
4369                            "failed to parse pull_wal_records response",
4370                            None,
4371                            None,
4372                        );
4373                    }
4374                } else {
4375                    self.persist_replication_health(
4376                        "connecting",
4377                        "primary pull_wal_records request failed",
4378                        None,
4379                        None,
4380                    );
4381                }
4382
4383                std::thread::sleep(std::time::Duration::from_millis(poll_ms));
4384            }
4385        });
4386    }
4387
4388    /// Poll CDC events since a given LSN.
4389    pub fn cdc_poll(
4390        &self,
4391        since_lsn: u64,
4392        max_count: usize,
4393    ) -> Vec<crate::replication::cdc::ChangeEvent> {
4394        self.inner.cdc.poll(since_lsn, max_count)
4395    }
4396
4397    /// PLAN.md Phase 11.4 — current CDC LSN. Public mutation
4398    /// surfaces (HTTP query, gRPC entity ops) call this immediately
4399    /// after a successful write to feed `enforce_commit_policy`.
4400    pub fn cdc_current_lsn(&self) -> u64 {
4401        self.inner.cdc.current_lsn()
4402    }
4403
4404    pub fn kv_watch_events_since(
4405        &self,
4406        collection: &str,
4407        key: &str,
4408        since_lsn: u64,
4409        max_count: usize,
4410    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
4411        self.inner
4412            .cdc
4413            .poll(since_lsn, max_count)
4414            .into_iter()
4415            .filter_map(|event| event.kv)
4416            .filter(|event| event.collection == collection && event.key == key)
4417            .collect()
4418    }
4419
4420    pub fn kv_watch_events_since_prefix(
4421        &self,
4422        collection: &str,
4423        prefix: &str,
4424        since_lsn: u64,
4425        max_count: usize,
4426    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
4427        self.inner
4428            .cdc
4429            .poll(since_lsn, max_count)
4430            .into_iter()
4431            .filter_map(|event| event.kv)
4432            .filter(|event| event.collection == collection && event.key.starts_with(prefix))
4433            .collect()
4434    }
4435
4436    pub(crate) fn kv_watch_subscribe<'a>(
4437        &'a self,
4438        collection: impl Into<String>,
4439        key: impl Into<String>,
4440        from_lsn: Option<u64>,
4441    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
4442        crate::runtime::kv_watch::KvWatchStream::subscribe(
4443            &self.inner.cdc,
4444            &self.inner.kv_stats,
4445            collection,
4446            key,
4447            from_lsn,
4448            self.kv_watch_idle_timeout_ms(),
4449        )
4450    }
4451
4452    pub(crate) fn kv_watch_subscribe_prefix<'a>(
4453        &'a self,
4454        collection: impl Into<String>,
4455        prefix: impl Into<String>,
4456        from_lsn: Option<u64>,
4457    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
4458        crate::runtime::kv_watch::KvWatchStream::subscribe_prefix(
4459            &self.inner.cdc,
4460            &self.inner.kv_stats,
4461            collection,
4462            prefix,
4463            from_lsn,
4464            self.kv_watch_idle_timeout_ms(),
4465        )
4466    }
4467
4468    pub(crate) fn kv_watch_idle_timeout_ms(&self) -> u64 {
4469        self.config_u64("red.config.kv.watch.idle_timeout_ms", 60_000)
4470    }
4471
4472    /// Get backup scheduler status.
4473    pub fn backup_status(&self) -> crate::replication::scheduler::BackupStatus {
4474        self.inner.backup_scheduler.status()
4475    }
4476
4477    /// Borrow the runtime's result Blob Cache.
4478    ///
4479    /// Wired for the `/admin/blob_cache/sweep` and
4480    /// `/admin/blob_cache/flush_namespace` HTTP handlers (issue #148
4481    /// follow-up): both delegate to
4482    /// `crate::storage::cache::sweeper::BlobCacheSweeper`, which takes a
4483    /// `&BlobCache`. Also used by `trigger_backup` when
4484    /// `red.config.backup.include_blob_cache=true` to locate the L2
4485    /// directory for archival.
4486    pub fn result_blob_cache(&self) -> &crate::storage::cache::BlobCache {
4487        &self.inner.result_blob_cache
4488    }
4489
4490    /// PLAN.md Phase 11.4 — owned snapshot of every registered
4491    /// replica's state on this primary. Returns empty vec on
4492    /// non-primary instances or when no replicas are registered yet.
4493    pub fn primary_replica_snapshots(&self) -> Vec<crate::replication::primary::ReplicaState> {
4494        self.inner
4495            .db
4496            .replication
4497            .as_ref()
4498            .map(|repl| repl.replica_snapshots())
4499            .unwrap_or_default()
4500    }
4501
4502    /// PLAN.md Phase 11.4 — active commit policy. Reads
4503    /// `RED_PRIMARY_COMMIT_POLICY` once at runtime construction;
4504    /// future env reloads will need a reload endpoint. Default is
4505    /// `Local` — current behavior, no replica blocking.
4506    pub fn commit_policy(&self) -> crate::replication::CommitPolicy {
4507        crate::replication::CommitPolicy::from_env()
4508    }
4509
4510    /// PLAN.md Phase 11.5 — accessor for replica-side apply error
4511    /// counters (gap / divergence / apply / decode). Returned
4512    /// snapshot is consistent across the four counters; the labels
4513    /// match `reddb_replica_apply_errors_total{kind}`.
4514    pub fn replica_apply_error_counts(
4515        &self,
4516    ) -> [(crate::replication::logical::ApplyErrorKind, u64); 4] {
4517        self.inner.replica_apply_metrics.snapshot()
4518    }
4519
4520    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
4521    /// returned; `is_configured()` lets callers short-circuit.
4522    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
4523        &self.inner.quota_bucket
4524    }
4525
4526    /// PLAN.md Phase 11.4 — observability snapshot of every
4527    /// replica's durable LSN as known to the commit waiter. Empty
4528    /// vec on non-primary instances or when no replica has acked.
4529    pub fn commit_waiter_snapshot(&self) -> Vec<(String, u64)> {
4530        self.inner
4531            .db
4532            .replication
4533            .as_ref()
4534            .map(|repl| repl.commit_waiter.snapshot())
4535            .unwrap_or_default()
4536    }
4537
4538    /// PLAN.md Phase 11.4 — `(reached, timed_out, not_required, last_micros)`
4539    /// counters for /metrics. Always-zero on non-primary instances.
4540    pub fn commit_waiter_metrics_snapshot(&self) -> (u64, u64, u64, u64) {
4541        self.inner
4542            .db
4543            .replication
4544            .as_ref()
4545            .map(|repl| repl.commit_waiter.metrics_snapshot())
4546            .unwrap_or((0, 0, 0, 0))
4547    }
4548
4549    /// PLAN.md Phase 11.4 — block until at least `count` replicas
4550    /// have durably applied through `target_lsn`, or `timeout`
4551    /// elapses. Returns the `AwaitOutcome` so the caller can decide
4552    /// whether to surface a timeout error to the client or continue
4553    /// (the policy mapping lives in the commit dispatcher).
4554    ///
4555    /// Foundation only — the write commit path doesn't yet call
4556    /// this. Wiring it is a per-surface task gated on the operator
4557    /// flipping `RED_PRIMARY_COMMIT_POLICY` away from `local`.
4558    pub fn await_replica_acks(
4559        &self,
4560        target_lsn: u64,
4561        count: u32,
4562        timeout: std::time::Duration,
4563    ) -> crate::replication::AwaitOutcome {
4564        match &self.inner.db.replication {
4565            Some(repl) => repl.commit_waiter.await_acks(target_lsn, count, timeout),
4566            None => {
4567                // No replication configured: policy must be `Local`.
4568                // Treat as immediate `NotRequired` so callers don't
4569                // block on a degenerate setup.
4570                crate::replication::AwaitOutcome::NotRequired
4571            }
4572        }
4573    }
4574
4575    /// PLAN.md Phase 11.4 — enforce the configured commit policy
4576    /// against `post_lsn` (the LSN of the just-completed write).
4577    /// Returns `Ok(AwaitOutcome)` on every successful enforcement
4578    /// (including `Reached` and `TimedOut` when fail-on-timeout is
4579    /// off). Returns `Err(ReadOnly)` only when:
4580    ///   * policy is `AckN(n)` with `n > 0`
4581    ///   * the wait timed out
4582    ///   * `RED_COMMIT_FAIL_ON_TIMEOUT=true` is set
4583    ///
4584    /// The HTTP / gRPC / wire surfaces map the error to 504 / wire
4585    /// backoff. Default behaviour (env unset) logs warn and returns
4586    /// success — matches PLAN.md "default v1 stays local" semantics
4587    /// while still letting the operator opt into hard-blocking.
4588    pub fn enforce_commit_policy(
4589        &self,
4590        post_lsn: u64,
4591    ) -> RedDBResult<crate::replication::AwaitOutcome> {
4592        let n = match self.commit_policy() {
4593            crate::replication::CommitPolicy::AckN(n) if n > 0 => n,
4594            _ => return Ok(crate::replication::AwaitOutcome::NotRequired),
4595        };
4596        let timeout_ms = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
4597            .ok()
4598            .and_then(|v| v.parse::<u64>().ok())
4599            .unwrap_or(5_000);
4600        let outcome =
4601            self.await_replica_acks(post_lsn, n, std::time::Duration::from_millis(timeout_ms));
4602        {
4603            use crate::runtime::control_events::{EventKind, Outcome, Sensitivity};
4604            let (event_outcome, fields) = match &outcome {
4605                crate::replication::AwaitOutcome::Reached(count) => (
4606                    Outcome::Allowed,
4607                    vec![
4608                        (
4609                            "post_lsn".to_string(),
4610                            Sensitivity::raw(post_lsn.to_string()),
4611                        ),
4612                        ("required".to_string(), Sensitivity::raw(n.to_string())),
4613                        ("observed".to_string(), Sensitivity::raw(count.to_string())),
4614                        (
4615                            "timeout_ms".to_string(),
4616                            Sensitivity::raw(timeout_ms.to_string()),
4617                        ),
4618                    ],
4619                ),
4620                crate::replication::AwaitOutcome::TimedOut { observed, required } => (
4621                    Outcome::Error,
4622                    vec![
4623                        (
4624                            "post_lsn".to_string(),
4625                            Sensitivity::raw(post_lsn.to_string()),
4626                        ),
4627                        (
4628                            "required".to_string(),
4629                            Sensitivity::raw(required.to_string()),
4630                        ),
4631                        (
4632                            "observed".to_string(),
4633                            Sensitivity::raw(observed.to_string()),
4634                        ),
4635                        (
4636                            "timeout_ms".to_string(),
4637                            Sensitivity::raw(timeout_ms.to_string()),
4638                        ),
4639                    ],
4640                ),
4641                crate::replication::AwaitOutcome::NotRequired => (Outcome::Allowed, Vec::new()),
4642            };
4643            if !fields.is_empty() {
4644                self.emit_control_event(
4645                    EventKind::ReplicationSafety,
4646                    event_outcome,
4647                    "replication_commit_policy",
4648                    Some(format!("replication:lsn:{post_lsn}")),
4649                    None,
4650                    fields,
4651                )?;
4652            }
4653        }
4654        if let crate::replication::AwaitOutcome::TimedOut { observed, required } = &outcome {
4655            tracing::warn!(
4656                target: "reddb::commit",
4657                post_lsn,
4658                observed = *observed,
4659                required = *required,
4660                timeout_ms,
4661                "ack_n: timed out waiting for replicas"
4662            );
4663            let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
4664                .ok()
4665                .map(|v| {
4666                    let t = v.trim();
4667                    t.eq_ignore_ascii_case("true") || t == "1" || t.eq_ignore_ascii_case("yes")
4668                })
4669                .unwrap_or(false);
4670            if fail {
4671                return Err(RedDBError::ReadOnly(format!(
4672                    "commit policy timed out at lsn {post_lsn}: observed={observed} required={required} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
4673                )));
4674            }
4675        }
4676        Ok(outcome)
4677    }
4678
4679    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
4680    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
4681    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
4682    /// when the operator set the env but it doesn't parse, and
4683    /// `("disabled", None)` when no key is configured. The pager
4684    /// hookup is deferred — this accessor surfaces the operator's
4685    /// intent for /admin/status without yet using the key in writes.
4686    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
4687        match crate::crypto::page_encryption::key_from_env() {
4688            Ok(Some(_)) => ("enabled", None),
4689            Ok(None) => ("disabled", None),
4690            Err(err) => ("error", Some(err)),
4691        }
4692    }
4693
4694    /// PLAN.md Phase 11.5 — current replica apply health label
4695    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
4696    /// `stalled_gap`). Read from the persisted `red.replication.state`
4697    /// config key updated by the replica loop. Returns `None` on
4698    /// non-replica instances or when no apply has run yet.
4699    pub fn replica_apply_health(&self) -> Option<String> {
4700        let state = self.config_string("red.replication.state", "");
4701        if state.is_empty() {
4702            None
4703        } else {
4704            Some(state)
4705        }
4706    }
4707
4708    /// Current local LSN paired with the LSN of the most recently
4709    /// archived WAL segment. The difference is the replication /
4710    /// archive lag operators alert on (PLAN.md Phase 5.1). Returns
4711    /// `(0, 0)` when neither replication nor archiving is configured.
4712    pub fn wal_archive_progress(&self) -> (u64, u64) {
4713        let current_lsn = self
4714            .inner
4715            .db
4716            .replication
4717            .as_ref()
4718            .map(|repl| {
4719                repl.logical_wal_spool
4720                    .as_ref()
4721                    .map(|spool| spool.current_lsn())
4722                    .unwrap_or_else(|| repl.wal_buffer.current_lsn())
4723            })
4724            .unwrap_or_else(|| self.inner.cdc.current_lsn());
4725        let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
4726        (current_lsn, last_archived_lsn)
4727    }
4728
4729    /// Trigger an immediate backup.
4730    pub fn trigger_backup(&self) -> RedDBResult<crate::replication::scheduler::BackupResult> {
4731        let result = (|| {
4732            self.check_write(crate::runtime::write_gate::WriteKind::Backup)?;
4733            // Defense in depth — check_write above already rejects when
4734            // the lease is NotHeld, but log + audit the lease angle here
4735            // explicitly so dashboards distinguish "lease lost" from a
4736            // generic read-only refusal.
4737            self.assert_remote_write_allowed("admin/backup")?;
4738            let started = std::time::Instant::now();
4739            let snapshot = self.create_snapshot()?;
4740            let mut uploaded = false;
4741
4742            if let (Some(backend), Some(path)) =
4743                (&self.inner.db.remote_backend, self.inner.db.path())
4744            {
4745                let default_snapshot_prefix = self.inner.db.options().default_snapshot_prefix();
4746                let default_wal_prefix = self.inner.db.options().default_wal_archive_prefix();
4747                let default_head_key = self.inner.db.options().default_backup_head_key();
4748                let snapshot_prefix = self.config_string(
4749                    "red.config.backup.snapshot_prefix",
4750                    &default_snapshot_prefix,
4751                );
4752                let wal_prefix =
4753                    self.config_string("red.config.wal.archive.prefix", &default_wal_prefix);
4754                let head_key = self.config_string("red.config.backup.head_key", &default_head_key);
4755                let timeline_id = self.config_string("red.config.timeline.id", "main");
4756                let snapshot_key = crate::storage::wal::archive_snapshot(
4757                    backend.as_ref(),
4758                    path,
4759                    snapshot.snapshot_id,
4760                    &snapshot_prefix,
4761                )
4762                .map_err(|err| RedDBError::Internal(err.to_string()))?;
4763                let current_lsn = self
4764                    .inner
4765                    .db
4766                    .replication
4767                    .as_ref()
4768                    .map(|repl| {
4769                        repl.logical_wal_spool
4770                            .as_ref()
4771                            .map(|spool| spool.current_lsn())
4772                            .unwrap_or_else(|| repl.wal_buffer.current_lsn())
4773                    })
4774                    .unwrap_or_else(|| self.inner.cdc.current_lsn());
4775                let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
4776                // Hash the local snapshot bytes so the manifest can carry
4777                // the digest for restore-side verification (PLAN.md
4778                // Phase 4). Failure to hash is non-fatal — we still
4779                // publish the manifest, just without a checksum, so a
4780                // future fix can backfill rather than losing the backup.
4781                let snapshot_sha256 =
4782                    crate::storage::wal::SnapshotManifest::compute_snapshot_sha256(path)
4783                        .map_err(|err| {
4784                            tracing::warn!(
4785                                target: "reddb::backup",
4786                                error = %err,
4787                                snapshot_id = snapshot.snapshot_id,
4788                                "snapshot hash failed; manifest will lack checksum"
4789                            );
4790                        })
4791                        .ok();
4792                let manifest = crate::storage::wal::SnapshotManifest {
4793                    timeline_id: timeline_id.clone(),
4794                    snapshot_key: snapshot_key.clone(),
4795                    snapshot_id: snapshot.snapshot_id,
4796                    snapshot_time: snapshot.created_at_unix_ms as u64,
4797                    base_lsn: current_lsn,
4798                    schema_version: crate::api::REDDB_FORMAT_VERSION,
4799                    format_version: crate::api::REDDB_FORMAT_VERSION,
4800                    snapshot_sha256,
4801                };
4802                crate::storage::wal::publish_snapshot_manifest(backend.as_ref(), &manifest)
4803                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
4804
4805                // PLAN.md Phase 11.3 — read the head of the WAL hash chain
4806                // so the new segment can link back. `None` means we're
4807                // starting a fresh timeline (after a clean restore or on
4808                // first archive ever); the segment's `prev_hash` will be
4809                // `None` and restore-side validation accepts that only for
4810                // the first segment in `plan.wal_segments`.
4811                let prev_segment_hash =
4812                    self.config_string("red.config.timeline.last_segment_hash", "");
4813                let prev_hash_arg = if prev_segment_hash.is_empty() {
4814                    None
4815                } else {
4816                    Some(prev_segment_hash)
4817                };
4818
4819                let archived_lsn = if let Some(primary) = &self.inner.db.replication {
4820                    let oldest = primary
4821                        .logical_wal_spool
4822                        .as_ref()
4823                        .and_then(|spool| spool.oldest_lsn().ok().flatten())
4824                        .or_else(|| primary.wal_buffer.oldest_lsn())
4825                        .unwrap_or(last_archived_lsn);
4826                    if last_archived_lsn > 0 && last_archived_lsn < oldest.saturating_sub(1) {
4827                        return Err(RedDBError::Internal(format!(
4828                        "logical WAL gap detected: last_archived_lsn={last_archived_lsn}, oldest_available_lsn={oldest}"
4829                    )));
4830                    }
4831                    let records = if let Some(spool) = &primary.logical_wal_spool {
4832                        spool
4833                            .read_since(last_archived_lsn, usize::MAX)
4834                            .map_err(|err| RedDBError::Internal(err.to_string()))?
4835                    } else {
4836                        primary.wal_buffer.read_since(last_archived_lsn, usize::MAX)
4837                    };
4838                    if let Some(meta) = crate::storage::wal::archive_change_records(
4839                        backend.as_ref(),
4840                        &wal_prefix,
4841                        &records,
4842                        prev_hash_arg,
4843                    )
4844                    .map_err(|err| RedDBError::Internal(err.to_string()))?
4845                    {
4846                        if let Some(spool) = &primary.logical_wal_spool {
4847                            let _ = spool.prune_through(meta.lsn_end);
4848                        }
4849                        // Advance the chain head so the next archive call
4850                        // links to this segment's hash. If the segment has
4851                        // no sha256 (legacy / hashing failed) we leave the
4852                        // head as-is — the next segment then carries the
4853                        // prior chain head, preserving continuity.
4854                        if let Some(sha) = &meta.sha256 {
4855                            self.inner.db.store().set_config_tree(
4856                                "red.config.timeline",
4857                                &crate::json!({ "last_segment_hash": sha }),
4858                            );
4859                        }
4860                        meta.lsn_end
4861                    } else {
4862                        last_archived_lsn
4863                    }
4864                } else {
4865                    last_archived_lsn
4866                };
4867
4868                let head = crate::storage::wal::BackupHead {
4869                    timeline_id,
4870                    snapshot_key,
4871                    snapshot_id: snapshot.snapshot_id,
4872                    snapshot_time: snapshot.created_at_unix_ms as u64,
4873                    current_lsn,
4874                    last_archived_lsn: archived_lsn,
4875                    wal_prefix,
4876                };
4877                crate::storage::wal::publish_backup_head(backend.as_ref(), &head_key, &head)
4878                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
4879                self.inner.db.store().set_config_tree(
4880                    "red.config.timeline",
4881                    &crate::json!({
4882                        "last_archived_lsn": archived_lsn,
4883                        "id": head.timeline_id
4884                    }),
4885                );
4886
4887                // PLAN.md Phase 2.4 — refresh the unified `MANIFEST.json`
4888                // at the prefix root so external tooling sees a single
4889                // catalog of every snapshot + WAL segment with their
4890                // checksums. Best-effort: a manifest publish failure
4891                // doesn't fail the backup (the per-artifact sidecars
4892                // already give restore-side integrity), but it does log
4893                // so dashboards can flag stale catalogs.
4894                if let Err(err) = crate::storage::wal::publish_unified_manifest_for_prefix(
4895                    backend.as_ref(),
4896                    &snapshot_prefix,
4897                ) {
4898                    tracing::warn!(
4899                        target: "reddb::backup",
4900                        error = %err,
4901                        snapshot_prefix = %snapshot_prefix,
4902                        "unified MANIFEST.json refresh failed; per-artifact sidecars unaffected"
4903                    );
4904                }
4905
4906                // PLAN.md Phase 11.4 — when the operator picked a
4907                // commit policy that demands replica durability, block
4908                // until the configured count of replicas has acked the
4909                // archived LSN (or the timeout fires). For backup the
4910                // policy decides the *DR posture* — `local` returns
4911                // immediately, `ack_n` ensures at least N replicas saw
4912                // the new tail before we report success to the
4913                // operator. A `TimedOut` is logged but does NOT fail
4914                // the backup: the local WAL + remote upload are durable
4915                // regardless; the missing acks are reported via
4916                // /metrics and /admin/status so the operator can decide.
4917                match self.commit_policy() {
4918                    crate::replication::CommitPolicy::AckN(n) if n > 0 => {
4919                        let timeout = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
4920                            .ok()
4921                            .and_then(|v| v.parse::<u64>().ok())
4922                            .unwrap_or(5_000);
4923                        let outcome = self.await_replica_acks(
4924                            archived_lsn,
4925                            n,
4926                            std::time::Duration::from_millis(timeout),
4927                        );
4928                        match outcome {
4929                            crate::replication::AwaitOutcome::Reached(count) => {
4930                                tracing::debug!(
4931                                    target: "reddb::backup",
4932                                    archived_lsn,
4933                                    n,
4934                                    count,
4935                                    "ack_n: replicas synced before backup return"
4936                                );
4937                            }
4938                            crate::replication::AwaitOutcome::TimedOut { observed, required } => {
4939                                tracing::warn!(
4940                                    target: "reddb::backup",
4941                                    archived_lsn,
4942                                    observed,
4943                                    required,
4944                                    timeout_ms = timeout,
4945                                    "ack_n: timed out waiting for replicas; backup uploaded but DR posture degraded"
4946                                );
4947                            }
4948                            crate::replication::AwaitOutcome::NotRequired => {}
4949                        }
4950                    }
4951                    _ => {} // Local / RemoteWal / Quorum: no blocking yet
4952                }
4953
4954                // Issue #148 follow-up — opt-in archive of the L2 Blob Cache
4955                // directory tree. Default off so a standard backup stays
4956                // small; flip via `red.config.backup.include_blob_cache=true`
4957                // when warm-cache restore is required (per
4958                // docs/operations/blob-cache-backup-restore.md §1).
4959                //
4960                // The L2 tree is *derived* state (ADR 0006) — its absence
4961                // never causes data loss; it only affects post-restore
4962                // p99 latency until the cache re-warms. We therefore log
4963                // (not fail) on per-file upload errors so a partial L2
4964                // upload never aborts a healthy snapshot+WAL backup.
4965                if self.config_bool("red.config.backup.include_blob_cache", false) {
4966                    let blob_cache_prefix = self.config_string(
4967                        "red.config.backup.blob_cache_prefix",
4968                        &format!("{snapshot_prefix}blob_cache/"),
4969                    );
4970                    if let Some(l2_path) = self.inner.result_blob_cache.l2_path() {
4971                        match crate::storage::cache::archive_blob_cache_l2(
4972                            backend.as_ref(),
4973                            l2_path,
4974                            &blob_cache_prefix,
4975                        ) {
4976                            Ok(count) => {
4977                                tracing::info!(
4978                                    target: "reddb::backup",
4979                                    files_uploaded = count,
4980                                    blob_cache_prefix = %blob_cache_prefix,
4981                                    "include_blob_cache: archived L2 directory"
4982                                );
4983                            }
4984                            Err(err) => {
4985                                tracing::warn!(
4986                                    target: "reddb::backup",
4987                                    error = %err,
4988                                    blob_cache_prefix = %blob_cache_prefix,
4989                                    "include_blob_cache: L2 archive failed; backup proceeding (cache is derived state)"
4990                                );
4991                            }
4992                        }
4993                    } else {
4994                        tracing::debug!(
4995                            target: "reddb::backup",
4996                            "include_blob_cache=true but no L2 path configured; nothing to archive"
4997                        );
4998                    }
4999                }
5000
5001                uploaded = true;
5002            }
5003
5004            Ok(crate::replication::scheduler::BackupResult {
5005                snapshot_id: snapshot.snapshot_id,
5006                uploaded,
5007                duration_ms: started.elapsed().as_millis() as u64,
5008                timestamp: snapshot.created_at_unix_ms as u64,
5009            })
5010        })();
5011
5012        use crate::runtime::control_events::{EventKind, Outcome, Sensitivity};
5013        let (current_lsn, last_archived_lsn) = self.wal_archive_progress();
5014        let mut fields = vec![
5015            (
5016                "current_lsn".to_string(),
5017                Sensitivity::raw(current_lsn.to_string()),
5018            ),
5019            (
5020                "last_archived_lsn".to_string(),
5021                Sensitivity::raw(last_archived_lsn.to_string()),
5022            ),
5023        ];
5024        if let Ok(backup) = &result {
5025            fields.push((
5026                "snapshot_id".to_string(),
5027                Sensitivity::raw(backup.snapshot_id.to_string()),
5028            ));
5029            fields.push((
5030                "uploaded".to_string(),
5031                Sensitivity::raw(backup.uploaded.to_string()),
5032            ));
5033            fields.push((
5034                "duration_ms".to_string(),
5035                Sensitivity::raw(backup.duration_ms.to_string()),
5036            ));
5037            fields.push((
5038                "snapshot_time".to_string(),
5039                Sensitivity::raw(backup.timestamp.to_string()),
5040            ));
5041        }
5042        let outcome = match &result {
5043            Ok(_) => Outcome::Allowed,
5044            Err(err) => control_event_outcome_for_error(err),
5045        };
5046        let reason = result.as_ref().err().map(|err| err.to_string());
5047        self.emit_control_event(
5048            EventKind::BackupRun,
5049            outcome,
5050            "backup_trigger",
5051            Some("backup:trigger".to_string()),
5052            reason,
5053            fields,
5054        )?;
5055        result
5056    }
5057
5058    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
5059        let mut pool = self
5060            .inner
5061            .pool
5062            .lock()
5063            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
5064        if pool.active >= self.inner.pool_config.max_connections {
5065            return Err(RedDBError::Internal(
5066                "connection pool exhausted".to_string(),
5067            ));
5068        }
5069
5070        let id = if let Some(id) = pool.idle.pop() {
5071            id
5072        } else {
5073            let id = pool.next_id;
5074            pool.next_id += 1;
5075            id
5076        };
5077        pool.active += 1;
5078        pool.total_checkouts += 1;
5079        drop(pool);
5080
5081        Ok(RuntimeConnection {
5082            id,
5083            inner: Arc::clone(&self.inner),
5084        })
5085    }
5086
5087    pub fn checkpoint(&self) -> RedDBResult<()> {
5088        // Local fsync always allowed — losing the lease shouldn't
5089        // prevent us from durably persisting what's already in memory.
5090        // The remote upload is the side-effect that risks clobbering a
5091        // peer's state, so it's behind the lease gate.
5092        self.inner.db.flush_local_only().map_err(|err| {
5093            // Issue #205 — local flush failure is a CheckpointFailed
5094            // operator-grade event. The local-flush path also covers
5095            // the WAL fsync we depend on, so a failure here doubles as
5096            // the WalFsyncFailed signal for the runtime entry point.
5097            let msg = err.to_string();
5098            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
5099                lsn: 0,
5100                error: msg.clone(),
5101            }
5102            .emit_global();
5103            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
5104                path: "<flush_local_only>".to_string(),
5105                error: msg.clone(),
5106            }
5107            .emit_global();
5108            RedDBError::Engine(msg)
5109        })?;
5110        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
5111            tracing::warn!(
5112                target: "reddb::serverless::lease",
5113                error = %err,
5114                "checkpoint: skipping remote upload — lease not held"
5115            );
5116            return Ok(());
5117        }
5118        self.inner
5119            .db
5120            .upload_to_remote_backend()
5121            .map_err(|err| RedDBError::Engine(err.to_string()))
5122    }
5123
5124    /// Guard remote-mutating operations on the writer lease.
5125    /// Returns `Ok(())` when no remote backend is configured (the
5126    /// lease is irrelevant) or the lease state is `NotRequired` /
5127    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
5128    /// `NotHeld`, with an audit-friendly action label so the caller
5129    /// can record the rejection.
5130    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
5131        if self.inner.db.remote_backend.is_none() {
5132            return Ok(());
5133        }
5134        match self.inner.write_gate.lease_state() {
5135            crate::runtime::write_gate::LeaseGateState::NotHeld => {
5136                self.inner.audit_log.record(
5137                    action,
5138                    "system",
5139                    "remote_backend",
5140                    "err: writer lease not held",
5141                    crate::json::Value::Null,
5142                );
5143                Err(RedDBError::ReadOnly(format!(
5144                    "writer lease not held — {action} blocked (serverless fence)"
5145                )))
5146            }
5147            _ => Ok(()),
5148        }
5149    }
5150
5151    pub fn run_maintenance(&self) -> RedDBResult<()> {
5152        self.inner
5153            .db
5154            .run_maintenance()
5155            .map_err(|err| RedDBError::Internal(err.to_string()))
5156    }
5157
5158    pub fn scan_collection(
5159        &self,
5160        collection: &str,
5161        cursor: Option<ScanCursor>,
5162        limit: usize,
5163    ) -> RedDBResult<ScanPage> {
5164        let store = self.inner.db.store();
5165        let manager = store
5166            .get_collection(collection)
5167            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
5168
5169        let mut entities = manager.query_all(|_| true);
5170        entities.sort_by_key(|entity| entity.id.raw());
5171
5172        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
5173        let total = entities.len();
5174        let end = total.min(offset.saturating_add(limit.max(1)));
5175        let items = if offset >= total {
5176            Vec::new()
5177        } else {
5178            entities[offset..end].to_vec()
5179        };
5180        let next = (end < total).then_some(ScanCursor { offset: end });
5181
5182        Ok(ScanPage {
5183            collection: collection.to_string(),
5184            items,
5185            next,
5186            total,
5187        })
5188    }
5189
5190    pub fn catalog(&self) -> CatalogModelSnapshot {
5191        self.inner.db.catalog_model_snapshot()
5192    }
5193
5194    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
5195        self.inner.db.catalog_consistency_report()
5196    }
5197
5198    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
5199        crate::catalog::attention_summary(&self.catalog())
5200    }
5201
5202    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
5203        crate::catalog::collection_attention(&self.catalog())
5204    }
5205
5206    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
5207        crate::catalog::index_attention(&self.catalog())
5208    }
5209
5210    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
5211        crate::catalog::graph_projection_attention(&self.catalog())
5212    }
5213
5214    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
5215        crate::catalog::analytics_job_attention(&self.catalog())
5216    }
5217
5218    pub fn stats(&self) -> RuntimeStats {
5219        let pool = runtime_pool_lock(self);
5220        RuntimeStats {
5221            active_connections: pool.active,
5222            idle_connections: pool.idle.len(),
5223            total_checkouts: pool.total_checkouts,
5224            paged_mode: self.inner.db.is_paged(),
5225            started_at_unix_ms: self.inner.started_at_unix_ms,
5226            store: self.inner.db.stats(),
5227            system: SystemInfo::collect(),
5228            result_blob_cache: self.inner.result_blob_cache.stats(),
5229            kv: self.inner.kv_stats.snapshot(),
5230            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
5231        }
5232    }
5233
5234    pub(crate) fn record_metrics_ingest(
5235        &self,
5236        accepted_samples: u64,
5237        accepted_series: u64,
5238        rejected_samples: u64,
5239        rejected_series: u64,
5240    ) {
5241        self.inner.metrics_ingest_stats.record(
5242            accepted_samples,
5243            accepted_series,
5244            rejected_samples,
5245            rejected_series,
5246        );
5247    }
5248
5249    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
5250        self.inner
5251            .metrics_ingest_stats
5252            .record_cardinality_budget_rejections(rejected_series);
5253    }
5254
5255    pub(crate) fn record_metrics_tenant_activity(
5256        &self,
5257        tenant: &str,
5258        namespace: &str,
5259        operation: &str,
5260    ) {
5261        self.inner
5262            .metrics_tenant_activity_stats
5263            .record(tenant, namespace, operation);
5264    }
5265
5266    pub(crate) fn metrics_tenant_activity_snapshot(
5267        &self,
5268    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
5269        self.inner.metrics_tenant_activity_stats.snapshot()
5270    }
5271
5272    /// Execute a query under a typed scope override without embedding
5273    /// the tenant / user / role values into the SQL string. Use this
5274    /// from transport middleware (HTTP / gRPC / worker loops) where the
5275    /// scope is resolved from auth claims and the SQL is a parameterised
5276    /// template — avoids the string-concat injection risk of building
5277    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
5278    /// prepared statements that didn't know about tenancy.
5279    ///
5280    /// Precedence matches the `WITHIN` clause: the passed `scope`
5281    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
5282    /// The override is pushed on the thread-local scope stack for the
5283    /// duration of the call and popped on return — pool-shared
5284    /// connections cannot leak it across requests.
5285    pub fn execute_query_with_scope(
5286        &self,
5287        query: &str,
5288        scope: crate::runtime::within_clause::ScopeOverride,
5289    ) -> RedDBResult<RuntimeQueryResult> {
5290        if scope.is_empty() {
5291            return self.execute_query(query);
5292        }
5293        let _scope_guard = ScopeOverrideGuard::install(scope);
5294        self.execute_query(query)
5295    }
5296
5297    /// Issue #205 — single lifecycle exit for slow-query logging.
5298    ///
5299    /// `execute_query_inner` does the real work; this wrapper times it
5300    /// and, if elapsed exceeds the configured threshold, hands the
5301    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
5302    /// SlowQueryLogger. The threshold + sample_pct were captured at
5303    /// SlowQueryLogger construction (runtime startup), so the per-call
5304    /// cost on below-threshold paths is one relaxed atomic load.
5305    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
5306        let started = std::time::Instant::now();
5307        let result = self.execute_query_inner(query);
5308        let elapsed_ms = started.elapsed().as_millis() as u64;
5309
5310        // Build EffectiveScope from the same thread-locals frame-build
5311        // consults — keeps the slow-log row consistent with the audit /
5312        // RLS view of "this statement". `ai_scope()` is the canonical
5313        // builder.
5314        let scope = self.ai_scope();
5315        let kind = match result
5316            .as_ref()
5317            .map(|r| r.statement_type)
5318            .unwrap_or("select")
5319        {
5320            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
5321            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
5322            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
5323            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
5324            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
5325        };
5326        // SQL redaction: pass the raw query through. The slow-query
5327        // logger writes structured JSON so embedded literals stay
5328        // escape-safe at the JSON boundary (proven by
5329        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
5330        // PII redaction (e.g. literal masking) is a follow-up.
5331        self.inner
5332            .slow_query_logger
5333            .record(kind, elapsed_ms, query.to_string(), &scope);
5334
5335        result
5336    }
5337
5338    #[inline(never)]
5339    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
5340        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
5341        //
5342        // Moved above every boot-cost the normal path pays (WITHIN
5343        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
5344        // guard, tracing span, tx_contexts read) because the bench's
5345        // `select_point` scenario was observed at 28× vs PostgreSQL —
5346        // the dominant cost wasn't the entity fetch but the ceremony
5347        // before it. Only fires when there's no ambient transaction
5348        // context or WITHIN override, so the snapshot install we skip
5349        // truly is a no-op for this query.
5350        if !has_scope_override_active()
5351            && !query.trim_start().starts_with("WITHIN")
5352            && !query.trim_start().starts_with("within")
5353            && !self.inner.query_audit.has_rules()
5354            && !self
5355                .inner
5356                .tx_contexts
5357                .read()
5358                .contains_key(&current_connection_id())
5359        {
5360            if let Some(result) = self.try_fast_entity_lookup(query) {
5361                return result;
5362            }
5363        }
5364
5365        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
5366        // strip the prefix, push a stack-scoped override, recurse on
5367        // the inner statement, pop on return. Stack lives in a
5368        // thread-local but is balanced by the RAII guard, so a
5369        // pool-shared connection cannot leak the override across
5370        // requests and an early `?` return still pops cleanly.
5371        match crate::runtime::within_clause::try_strip_within_prefix(query) {
5372            Ok(Some((scope, inner))) => {
5373                let _scope_guard = ScopeOverrideGuard::install(scope);
5374                // Re-enter the inner path, NOT `execute_query`, so the
5375                // slow-query lifecycle hook records exactly one row per
5376                // top-level statement (the WITHIN-stripped form would
5377                // double-record).
5378                return self.execute_query_inner(inner);
5379            }
5380            Ok(None) => {}
5381            Err(msg) => return Err(RedDBError::Query(msg)),
5382        }
5383
5384        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
5385        // inner statement (WITHOUT executing it) and returns the
5386        // CanonicalLogicalNode tree as rows so the caller can see the
5387        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
5388        // is a distinct schema-diff command and continues down the
5389        // regular SQL path.
5390        if let Some(inner) = strip_explain_prefix(query) {
5391            return self.explain_as_rows(query, inner);
5392        }
5393
5394        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
5395        // override and return. Outside a transaction the statement is
5396        // an error (matches PG semantics: SET LOCAL only takes effect
5397        // within an active transaction).
5398        if let Some(value) = parse_set_local_tenant(query)? {
5399            let conn_id = current_connection_id();
5400            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
5401                return Err(RedDBError::Query(
5402                    "SET LOCAL TENANT requires an active transaction".to_string(),
5403                ));
5404            }
5405            self.inner
5406                .tx_local_tenants
5407                .write()
5408                .insert(conn_id, value.clone());
5409            return Ok(RuntimeQueryResult::ok_message(
5410                query.to_string(),
5411                &match &value {
5412                    Some(id) => format!("local tenant set: {id}"),
5413                    None => "local tenant cleared".to_string(),
5414                },
5415                "set_local_tenant",
5416            ));
5417        }
5418
5419        if super::red_schema::is_system_schema_write(query) {
5420            return Err(RedDBError::Query(
5421                super::red_schema::READ_ONLY_ERROR.to_string(),
5422            ));
5423        }
5424
5425        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
5426        let execution_query = rewritten_query.as_deref().unwrap_or(query);
5427
5428        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
5429        let _frame_guards = frame.install(self);
5430
5431        // Phase 6 logging: enter a span stamped with conn_id / tenant
5432        // / query_len. Every downstream tracing::info!/warn!/error!
5433        // inherits these fields — no need to thread them manually
5434        // through storage/scan layers. Entered AFTER the WITHIN /
5435        // SET LOCAL TENANT resolution above so the span reflects the
5436        // effective scope for this statement.
5437        let _log_span = crate::telemetry::span::query_span(query).entered();
5438
5439        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
5440        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
5441            return self.execute_query_expr(rewritten);
5442        }
5443
5444        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
5445        if !self.inner.query_audit.has_rules() {
5446            if let Some(result) = self.try_fast_entity_lookup(execution_query) {
5447                return result;
5448            }
5449        }
5450
5451        // ── Result cache: return cached result if still fresh (30s TTL) ──
5452        if !self.inner.query_audit.has_rules() {
5453            if let Some(result) = frame.read_result_cache(self) {
5454                return Ok(result);
5455            }
5456        }
5457
5458        let prepared = frame.prepare_statement(self, execution_query)?;
5459        let mode = prepared.mode;
5460        let expr = prepared.expr;
5461
5462        let statement = query_expr_name(&expr);
5463        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
5464        let control_event_specs = query_control_event_specs(&expr);
5465        let query_audit_plan = query_audit_plan(&expr);
5466
5467        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
5468            Ok(guard) => guard,
5469            Err(err) => {
5470                let outcome = control_event_outcome_for_error(&err);
5471                for spec in &control_event_specs {
5472                    self.emit_control_event(
5473                        spec.kind,
5474                        outcome,
5475                        spec.action,
5476                        spec.resource.clone(),
5477                        Some(err.to_string()),
5478                        spec.fields.clone(),
5479                    )?;
5480                }
5481                return Err(err);
5482            }
5483        };
5484        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
5485        let query_audit_started = std::time::Instant::now();
5486
5487        let query_result = match expr {
5488            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
5489                // Apply MVCC visibility + RLS gate while materialising the
5490                // graph: every node entity is screened against the source
5491                // collection's policy chain (basic and `Nodes`-targeted)
5492                // and dropped when the caller's tenant / role doesn't
5493                // admit it. Edges are pruned automatically because the
5494                // graph builder skips edges whose endpoints aren't in
5495                // `allowed_nodes`.
5496                let (graph, node_properties, edge_properties) =
5497                    self.materialize_graph_with_rls()?;
5498                let result =
5499                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
5500                        &graph,
5501                        &expr,
5502                        node_properties,
5503                        edge_properties,
5504                    )
5505                        .map_err(|err| RedDBError::Query(err.to_string()))?;
5506
5507                Ok(RuntimeQueryResult {
5508                    query: query.to_string(),
5509                    mode,
5510                    statement,
5511                    engine: "materialized-graph",
5512                    result,
5513                    affected_rows: 0,
5514                    statement_type: "select",
5515                })
5516            }
5517            QueryExpr::Table(table) => {
5518                let table = self.resolve_table_expr_subqueries(
5519                    table,
5520                    &frame as &dyn super::statement_frame::ReadFrame,
5521                )?;
5522                if super::red_schema::is_virtual_table(&table.table) {
5523                    return Ok(RuntimeQueryResult {
5524                        query: query.to_string(),
5525                        mode,
5526                        statement,
5527                        engine: "runtime-red-schema",
5528                        result: super::red_schema::red_query(
5529                            self,
5530                            &table.table,
5531                            &table,
5532                            &frame as &dyn super::statement_frame::ReadFrame,
5533                        )?,
5534                        affected_rows: 0,
5535                        statement_type: "select",
5536                    });
5537                }
5538
5539                if let Some(result) = self.execute_probabilistic_select(&table)? {
5540                    return Ok(RuntimeQueryResult {
5541                        query: query.to_string(),
5542                        mode,
5543                        statement,
5544                        engine: "runtime-probabilistic",
5545                        result,
5546                        affected_rows: 0,
5547                        statement_type: "select",
5548                    });
5549                }
5550
5551                // Foreign-table intercept (Phase 3.2.2 PG parity).
5552                //
5553                // When the referenced table matches a `CREATE FOREIGN TABLE`
5554                // registration, short-circuit into the FDW scan. Phase 3.2
5555                // wrappers don't yet support pushdown, so filters/projections
5556                // apply post-scan via `apply_foreign_table_filters` — good
5557                // enough for correctness; perf work lands in 3.2.3.
5558                if self.inner.foreign_tables.is_foreign_table(&table.table) {
5559                    let records = self
5560                        .inner
5561                        .foreign_tables
5562                        .scan(&table.table)
5563                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
5564                    let result = apply_foreign_table_filters(records, &table);
5565                    return Ok(RuntimeQueryResult {
5566                        query: query.to_string(),
5567                        mode,
5568                        statement,
5569                        engine: "runtime-fdw",
5570                        result,
5571                        affected_rows: 0,
5572                        statement_type: "select",
5573                    });
5574                }
5575
5576                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
5577                //
5578                // When RLS is enabled on this table, fetch every policy
5579                // that applies to the current (role, SELECT) pair and
5580                // fold them into the query's WHERE clause: policies
5581                // OR-combine (any of them admitting the row is enough),
5582                // then AND into the caller's existing filter.
5583                //
5584                // Anonymous callers (no thread-local identity) pass
5585                // `role = None`; policies with a specific `TO role`
5586                // clause skip, but `TO PUBLIC` policies still apply.
5587                //
5588                // When `inject_rls_filters` returns `None` the table has
5589                // RLS enabled but no policy admits the caller's role —
5590                // short-circuit with an empty result set instead of
5591                // synthesising a contradiction filter.
5592                let Some(table_with_rls) = self.authorize_relational_table_select(
5593                    table,
5594                    &frame as &dyn super::statement_frame::ReadFrame,
5595                )?
5596                else {
5597                    let empty = crate::storage::query::unified::UnifiedResult::empty();
5598                    return Ok(RuntimeQueryResult {
5599                        query: query.to_string(),
5600                        mode,
5601                        statement,
5602                        engine: "runtime-table-rls",
5603                        result: empty,
5604                        affected_rows: 0,
5605                        statement_type: "select",
5606                    });
5607                };
5608                Ok(RuntimeQueryResult {
5609                    query: query.to_string(),
5610                    mode,
5611                    statement,
5612                    engine: "runtime-table",
5613                    result: execute_runtime_table_query(
5614                        &self.inner.db,
5615                        &table_with_rls,
5616                        Some(&self.inner.index_store),
5617                    )?,
5618                    affected_rows: 0,
5619                    statement_type: "select",
5620                })
5621            }
5622            QueryExpr::Join(join) => {
5623                // Fold per-table RLS filters into each `QueryExpr::Table`
5624                // leaf of the join tree before executing. Without this
5625                // the join executor scans both tables raw and ignores
5626                // policies — a `WITHIN TENANT 'x'` against a join of
5627                // two tenant-scoped tables would leak cross-tenant rows.
5628                // When any leaf has RLS enabled and zero matching policy,
5629                // short-circuit to an empty join result instead of
5630                // emitting a contradiction filter.
5631                let join_with_rls = match self.authorize_relational_join_select(
5632                    join,
5633                    &frame as &dyn super::statement_frame::ReadFrame,
5634                )? {
5635                    Some(j) => j,
5636                    None => {
5637                        return Ok(RuntimeQueryResult {
5638                            query: query.to_string(),
5639                            mode,
5640                            statement,
5641                            engine: "runtime-join-rls",
5642                            result: crate::storage::query::unified::UnifiedResult::empty(),
5643                            affected_rows: 0,
5644                            statement_type: "select",
5645                        });
5646                    }
5647                };
5648                Ok(RuntimeQueryResult {
5649                    query: query.to_string(),
5650                    mode,
5651                    statement,
5652                    engine: "runtime-join",
5653                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
5654                    affected_rows: 0,
5655                    statement_type: "select",
5656                })
5657            }
5658            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
5659                query: query.to_string(),
5660                mode,
5661                statement,
5662                engine: "runtime-vector",
5663                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
5664                affected_rows: 0,
5665                statement_type: "select",
5666            }),
5667            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
5668                query: query.to_string(),
5669                mode,
5670                statement,
5671                engine: "runtime-hybrid",
5672                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
5673                affected_rows: 0,
5674                statement_type: "select",
5675            }),
5676            // DML execution
5677            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
5678                Err(RedDBError::Query(
5679                    super::red_schema::READ_ONLY_ERROR.to_string(),
5680                ))
5681            }
5682            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
5683                Err(RedDBError::Query(
5684                    super::red_schema::READ_ONLY_ERROR.to_string(),
5685                ))
5686            }
5687            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
5688                Err(RedDBError::Query(
5689                    super::red_schema::READ_ONLY_ERROR.to_string(),
5690                ))
5691            }
5692            QueryExpr::Insert(ref insert) => self
5693                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
5694                    self.execute_insert(query, insert)
5695                }),
5696            QueryExpr::Update(ref update) => self
5697                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
5698                    self.execute_update(query, update)
5699                }),
5700            QueryExpr::Delete(ref delete) => self
5701                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
5702                    self.execute_delete(query, delete)
5703                }),
5704            // DDL execution
5705            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
5706            QueryExpr::CreateCollection(ref create) => {
5707                self.execute_create_collection(query, create)
5708            }
5709            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
5710            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
5711            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
5712            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
5713            QueryExpr::DropDocument(ref drop_document) => {
5714                self.execute_drop_document(query, drop_document)
5715            }
5716            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
5717            QueryExpr::DropCollection(ref drop_collection) => {
5718                self.execute_drop_collection(query, drop_collection)
5719            }
5720            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
5721            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
5722            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
5723            // Graph analytics commands
5724            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
5725            // Search commands
5726            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
5727            // ASK: RAG query with LLM synthesis
5728            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
5729            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
5730            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
5731            QueryExpr::ProbabilisticCommand(ref cmd) => {
5732                self.execute_probabilistic_command(query, cmd)
5733            }
5734            // Time-series DDL
5735            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
5736            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
5737            // Queue DDL and commands
5738            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
5739            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
5740            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
5741            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
5742            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
5743            QueryExpr::EventsBackfill(ref backfill) => {
5744                self.execute_events_backfill(query, backfill)
5745            }
5746            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
5747                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
5748            ))),
5749            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
5750            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
5751            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
5752            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
5753            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
5754            // SET CONFIG key = value
5755            QueryExpr::SetConfig { ref key, ref value } => {
5756                if key.starts_with("red.secret.") {
5757                    return Err(RedDBError::Query(
5758                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
5759                    ));
5760                }
5761                match self.check_managed_config_write_for_set_config(key) {
5762                    Err(err) => Err(err),
5763                    Ok(()) => {
5764                        let store = self.inner.db.store();
5765                        let json_val = match value {
5766                            Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
5767                            Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
5768                            Value::Float(n) => crate::serde_json::Value::Number(*n),
5769                            Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
5770                            _ => crate::serde_json::Value::String(value.to_string()),
5771                        };
5772                        store.set_config_tree(key, &json_val);
5773                        update_current_config_value(key, value.clone());
5774                        // Config changes can flip runtime behavior mid-session
5775                        // (auto_decrypt, auto_encrypt, etc.) — invalidate the
5776                        // result cache so subsequent reads re-execute against
5777                        // the new config.
5778                        self.invalidate_result_cache();
5779                        Ok(RuntimeQueryResult::ok_message(
5780                            query.to_string(),
5781                            &format!("config set: {key}"),
5782                            "set",
5783                        ))
5784                    }
5785                }
5786            }
5787            // SET SECRET key = value
5788            QueryExpr::SetSecret { ref key, ref value } => {
5789                if key.starts_with("red.config.") {
5790                    return Err(RedDBError::Query(
5791                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
5792                    ));
5793                }
5794                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5795                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
5796                })?;
5797                if matches!(value, Value::Null) {
5798                    auth_store
5799                        .vault_kv_try_delete(key)
5800                        .map_err(|err| RedDBError::Query(err.to_string()))?;
5801                    update_current_secret_value(key, None);
5802                    self.invalidate_result_cache();
5803                    return Ok(RuntimeQueryResult::ok_message(
5804                        query.to_string(),
5805                        &format!("secret deleted: {key}"),
5806                        "delete_secret",
5807                    ));
5808                }
5809                let value = secret_sql_value_to_string(value)?;
5810                auth_store
5811                    .vault_kv_try_set(key.clone(), value.clone())
5812                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5813                update_current_secret_value(key, Some(value));
5814                self.invalidate_result_cache();
5815                Ok(RuntimeQueryResult::ok_message(
5816                    query.to_string(),
5817                    &format!("secret set: {key}"),
5818                    "set_secret",
5819                ))
5820            }
5821            // DELETE SECRET key
5822            QueryExpr::DeleteSecret { ref key } => {
5823                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5824                    RedDBError::Query(
5825                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
5826                    )
5827                })?;
5828                let deleted = auth_store
5829                    .vault_kv_try_delete(key)
5830                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5831                if deleted {
5832                    update_current_secret_value(key, None);
5833                }
5834                self.invalidate_result_cache();
5835                Ok(RuntimeQueryResult::ok_message(
5836                    query.to_string(),
5837                    &format!("secret deleted: {key}"),
5838                    if deleted {
5839                        "delete_secret"
5840                    } else {
5841                        "delete_secret_not_found"
5842                    },
5843                ))
5844            }
5845            // SHOW SECRET[S] [prefix]
5846            QueryExpr::ShowSecrets { ref prefix } => {
5847                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5848                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
5849                })?;
5850                if !auth_store.is_vault_backed() {
5851                    return Err(RedDBError::Query(
5852                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
5853                    ));
5854                }
5855                let mut keys = auth_store.vault_kv_keys();
5856                keys.sort();
5857                let mut result = UnifiedResult::with_columns(vec![
5858                    "key".into(),
5859                    "value".into(),
5860                    "status".into(),
5861                ]);
5862                for key in keys {
5863                    if let Some(ref pfx) = prefix {
5864                        if !key.starts_with(pfx) {
5865                            continue;
5866                        }
5867                    }
5868                    let mut record = UnifiedRecord::new();
5869                    record.set("key", Value::text(key));
5870                    record.set("value", Value::text("***"));
5871                    record.set("status", Value::text("active"));
5872                    result.push(record);
5873                }
5874                Ok(RuntimeQueryResult {
5875                    query: query.to_string(),
5876                    mode,
5877                    statement: "show_secrets",
5878                    engine: "runtime-secret",
5879                    result,
5880                    affected_rows: 0,
5881                    statement_type: "select",
5882                })
5883            }
5884            // SHOW CONFIG [prefix]
5885            QueryExpr::ShowConfig { ref prefix } => {
5886                let store = self.inner.db.store();
5887                let all_collections = store.list_collections();
5888                if !all_collections.contains(&"red_config".to_string()) {
5889                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5890                    return Ok(RuntimeQueryResult {
5891                        query: query.to_string(),
5892                        mode,
5893                        statement: "show_config",
5894                        engine: "runtime-config",
5895                        result,
5896                        affected_rows: 0,
5897                        statement_type: "select",
5898                    });
5899                }
5900                let manager = store
5901                    .get_collection("red_config")
5902                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
5903                let entities = manager.query_all(|_| true);
5904                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
5905                for entity in entities {
5906                    if let EntityData::Row(ref row) = entity.data {
5907                        if let Some(ref named) = row.named {
5908                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
5909                            let val = named.get("value").cloned().unwrap_or(Value::Null);
5910                            let key_str = match &key_val {
5911                                Value::Text(s) => s.as_ref(),
5912                                _ => continue,
5913                            };
5914                            if let Some(ref pfx) = prefix {
5915                                if !key_str.starts_with(pfx.as_str()) {
5916                                    continue;
5917                                }
5918                            }
5919                            let entity_id = entity.id.raw();
5920                            match latest.get(key_str) {
5921                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
5922                                _ => {
5923                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
5924                                }
5925                            }
5926                        }
5927                    }
5928                }
5929                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5930                for (_, key_val, val) in latest.into_values() {
5931                    let mut record = UnifiedRecord::new();
5932                    record.set("key", key_val);
5933                    record.set("value", val);
5934                    result.push(record);
5935                }
5936                Ok(RuntimeQueryResult {
5937                    query: query.to_string(),
5938                    mode,
5939                    statement: "show_config",
5940                    engine: "runtime-config",
5941                    result,
5942                    affected_rows: 0,
5943                    statement_type: "select",
5944                })
5945            }
5946            // Session-local multi-tenancy handle (Phase 2.5.3).
5947            //
5948            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
5949            // the thread-local; SHOW TENANT returns it. Paired with the
5950            // CURRENT_TENANT() scalar for use in RLS policies.
5951            QueryExpr::SetTenant(ref value) => {
5952                match value {
5953                    Some(id) => set_current_tenant(id.clone()),
5954                    None => clear_current_tenant(),
5955                }
5956                Ok(RuntimeQueryResult::ok_message(
5957                    query.to_string(),
5958                    &match value {
5959                        Some(id) => format!("tenant set: {id}"),
5960                        None => "tenant cleared".to_string(),
5961                    },
5962                    "set_tenant",
5963                ))
5964            }
5965            QueryExpr::ShowTenant => {
5966                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
5967                let mut record = UnifiedRecord::new();
5968                record.set(
5969                    "tenant",
5970                    current_tenant().map(Value::text).unwrap_or(Value::Null),
5971                );
5972                result.push(record);
5973                Ok(RuntimeQueryResult {
5974                    query: query.to_string(),
5975                    mode,
5976                    statement: "show_tenant",
5977                    engine: "runtime-tenant",
5978                    result,
5979                    affected_rows: 0,
5980                    statement_type: "select",
5981                })
5982            }
5983            // Transaction control (Phase 2.3 PG parity).
5984            //
5985            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
5986            // the current connection's id. COMMIT/ROLLBACK release it through
5987            // the `SnapshotManager` so future snapshots see the correct set of
5988            // active/aborted transactions.
5989            //
5990            // Tuple stamping (xmin/xmax) and read-path visibility filtering are
5991            // Phase 2.3.2 work; on COMMIT/ROLLBACK this dispatch also settles the
5992            // connection's pending updates, tombstones and WAL actions. Statements
5993            // outside a TxnContext still behave as autocommit (xid=0 → visible to every snapshot).
5994            QueryExpr::TransactionControl(ref ctl) => {
5995                use crate::storage::query::ast::TxnControl;
5996                use crate::storage::transaction::snapshot::{TxnContext, Xid};
5997                use crate::storage::transaction::IsolationLevel;
5998
5999                // Phase 2.3 keys transactions by a thread-local connection id.
6000                // The stdio/gRPC handlers set a real per-connection id before
6001                // dispatching; embedded use (one RedDBRuntime per process-ish
6002                // caller) falls back to the deterministic placeholder `0`.
6003                let conn_id = current_connection_id();
6004
6005                let (kind, msg) = match ctl {
6006                    TxnControl::Begin => {
6007                        let mgr = Arc::clone(&self.inner.snapshot_manager);
6008                        let xid = mgr.begin();
6009                        let snapshot = mgr.snapshot(xid);
6010                        let ctx = TxnContext {
6011                            xid,
6012                            isolation: IsolationLevel::SnapshotIsolation,
6013                            snapshot,
6014                            savepoints: Vec::new(),
6015                            released_sub_xids: Vec::new(),
6016                        };
6017                        self.inner.tx_contexts.write().insert(conn_id, ctx);
6018                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
6019                    }
6020                    TxnControl::Commit => {
6021                        // SET LOCAL TENANT ends with the transaction.
6022                        self.inner.tx_local_tenants.write().remove(&conn_id);
6023                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
6024                        match ctx {
6025                            Some(ctx) => {
6026                                let mut own_xids = std::collections::HashSet::new();
6027                                own_xids.insert(ctx.xid);
6028                                for (_, sub) in &ctx.savepoints {
6029                                    own_xids.insert(*sub);
6030                                }
6031                                for sub in &ctx.released_sub_xids {
6032                                    own_xids.insert(*sub);
6033                                }
6034                                if let Err(err) = self.check_table_row_write_conflicts(
6035                                    conn_id,
6036                                    &ctx.snapshot,
6037                                    &own_xids,
6038                                ) {
6039                                    for (_, sub) in &ctx.savepoints {
6040                                        self.inner.snapshot_manager.rollback(*sub);
6041                                    }
6042                                    for sub in &ctx.released_sub_xids {
6043                                        self.inner.snapshot_manager.rollback(*sub);
6044                                    }
6045                                    self.inner.snapshot_manager.rollback(ctx.xid);
6046                                    self.revive_pending_versioned_updates(conn_id);
6047                                    self.revive_pending_tombstones(conn_id);
6048                                    self.discard_pending_kv_watch_events(conn_id);
6049                                    self.discard_pending_store_wal_actions(conn_id);
6050                                    return Err(err);
6051                                }
6052                                self.restore_pending_write_stamps(conn_id);
6053                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
6054                                    for (_, sub) in &ctx.savepoints {
6055                                        self.inner.snapshot_manager.rollback(*sub);
6056                                    }
6057                                    for sub in &ctx.released_sub_xids {
6058                                        self.inner.snapshot_manager.rollback(*sub);
6059                                    }
6060                                    self.inner.snapshot_manager.rollback(ctx.xid);
6061                                    self.revive_pending_versioned_updates(conn_id);
6062                                    self.revive_pending_tombstones(conn_id);
6063                                    self.discard_pending_kv_watch_events(conn_id);
6064                                    return Err(err);
6065                                }
6066                                // Phase 2.3.2e: commit every open sub-xid
6067                                // so they also become visible. Their
6068                                // work is promoted to the parent txn's
6069                                // result exactly like a RELEASE would
6070                                // have done.
6071                                for (_, sub) in &ctx.savepoints {
6072                                    self.inner.snapshot_manager.commit(*sub);
6073                                }
6074                                for sub in &ctx.released_sub_xids {
6075                                    self.inner.snapshot_manager.commit(*sub);
6076                                }
6077                                self.inner.snapshot_manager.commit(ctx.xid);
6078                                self.finalize_pending_versioned_updates(conn_id);
6079                                self.finalize_pending_tombstones(conn_id);
6080                                self.finalize_pending_kv_watch_events(conn_id);
6081                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
6082                            }
6083                            None => (
6084                                "commit",
6085                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
6086                            ),
6087                        }
6088                    }
6089                    TxnControl::Rollback => {
6090                        self.inner.tx_local_tenants.write().remove(&conn_id);
6091                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
6092                        match ctx {
6093                            Some(ctx) => {
6094                                // Phase 2.3.2e: abort every open sub-xid
6095                                // too so their writes stay hidden.
6096                                for (_, sub) in &ctx.savepoints {
6097                                    self.inner.snapshot_manager.rollback(*sub);
6098                                }
6099                                for sub in &ctx.released_sub_xids {
6100                                    self.inner.snapshot_manager.rollback(*sub);
6101                                }
6102                                self.inner.snapshot_manager.rollback(ctx.xid);
6103                                // Phase 2.3.2b: tuples that the txn had
6104                                // xmax-stamped become live again — wipe xmax
6105                                // back to 0 so later snapshots see them.
6106                                self.revive_pending_versioned_updates(conn_id);
6107                                self.revive_pending_tombstones(conn_id);
6108                                self.discard_pending_kv_watch_events(conn_id);
6109                                self.discard_pending_store_wal_actions(conn_id);
6110                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
6111                            }
6112                            None => (
6113                                "rollback",
6114                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
6115                            ),
6116                        }
6117                    }
6118                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
6119                    // SAVEPOINT allocates a fresh xid and pushes it
6120                    // onto the per-txn stack so subsequent writes can
6121                    // be selectively rolled back. RELEASE pops without
6122                    // aborting; ROLLBACK TO aborts the sub-xid (and any
6123                    // nested ones), reverts their updates + revives tombstones.
6124                    TxnControl::Savepoint(name) => {
6125                        let mgr = Arc::clone(&self.inner.snapshot_manager);
6126                        let mut guard = self.inner.tx_contexts.write();
6127                        match guard.get_mut(&conn_id) {
6128                            Some(ctx) => {
6129                                let sub = mgr.begin();
6130                                ctx.savepoints.push((name.clone(), sub));
6131                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
6132                            }
6133                            None => (
6134                                "savepoint",
6135                                "SAVEPOINT outside transaction — no-op".to_string(),
6136                            ),
6137                        }
6138                    }
6139                    TxnControl::ReleaseSavepoint(name) => {
6140                        let mut guard = self.inner.tx_contexts.write();
6141                        match guard.get_mut(&conn_id) {
6142                            Some(ctx) => {
6143                                let pos = ctx
6144                                    .savepoints
6145                                    .iter()
6146                                    .position(|(n, _)| n == name)
6147                                    .ok_or_else(|| {
6148                                        RedDBError::Internal(format!(
6149                                            "savepoint {name} does not exist"
6150                                        ))
6151                                    })?;
6152                                // RELEASE pops the named savepoint and
6153                                // any nested ones. Their sub-xids move
6154                                // to `released_sub_xids` so they commit
6155                                // (or roll back) alongside the parent
6156                                // xid — PG semantics: released
6157                                // savepoints still contribute their
6158                                // work, but their names are gone.
6159                                let released = ctx.savepoints.len() - pos;
6160                                let popped: Vec<Xid> = ctx
6161                                    .savepoints
6162                                    .split_off(pos)
6163                                    .into_iter()
6164                                    .map(|(_, x)| x)
6165                                    .collect();
6166                                ctx.released_sub_xids.extend(popped);
6167                                (
6168                                    "release_savepoint",
6169                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
6170                                )
6171                            }
6172                            None => (
6173                                "release_savepoint",
6174                                "RELEASE outside transaction — no-op".to_string(),
6175                            ),
6176                        }
6177                    }
6178                    TxnControl::RollbackToSavepoint(name) => {
6179                        let mgr = Arc::clone(&self.inner.snapshot_manager);
6180                        // Splice out the savepoint + nested ones under
6181                        // a narrow lock, then run the snapshot-manager
6182                        // + tombstone side-effects without the tx map
6183                        // held so nothing re-enters.
6184                        let drop_result: Option<(Xid, Vec<Xid>)> = {
6185                            let mut guard = self.inner.tx_contexts.write();
6186                            if let Some(ctx) = guard.get_mut(&conn_id) {
6187                                let pos = ctx
6188                                    .savepoints
6189                                    .iter()
6190                                    .position(|(n, _)| n == name)
6191                                    .ok_or_else(|| {
6192                                        RedDBError::Internal(format!(
6193                                            "savepoint {name} does not exist"
6194                                        ))
6195                                    })?;
6196                                let savepoint_xid = ctx.savepoints[pos].1;
6197                                let aborted: Vec<Xid> = ctx
6198                                    .savepoints
6199                                    .split_off(pos)
6200                                    .into_iter()
6201                                    .map(|(_, x)| x)
6202                                    .collect();
6203                                Some((savepoint_xid, aborted))
6204                            } else {
6205                                None
6206                            }
6207                        };
6208
6209                        match drop_result {
6210                            Some((savepoint_xid, aborted)) => {
6211                                for x in &aborted {
6212                                    mgr.rollback(*x);
6213                                }
6214                                let reverted_updates =
6215                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
6216                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
6217                                (
6218                                    "rollback_to_savepoint",
6219                                    format!(
6220                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
6221                                        aborted.len(),
6222                                    ),
6223                                )
6224                            }
6225                            None => (
6226                                "rollback_to_savepoint",
6227                                "ROLLBACK TO outside transaction — no-op".to_string(),
6228                            ),
6229                        }
6230                    }
6231                };
6232                Ok(RuntimeQueryResult::ok_message(
6233                    query.to_string(),
6234                    &msg,
6235                    kind,
6236                ))
6237            }
6238            // Schema + Sequence DDL (Phase 1.3 PG parity).
6239            //
6240            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
6241            // just registers the name in `red_config` under `schema.{name}`.
6242            // Table lookups still happen by collection name; clients using
6243            // `schema.table` qualified names collapse to collection `schema.table`.
6244            //
6245            // Sequences persist a 64-bit counter + metadata (start, increment)
6246            // in `red_config` under `sequence.{name}.*`. Scalar callers
6247            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
6248            // once we have a proper mutating-function dispatch path; for now the
6249            // DDL just establishes the catalog entry so clients don't error.
6250            QueryExpr::CreateSchema(ref q) => {
6251                let store = self.inner.db.store();
6252                let key = format!("schema.{}", q.name);
6253                if store.get_config(&key).is_some() {
6254                    if q.if_not_exists {
6255                        return Ok(RuntimeQueryResult::ok_message(
6256                            query.to_string(),
6257                            &format!("schema {} already exists — skipped", q.name),
6258                            "create_schema",
6259                        ));
6260                    }
6261                    return Err(RedDBError::Internal(format!(
6262                        "schema {} already exists",
6263                        q.name
6264                    )));
6265                }
6266                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
6267                Ok(RuntimeQueryResult::ok_message(
6268                    query.to_string(),
6269                    &format!("schema {} created", q.name),
6270                    "create_schema",
6271                ))
6272            }
6273            QueryExpr::DropSchema(ref q) => {
6274                let store = self.inner.db.store();
6275                let key = format!("schema.{}", q.name);
6276                let existed = store.get_config(&key).is_some();
6277                if !existed && !q.if_exists {
6278                    return Err(RedDBError::Internal(format!(
6279                        "schema {} does not exist",
6280                        q.name
6281                    )));
6282                }
6283                // Remove marker from red_config via set to null.
6284                store.set_config_tree(&key, &crate::serde_json::Value::Null);
6285                let suffix = if q.cascade {
6286                    " (CASCADE accepted — tables untouched)"
6287                } else {
6288                    ""
6289                };
6290                Ok(RuntimeQueryResult::ok_message(
6291                    query.to_string(),
6292                    &format!("schema {} dropped{}", q.name, suffix),
6293                    "drop_schema",
6294                ))
6295            }
6296            QueryExpr::CreateSequence(ref q) => {
6297                let store = self.inner.db.store();
6298                let base = format!("sequence.{}", q.name);
6299                let start_key = format!("{base}.start");
6300                let incr_key = format!("{base}.increment");
6301                let curr_key = format!("{base}.current");
6302                if store.get_config(&start_key).is_some() {
6303                    if q.if_not_exists {
6304                        return Ok(RuntimeQueryResult::ok_message(
6305                            query.to_string(),
6306                            &format!("sequence {} already exists — skipped", q.name),
6307                            "create_sequence",
6308                        ));
6309                    }
6310                    return Err(RedDBError::Internal(format!(
6311                        "sequence {} already exists",
6312                        q.name
6313                    )));
6314                }
6315                // Persist start + increment, and set current so the first
6316                // nextval returns `start`.
6317                let initial_current = q.start - q.increment;
6318                store.set_config_tree(
6319                    &start_key,
6320                    &crate::serde_json::Value::Number(q.start as f64),
6321                );
6322                store.set_config_tree(
6323                    &incr_key,
6324                    &crate::serde_json::Value::Number(q.increment as f64),
6325                );
6326                store.set_config_tree(
6327                    &curr_key,
6328                    &crate::serde_json::Value::Number(initial_current as f64),
6329                );
6330                Ok(RuntimeQueryResult::ok_message(
6331                    query.to_string(),
6332                    &format!(
6333                        "sequence {} created (start={}, increment={})",
6334                        q.name, q.start, q.increment
6335                    ),
6336                    "create_sequence",
6337                ))
6338            }
6339            QueryExpr::DropSequence(ref q) => {
6340                let store = self.inner.db.store();
6341                let base = format!("sequence.{}", q.name);
6342                let existed = store.get_config(&format!("{base}.start")).is_some();
6343                if !existed && !q.if_exists {
6344                    return Err(RedDBError::Internal(format!(
6345                        "sequence {} does not exist",
6346                        q.name
6347                    )));
6348                }
6349                for k in ["start", "increment", "current"] {
6350                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
6351                }
6352                Ok(RuntimeQueryResult::ok_message(
6353                    query.to_string(),
6354                    &format!("sequence {} dropped", q.name),
6355                    "drop_sequence",
6356                ))
6357            }
6358            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
6359            //
6360            // The view definition is stored in-memory on RuntimeInner (not
6361            // persisted). SELECTs that reference the view name will substitute
6362            // the stored `QueryExpr` via `resolve_view_reference` during
6363            // planning (same entry point used by table-name resolution).
6364            //
6365            // Materialized views additionally allocate a slot in
6366            // `MaterializedViewCache`; a REFRESH repopulates that slot.
6367            QueryExpr::CreateView(ref q) => {
6368                let mut views = self.inner.views.write();
6369                if views.contains_key(&q.name) && !q.or_replace {
6370                    if q.if_not_exists {
6371                        return Ok(RuntimeQueryResult::ok_message(
6372                            query.to_string(),
6373                            &format!("view {} already exists — skipped", q.name),
6374                            "create_view",
6375                        ));
6376                    }
6377                    return Err(RedDBError::Internal(format!(
6378                        "view {} already exists",
6379                        q.name
6380                    )));
6381                }
6382                views.insert(q.name.clone(), Arc::new(q.clone()));
6383                drop(views);
6384
6385                // Materialized view: register cache slot (data is empty until REFRESH).
6386                if q.materialized {
6387                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
6388                    let refresh = match q.refresh_every_ms {
6389                        Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
6390                        None => RefreshPolicy::Manual,
6391                    };
6392                    let dependencies = collect_table_refs(&q.query);
6393                    let def = MaterializedViewDef {
6394                        name: q.name.clone(),
6395                        query: format!("<parsed view {}>", q.name),
6396                        dependencies: dependencies.clone(),
6397                        refresh,
6398                        retention_duration_ms: q.retention_duration_ms,
6399                    };
6400                    self.inner.materialized_views.write().register(def);
6401
6402                    // Issue #593 slice 9a — persist the descriptor to
6403                    // the system catalog so the definition survives a
6404                    // restart. Upsert semantics (delete-then-insert by
6405                    // name) keep the catalog free of duplicate rows
6406                    // across `CREATE OR REPLACE` churn.
6407                    let descriptor =
6408                        crate::runtime::continuous_materialized_view::MaterializedViewDescriptor {
6409                            name: q.name.clone(),
6410                            source_sql: query.to_string(),
6411                            source_collections: dependencies,
6412                            refresh_every_ms: q.refresh_every_ms,
6413                            retention_duration_ms: q.retention_duration_ms,
6414                        };
6415                    let store = self.inner.db.store();
6416                    crate::runtime::continuous_materialized_view::persist_descriptor(
6417                        store.as_ref(),
6418                        &descriptor,
6419                    )?;
6420
6421                    // Issue #594 slice 9b — provision a Table-shaped
6422                    // backing collection named after the view. The
6423                    // rewriter skips materialized views (see
6424                    // `rewrite_view_refs_inner`) so `SELECT FROM v`
6425                    // resolves to this collection directly. Empty
6426                    // until REFRESH wires through it in 9c.
6427                    self.ensure_materialized_view_backing(&q.name)?;
6428                }
6429                // Plan cache may have cached a plan that didn't know about this
6430                // view — invalidate so future references pick up the new binding.
6431                // Result cache gets flushed too: OR REPLACE must not serve a
6432                // prior execution of the obsolete body.
6433                self.invalidate_plan_cache();
6434                self.invalidate_result_cache();
6435
6436                Ok(RuntimeQueryResult::ok_message(
6437                    query.to_string(),
6438                    &format!(
6439                        "{}view {} created",
6440                        if q.materialized { "materialized " } else { "" },
6441                        q.name
6442                    ),
6443                    "create_view",
6444                ))
6445            }
6446            QueryExpr::DropView(ref q) => {
6447                let mut views = self.inner.views.write();
6448                let removed = views.remove(&q.name);
6449                let existed = removed.is_some();
6450                let removed_materialized =
6451                    removed.as_ref().map(|v| v.materialized).unwrap_or(false);
6452                drop(views);
6453                if q.materialized || existed {
6454                    // Try the materialised cache too — silent if absent.
6455                    self.inner.materialized_views.write().remove(&q.name);
6456                    // Issue #593 slice 9a — remove any persisted
6457                    // catalog row. Idempotent: a no-op when the view
6458                    // was never materialized (no row was ever written).
6459                    let store = self.inner.db.store();
6460                    crate::runtime::continuous_materialized_view::remove_by_name(
6461                        store.as_ref(),
6462                        &q.name,
6463                    )?;
6464                }
6465                // Issue #594 slice 9b — drop the backing collection
6466                // that was provisioned at CREATE time. Only mat views
6467                // ever had one; regular views never did.
6468                if removed_materialized || q.materialized {
6469                    self.drop_materialized_view_backing(&q.name)?;
6470                }
6471                // Drop any plan / result cache entries that baked the
6472                // view body into their QueryExpr.
6473                self.invalidate_plan_cache();
6474                self.invalidate_result_cache();
6475                if !existed && !q.if_exists {
6476                    return Err(RedDBError::Internal(format!(
6477                        "view {} does not exist",
6478                        q.name
6479                    )));
6480                }
6481                self.invalidate_plan_cache();
6482                Ok(RuntimeQueryResult::ok_message(
6483                    query.to_string(),
6484                    &format!("view {} dropped", q.name),
6485                    "drop_view",
6486                ))
6487            }
6488            QueryExpr::RefreshMaterializedView(ref q) => {
6489                // Look up the view definition, execute its underlying query,
6490                // and stash the serialized result in the materialised cache.
6491                let view = {
6492                    let views = self.inner.views.read();
6493                    views.get(&q.name).cloned()
6494                };
6495                let view = match view {
6496                    Some(v) => v,
6497                    None => {
6498                        return Err(RedDBError::Internal(format!(
6499                            "view {} does not exist",
6500                            q.name
6501                        )))
6502                    }
6503                };
6504                if !view.materialized {
6505                    return Err(RedDBError::Internal(format!(
6506                        "view {} is not materialized — REFRESH requires \
6507                         CREATE MATERIALIZED VIEW",
6508                        q.name
6509                    )));
6510                }
6511                // Execute the underlying query fresh.
6512                let started = std::time::Instant::now();
6513                let now_ms = std::time::SystemTime::now()
6514                    .duration_since(std::time::UNIX_EPOCH)
6515                    .map(|d| d.as_millis() as u64)
6516                    .unwrap_or(0);
6517                match self.execute_query_expr((*view.query).clone()) {
6518                    Ok(inner_result) => {
6519                        // Issue #595 slice 9c — atomically replace the
6520                        // backing collection's contents under a single
6521                        // WAL group. Concurrent SELECT from the view
6522                        // sees either the prior or new contents, never
6523                        // partial. A crash before the WAL commit lands
6524                        // leaves the prior contents intact on recovery.
6525                        let entities =
6526                            view_records_to_entities(&q.name, &inner_result.result.records);
6527                        let row_count = entities.len() as u64;
6528                        let store = self.inner.db.store();
6529                        let serialized_records = match store.refresh_collection(&q.name, entities) {
6530                            Ok(records) => records,
6531                            Err(err) => {
6532                                let duration_ms = started.elapsed().as_millis() as u64;
6533                                let msg = err.to_string();
6534                                self.inner
6535                                    .materialized_views
6536                                    .write()
6537                                    .record_refresh_failure(
6538                                        &q.name,
6539                                        msg.clone(),
6540                                        duration_ms,
6541                                        now_ms,
6542                                    );
6543                                return Err(RedDBError::Internal(format!(
6544                                    "REFRESH MATERIALIZED VIEW {}: {msg}",
6545                                    q.name
6546                                )));
6547                            }
6548                        };
6549
6550                        // Issue #596 slice 9d — emit a Refresh
6551                        // ChangeRecord into the logical-WAL spool so
6552                        // replicas deterministically replay the same
6553                        // backing-collection contents via
6554                        // `LogicalChangeApplier::apply_record`.
6555                        if let Some(ref primary) = self.inner.db.replication {
6556                            let lsn = self.inner.cdc.emit(
6557                                crate::replication::cdc::ChangeOperation::Refresh,
6558                                &q.name,
6559                                0,
6560                                "refresh",
6561                            );
6562                            self.invalidate_result_cache_for_table(&q.name);
6563                            let timestamp = std::time::SystemTime::now()
6564                                .duration_since(std::time::UNIX_EPOCH)
6565                                .unwrap_or_default()
6566                                .as_millis() as u64;
6567                            let record = ChangeRecord::for_refresh(
6568                                lsn,
6569                                timestamp,
6570                                q.name.clone(),
6571                                serialized_records,
6572                            );
6573                            let encoded = record.encode();
6574                            primary.wal_buffer.append(record.lsn, encoded.clone());
6575                            if let Some(spool) = &primary.logical_wal_spool {
6576                                let _ = spool.append(record.lsn, &encoded);
6577                            }
6578                        }
6579
6580                        let duration_ms = started.elapsed().as_millis() as u64;
6581                        let serialized = format!("{:?}", inner_result.result);
6582                        self.inner
6583                            .materialized_views
6584                            .write()
6585                            .record_refresh_success(
6586                                &q.name,
6587                                serialized.into_bytes(),
6588                                row_count,
6589                                duration_ms,
6590                                now_ms,
6591                            );
6592                        // SELECT FROM v now reads through the rewriter
6593                        // skip into the backing collection — drop the
6594                        // result cache so prior empty-backing reads
6595                        // don't shadow the new contents.
6596                        self.invalidate_result_cache();
6597                        Ok(RuntimeQueryResult::ok_message(
6598                            query.to_string(),
6599                            &format!("materialized view {} refreshed", q.name),
6600                            "refresh_materialized_view",
6601                        ))
6602                    }
6603                    Err(err) => {
6604                        let duration_ms = started.elapsed().as_millis() as u64;
6605                        let msg = err.to_string();
6606                        self.inner
6607                            .materialized_views
6608                            .write()
6609                            .record_refresh_failure(&q.name, msg.clone(), duration_ms, now_ms);
6610                        Err(err)
6611                    }
6612                }
6613            }
6614            // Row Level Security (Phase 2.5 PG parity).
6615            //
6616            // Policies live in an in-memory registry keyed by (table, name).
6617            // Enforcement (AND-ing the policy's USING clause into every
6618            // query's WHERE for the table) arrives in Phase 2.5.2 via the
6619            // filter compiler; this dispatch only manages the catalog.
6620            QueryExpr::CreatePolicy(ref q) => {
6621                let key = (q.table.clone(), q.name.clone());
6622                self.inner
6623                    .rls_policies
6624                    .write()
6625                    .insert(key, Arc::new(q.clone()));
6626                self.invalidate_plan_cache();
6627                // Issue #120 — surface policy names in the
6628                // schema-vocabulary so AskPipeline (#121) can resolve
6629                // a policy reference back to its table.
6630                self.schema_vocabulary_apply(
6631                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
6632                        collection: q.table.clone(),
6633                        policy: q.name.clone(),
6634                    },
6635                );
6636                Ok(RuntimeQueryResult::ok_message(
6637                    query.to_string(),
6638                    &format!("policy {} on {} created", q.name, q.table),
6639                    "create_policy",
6640                ))
6641            }
6642            QueryExpr::DropPolicy(ref q) => {
6643                let removed = self
6644                    .inner
6645                    .rls_policies
6646                    .write()
6647                    .remove(&(q.table.clone(), q.name.clone()))
6648                    .is_some();
6649                if !removed && !q.if_exists {
6650                    return Err(RedDBError::Internal(format!(
6651                        "policy {} on {} does not exist",
6652                        q.name, q.table
6653                    )));
6654                }
6655                self.invalidate_plan_cache();
6656                // Issue #120 — keep the schema-vocabulary policy
6657                // entry in sync.
6658                self.schema_vocabulary_apply(
6659                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
6660                        collection: q.table.clone(),
6661                        policy: q.name.clone(),
6662                    },
6663                );
6664                Ok(RuntimeQueryResult::ok_message(
6665                    query.to_string(),
6666                    &format!("policy {} on {} dropped", q.name, q.table),
6667                    "drop_policy",
6668                ))
6669            }
6670            // Foreign Data Wrappers (Phase 3.2 PG parity).
6671            //
6672            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
6673            // `ForeignTableRegistry`. The read path consults that registry
6674            // before dispatching a SELECT — when the table name matches a
6675            // registered foreign table, we forward the scan to the wrapper
6676            // and skip the normal collection lookup.
6677            //
6678            // Phase 3.2 is in-memory only; persistence across restarts is a
6679            // 3.2.2 follow-up that mirrors the view registry pattern.
6680            QueryExpr::CreateServer(ref q) => {
6681                use crate::storage::fdw::FdwOptions;
6682                let registry = Arc::clone(&self.inner.foreign_tables);
6683                if registry.server(&q.name).is_some() {
6684                    if q.if_not_exists {
6685                        return Ok(RuntimeQueryResult::ok_message(
6686                            query.to_string(),
6687                            &format!("server {} already exists — skipped", q.name),
6688                            "create_server",
6689                        ));
6690                    }
6691                    return Err(RedDBError::Internal(format!(
6692                        "server {} already exists",
6693                        q.name
6694                    )));
6695                }
6696                let mut opts = FdwOptions::new();
6697                for (k, v) in &q.options {
6698                    opts.values.insert(k.clone(), v.clone());
6699                }
6700                registry
6701                    .create_server(&q.name, &q.wrapper, opts)
6702                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
6703                Ok(RuntimeQueryResult::ok_message(
6704                    query.to_string(),
6705                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
6706                    "create_server",
6707                ))
6708            }
6709            QueryExpr::DropServer(ref q) => {
6710                let existed = self.inner.foreign_tables.drop_server(&q.name);
6711                if !existed && !q.if_exists {
6712                    return Err(RedDBError::Internal(format!(
6713                        "server {} does not exist",
6714                        q.name
6715                    )));
6716                }
6717                Ok(RuntimeQueryResult::ok_message(
6718                    query.to_string(),
6719                    &format!(
6720                        "server {} dropped{}",
6721                        q.name,
6722                        if q.cascade { " (cascade)" } else { "" }
6723                    ),
6724                    "drop_server",
6725                ))
6726            }
6727            QueryExpr::CreateForeignTable(ref q) => {
6728                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
6729                let registry = Arc::clone(&self.inner.foreign_tables);
6730                if registry.foreign_table(&q.name).is_some() {
6731                    if q.if_not_exists {
6732                        return Ok(RuntimeQueryResult::ok_message(
6733                            query.to_string(),
6734                            &format!("foreign table {} already exists — skipped", q.name),
6735                            "create_foreign_table",
6736                        ));
6737                    }
6738                    return Err(RedDBError::Internal(format!(
6739                        "foreign table {} already exists",
6740                        q.name
6741                    )));
6742                }
6743                let mut opts = FdwOptions::new();
6744                for (k, v) in &q.options {
6745                    opts.values.insert(k.clone(), v.clone());
6746                }
6747                let columns: Vec<ForeignColumn> = q
6748                    .columns
6749                    .iter()
6750                    .map(|c| ForeignColumn {
6751                        name: c.name.clone(),
6752                        data_type: c.data_type.clone(),
6753                        not_null: c.not_null,
6754                    })
6755                    .collect();
6756                registry
6757                    .create_foreign_table(ForeignTable {
6758                        name: q.name.clone(),
6759                        server_name: q.server.clone(),
6760                        columns,
6761                        options: opts,
6762                    })
6763                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
6764                self.invalidate_plan_cache();
6765                Ok(RuntimeQueryResult::ok_message(
6766                    query.to_string(),
6767                    &format!("foreign table {} created (server {})", q.name, q.server),
6768                    "create_foreign_table",
6769                ))
6770            }
6771            QueryExpr::DropForeignTable(ref q) => {
6772                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
6773                if !existed && !q.if_exists {
6774                    return Err(RedDBError::Internal(format!(
6775                        "foreign table {} does not exist",
6776                        q.name
6777                    )));
6778                }
6779                self.invalidate_plan_cache();
6780                Ok(RuntimeQueryResult::ok_message(
6781                    query.to_string(),
6782                    &format!("foreign table {} dropped", q.name),
6783                    "drop_foreign_table",
6784                ))
6785            }
6786            // COPY table FROM 'path' (Phase 1.5 PG parity).
6787            //
6788            // Stream CSV rows through the shared `CsvImporter`. The collection
6789            // is auto-created on first insert (via `insert_auto`-style path);
6790            // VACUUM/ANALYZE afterwards is up to the caller.
6791            QueryExpr::CopyFrom(ref q) => {
6792                use crate::storage::import::{CsvConfig, CsvImporter};
6793                let store = self.inner.db.store();
6794                let cfg = CsvConfig {
6795                    collection: q.table.clone(),
6796                    has_header: q.has_header,
6797                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
6798                    ..CsvConfig::default()
6799                };
6800                let importer = CsvImporter::new(cfg);
6801                let stats = importer
6802                    .import_file(&q.path, store.as_ref())
6803                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
6804                // Tables are written → invalidate cached plans / result cache.
6805                self.note_table_write(&q.table);
6806                Ok(RuntimeQueryResult::ok_message(
6807                    query.to_string(),
6808                    &format!(
6809                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
6810                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
6811                    ),
6812                    "copy_from",
6813                ))
6814            }
6815            // Maintenance commands (Phase 1.2 PG parity).
6816            //
6817            // - VACUUM [FULL] [table]: refreshes planner stats for the target
6818            //   collection(s) and — when FULL — triggers a full pager persist
6819            //   (flushes dirty pages + fsync). Also invalidates the result cache
6820            //   so subsequent reads re-execute against the freshly compacted
6821            //   storage. RedDB's segment/btree GC runs continuously via the
6822            //   background lifecycle; explicit space reclamation for sealed
6823            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
6824            // - ANALYZE [table]: reruns `analyze_collection` +
6825            //   `persist_table_stats` via `refresh_table_planner_stats` so the
6826            //   planner has fresh histograms, distinct estimates, null counts.
6827            //
6828            // Both commands accept an optional target; omitting the target
6829            // iterates every collection in the store.
6830            QueryExpr::MaintenanceCommand(ref cmd) => {
6831                use crate::storage::query::ast::MaintenanceCommand as Mc;
6832                let store = self.inner.db.store();
6833                let (kind, msg) = match cmd {
6834                    Mc::Analyze { target } => {
6835                        let targets: Vec<String> = match target {
6836                            Some(t) => vec![t.clone()],
6837                            None => store.list_collections(),
6838                        };
6839                        for t in &targets {
6840                            self.refresh_table_planner_stats(t);
6841                        }
6842                        (
6843                            "analyze",
6844                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
6845                        )
6846                    }
6847                    Mc::Vacuum { target, full } => {
6848                        let targets: Vec<String> = match target {
6849                            Some(t) => vec![t.clone()],
6850                            None => store.list_collections(),
6851                        };
6852                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
6853                        let mut vacuum_stats =
6854                            crate::storage::unified::store::MvccVacuumStats::default();
6855                        for t in &targets {
6856                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
6857                                RedDBError::Internal(format!(
6858                                    "VACUUM MVCC history failed for {t}: {e}"
6859                                ))
6860                            })?;
6861                            if stats.reclaimed_versions > 0 {
6862                                self.rebuild_runtime_indexes_for_table(t)?;
6863                            }
6864                            vacuum_stats.add(&stats);
6865                        }
6866                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
6867                        // Stats refresh covers every target (same as ANALYZE).
6868                        for t in &targets {
6869                            self.refresh_table_planner_stats(t);
6870                        }
6871                        // FULL forces a pager persist (dirty-page flush + fsync).
6872                        // Regular VACUUM relies on the background writer / segment
6873                        // lifecycle so the command is non-blocking.
6874                        let persisted = if *full {
6875                            match store.persist() {
6876                                Ok(()) => true,
6877                                Err(e) => {
6878                                    return Err(RedDBError::Internal(format!(
6879                                        "VACUUM FULL persist failed: {e:?}"
6880                                    )));
6881                                }
6882                            }
6883                        } else {
6884                            false
6885                        };
6886                        // Result cache depended on pre-vacuum state.
6887                        self.invalidate_result_cache();
6888                        (
6889                            "vacuum",
6890                            format!(
6891                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
6892                                if *full { " FULL" } else { "" },
6893                                targets.len(),
6894                                vacuum_stats.scanned_versions,
6895                                vacuum_stats.retained_versions,
6896                                vacuum_stats.reclaimed_versions,
6897                                vacuum_stats.retained_history_versions,
6898                                vacuum_stats.reclaimed_history_versions,
6899                                vacuum_stats.retained_tombstones,
6900                                vacuum_stats.reclaimed_tombstones,
6901                                if persisted {
6902                                    " (pages flushed to disk)"
6903                                } else {
6904                                    ""
6905                                }
6906                            ),
6907                        )
6908                    }
6909                };
6910                Ok(RuntimeQueryResult::ok_message(
6911                    query.to_string(),
6912                    &msg,
6913                    kind,
6914                ))
6915            }
6916            // GRANT / REVOKE / ALTER USER (RBAC milestone).
6917            //
6918            // These hit the AuthStore directly. The privilege-check
6919            // gate at the top of `execute_query_expr` already decided
6920            // whether the caller may even run the statement; here we
6921            // just translate the AST into AuthStore calls.
6922            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
6923            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
6924            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
6925            QueryExpr::CreateIamPolicy { ref id, ref json } => {
6926                self.execute_create_iam_policy(query, id, json)
6927            }
6928            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
6929            QueryExpr::AttachPolicy {
6930                ref policy_id,
6931                ref principal,
6932            } => self.execute_attach_policy(query, policy_id, principal),
6933            QueryExpr::DetachPolicy {
6934                ref policy_id,
6935                ref principal,
6936            } => self.execute_detach_policy(query, policy_id, principal),
6937            QueryExpr::ShowPolicies { ref filter } => {
6938                self.execute_show_policies(query, filter.as_ref())
6939            }
6940            QueryExpr::ShowEffectivePermissions {
6941                ref user,
6942                ref resource,
6943            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
6944            QueryExpr::SimulatePolicy {
6945                ref user,
6946                ref action,
6947                ref resource,
6948            } => self.execute_simulate_policy(query, user, action, resource),
6949            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
6950            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
6951            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
6952            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
6953        };
6954
6955        if !control_event_specs.is_empty() {
6956            let (outcome, reason) = match &query_result {
6957                Ok(_) => (crate::runtime::control_events::Outcome::Allowed, None),
6958                Err(err) => (control_event_outcome_for_error(err), Some(err.to_string())),
6959            };
6960            for spec in &control_event_specs {
6961                self.emit_control_event(
6962                    spec.kind,
6963                    outcome,
6964                    spec.action,
6965                    spec.resource.clone(),
6966                    reason.clone(),
6967                    spec.fields.clone(),
6968                )?;
6969            }
6970        }
6971
6972        if let (Some(plan), Ok(result)) = (&query_audit_plan, &query_result) {
6973            self.emit_query_audit(
6974                query,
6975                plan,
6976                query_audit_started.elapsed().as_millis() as u64,
6977                result,
6978            );
6979        }
6980
6981        // Decrypt Value::Secret columns in-place before caching, so
6982        // cached results match the post-decrypt shape and repeat
6983        // queries skip the per-row AES-GCM pass.
6984        let mut query_result = query_result;
6985        if let Ok(ref mut result) = query_result {
6986            if result.statement_type == "select" {
6987                self.apply_secret_decryption(result);
6988            }
6989        }
6990
6991        // Cache SELECT results for 30s.
6992        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
6993        // Large multi-row results (range scans, filtered scans) are rarely
6994        // repeated with the same literal values so the cache hit rate is near
6995        // zero while the clone cost (100 records × ~16 fields each) is high.
6996        // Aggregations (1 row) and point lookups (1 row) still benefit.
6997        if let Ok(ref result) = query_result {
6998            frame.write_result_cache(self, result, result_cache_scopes);
6999        }
7000
7001        query_result
7002    }
7003
7004    /// Snapshot of every registered materialized view's runtime
7005    /// state — feeds the `red.materialized_views` virtual table.
7006    /// Issue #583 slice 10.
7007    pub fn materialized_view_metadata(
7008        &self,
7009    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
7010        // Issue #595 slice 9c — `current_row_count` is now scraped
7011        // live from the backing collection rather than read from the
7012        // cache slot. Mirrors the slice-10 invariant on
7013        // `queue_pending_gauge` in #527: the live store is the source
7014        // of truth, the cache slot only carries last-refresh telemetry
7015        // (timing, error, refresh cadence).
7016        let store = self.inner.db.store();
7017        let mut entries = self.inner.materialized_views.read().metadata();
7018        for entry in &mut entries {
7019            if let Some(manager) = store.get_collection(&entry.name) {
7020                entry.current_row_count = manager.count() as u64;
7021            }
7022        }
7023        entries
7024    }
7025
7026    /// Drive scheduled refreshes for materialized views with a
7027    /// `REFRESH EVERY <duration>` clause. Called from the background
7028    /// scheduler thread (and from unit tests with a fake clock via
7029    /// `claim_due_at`). Each invocation atomically claims the set of
7030    /// due views (so two concurrent ticks never double-fire the same
7031    /// view) and runs each refresh through the standard execution
7032    /// path — failures are captured in `last_error` and the prior
7033    /// content stays intact. Issue #583 slice 10.
7034    /// Snapshot of every tracked retention sweeper state — feeds the
7035    /// three extra columns on `red.retention`. Issue #584 slice 12.
7036    pub(crate) fn retention_sweeper_snapshot(
7037        &self,
7038    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
7039        self.inner.retention_sweeper.read().snapshot()
7040    }
7041
7042    /// Drive one tick of the retention sweeper. Iterates collections
7043    /// with a retention policy set, physically deletes at most
7044    /// `batch_size` expired rows per collection, and records the
7045    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
7046    /// `red.retention` exposes. Called from the background sweeper
7047    /// thread; safe to invoke directly from tests with a small batch
7048    /// size to drain rows deterministically. Issue #584 slice 12.
7049    ///
7050    /// Deletes are issued as `DELETE FROM <collection> WHERE
7051    /// <ts_column> < <cutoff>` through the standard `execute_query`
7052    /// chokepoint so WAL participation and snapshot guards apply
7053    /// exactly as for a user-issued DELETE — replicas replay the
7054    /// sweeper's deletes via the same WAL stream with no special
7055    /// handling on the replication side.
7056    ///
7057    /// Batching is enforced by tightening the cutoff: if more than
7058    /// `batch_size` rows are expired, the cutoff is dropped to the
7059    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
7060    /// matches roughly `batch_size` rows; the remainder is reported
7061    /// as `current_rows_pending_sweep_estimate` and drained on the
7062    /// next tick.
7063    pub fn sweep_retention_tick(&self, batch_size: usize) {
7064        if batch_size == 0 {
7065            return;
7066        }
7067        let now_ms = std::time::SystemTime::now()
7068            .duration_since(std::time::UNIX_EPOCH)
7069            .map(|d| d.as_millis() as u64)
7070            .unwrap_or(0);
7071
7072        let store = self.inner.db.store();
7073        let collections = store.list_collections();
7074        for name in collections {
7075            let Some(contract) = self.inner.db.collection_contract(&name) else {
7076                continue;
7077            };
7078            let Some(retention_ms) = contract.retention_duration_ms else {
7079                continue;
7080            };
7081            let Some(ts_column) =
7082                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
7083            else {
7084                continue;
7085            };
7086            let Some(manager) = store.get_collection(&name) else {
7087                continue;
7088            };
7089            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
7090
7091            // Single pass: collect expired timestamps. We keep the
7092            // full Vec rather than a bounded heap because the partial
7093            // sort below is the simplest correct way to find the
7094            // batch-th oldest; for the slice's "1000-row default
7095            // batch" target this is bounded enough for production
7096            // operation, and the alternative (in-place heap of size
7097            // batch+1) is a follow-up optimisation.
7098            let mut expired_ts: Vec<i64> = Vec::new();
7099            manager.for_each_entity(|entity| {
7100                let ts = match ts_column.as_str() {
7101                    "created_at" => Some(entity.created_at as i64),
7102                    "updated_at" => Some(entity.updated_at as i64),
7103                    other => entity
7104                        .data
7105                        .as_row()
7106                        .and_then(|row| row.get_field(other))
7107                        .and_then(|v| match v {
7108                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
7109                            crate::storage::schema::Value::Timestamp(t) => {
7110                                Some(t.saturating_mul(1_000))
7111                            }
7112                            crate::storage::schema::Value::BigInt(t) => Some(*t),
7113                            crate::storage::schema::Value::UnsignedInteger(t) => {
7114                                i64::try_from(*t).ok()
7115                            }
7116                            crate::storage::schema::Value::Integer(t) => Some(*t),
7117                            _ => None,
7118                        }),
7119                };
7120                if let Some(t) = ts {
7121                    if t < cutoff {
7122                        expired_ts.push(t);
7123                    }
7124                }
7125                true
7126            });
7127
7128            let total_expired = expired_ts.len() as u64;
7129            if total_expired == 0 {
7130                self.inner
7131                    .retention_sweeper
7132                    .write()
7133                    .record_tick(&name, 0, 0, now_ms);
7134                continue;
7135            }
7136
7137            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
7138                (cutoff, 0u64)
7139            } else {
7140                // Tighten the cutoff to the (batch_size)-th oldest
7141                // expired timestamp + 1 so DELETE matches roughly
7142                // `batch_size` rows.
7143                expired_ts.sort_unstable();
7144                let nth = expired_ts[batch_size - 1];
7145                (
7146                    nth.saturating_add(1),
7147                    total_expired.saturating_sub(batch_size as u64),
7148                )
7149            };
7150
7151            let stmt = format!(
7152                "DELETE FROM {} WHERE {} < {}",
7153                name, ts_column, effective_cutoff
7154            );
7155            let deleted = match self.execute_query(&stmt) {
7156                Ok(r) => r.affected_rows,
7157                Err(_) => 0,
7158            };
7159
7160            self.inner
7161                .retention_sweeper
7162                .write()
7163                .record_tick(&name, deleted, pending, now_ms);
7164        }
7165    }
7166
7167    pub fn refresh_due_materialized_views(&self) {
7168        let due = {
7169            let mut cache = self.inner.materialized_views.write();
7170            cache.claim_due_at(std::time::Instant::now())
7171        };
7172        for name in due {
7173            // Round-trip through `execute_query` (rather than the
7174            // prepared-statement `execute_query_expr` fast path, which
7175            // explicitly rejects DDL/maintenance statements). Failures
7176            // are captured inside the RefreshMaterializedView handler
7177            // via `record_refresh_failure`; the scheduler ignores the
7178            // Result so one bad view doesn't halt the loop.
7179            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
7180            let _ = self.execute_query(&stmt);
7181        }
7182    }
7183
7184    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
7185    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
7186    /// calls pay zero parse + cache overhead.
7187    ///
7188    /// Applies secret decryption on SELECT results, identical to `execute_query`.
7189    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
7190        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
7191        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
7192        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
7193        // whose `tq.table` matches a registered view with the view's
7194        // underlying query. Safe to call even when no views are registered.
7195        let expr = self.rewrite_view_refs(expr);
7196
7197        self.validate_model_operations_before_auth(&expr)?;
7198        // Granular RBAC privilege check. Runs before dispatch so a
7199        // denied caller never reaches storage. Fail-closed: any error
7200        // resolving the action / resource produces PermissionDenied.
7201        if let Err(err) = self.check_query_privilege(&expr) {
7202            return Err(RedDBError::Query(format!("permission denied: {err}")));
7203        }
7204
7205        let statement = query_expr_name(&expr);
7206        let mode = detect_mode(statement);
7207        let query_str = statement;
7208
7209        let result = self.dispatch_expr(expr, query_str, mode)?;
7210        let mut r = result;
7211        if r.statement_type == "select" {
7212            self.apply_secret_decryption(&mut r);
7213        }
7214        Ok(r)
7215    }
7216
7217    pub(super) fn validate_model_operations_before_auth(
7218        &self,
7219        expr: &QueryExpr,
7220    ) -> RedDBResult<()> {
7221        use crate::catalog::CollectionModel;
7222        use crate::runtime::ddl::polymorphic_resolver;
7223        use crate::storage::query::ast::KvCommand;
7224
7225        let system_schema_target = match expr {
7226            QueryExpr::DropTable(q) => Some(q.name.as_str()),
7227            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
7228            QueryExpr::DropVector(q) => Some(q.name.as_str()),
7229            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
7230            QueryExpr::DropKv(q) => Some(q.name.as_str()),
7231            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
7232            QueryExpr::Truncate(q) => Some(q.name.as_str()),
7233            _ => None,
7234        };
7235        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
7236            return Err(RedDBError::Query("system schema is read-only".to_string()));
7237        }
7238
7239        let expected = match expr {
7240            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
7241            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
7242            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
7243            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
7244            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
7245            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
7246            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
7247            QueryExpr::KvCommand(cmd) => {
7248                let (collection, model) = match cmd {
7249                    KvCommand::Put {
7250                        collection, model, ..
7251                    }
7252                    | KvCommand::Get {
7253                        collection, model, ..
7254                    }
7255                    | KvCommand::Incr {
7256                        collection, model, ..
7257                    }
7258                    | KvCommand::Cas {
7259                        collection, model, ..
7260                    }
7261                    | KvCommand::Delete {
7262                        collection, model, ..
7263                    } => (collection.as_str(), *model),
7264                    KvCommand::Rotate { collection, .. }
7265                    | KvCommand::History { collection, .. }
7266                    | KvCommand::List { collection, .. }
7267                    | KvCommand::Purge { collection, .. } => {
7268                        (collection.as_str(), CollectionModel::Vault)
7269                    }
7270                    KvCommand::InvalidateTags { collection, .. } => {
7271                        (collection.as_str(), CollectionModel::Kv)
7272                    }
7273                    KvCommand::Watch {
7274                        collection, model, ..
7275                    } => (collection.as_str(), *model),
7276                    KvCommand::Unseal { collection, .. } => {
7277                        (collection.as_str(), CollectionModel::Vault)
7278                    }
7279                };
7280                Some((collection, model))
7281            }
7282            QueryExpr::ConfigCommand(cmd) => {
7283                self.validate_config_command_before_auth(cmd)?;
7284                None
7285            }
7286            _ => None,
7287        };
7288
7289        let Some((name, expected_model)) = expected else {
7290            return Ok(());
7291        };
7292        let snapshot = self.inner.db.catalog_model_snapshot();
7293        let Some(actual_model) = snapshot
7294            .collections
7295            .iter()
7296            .find(|collection| collection.name == name)
7297            .map(|collection| collection.declared_model.unwrap_or(collection.model))
7298        else {
7299            return Ok(());
7300        };
7301        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
7302    }
7303
7304    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
7305    /// `tq.table` matches a registered view name with the view's stored
7306    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
7307    /// resolves correctly. Pure operation — no side effects.
7308    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
7309        // Fast path: no views registered → return original expression.
7310        if self.inner.views.read().is_empty() {
7311            return expr;
7312        }
7313        self.rewrite_view_refs_inner(expr)
7314    }
7315
7316    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
7317        use crate::storage::query::ast::{Filter, TableSource};
7318        match expr {
7319            QueryExpr::Table(mut tq) => {
7320                // 1. If the TableSource is a subquery, recurse into it so
7321                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
7322                //    The legacy `table` field (set to a synthetic
7323                //    "__subq_NNNN" sentinel) stays as-is so callers that
7324                //    read it keep compiling.
7325                if let Some(TableSource::Subquery(body)) = tq.source.take() {
7326                    tq.source = Some(TableSource::Subquery(Box::new(
7327                        self.rewrite_view_refs_inner(*body),
7328                    )));
7329                    return QueryExpr::Table(tq);
7330                }
7331
7332                // 2. Restore the source field (took it above for match).
7333                // When the source was `None` or `TableSource::Name(_)`, the
7334                // real lookup key is `tq.table` — check the view registry.
7335                let maybe_view = {
7336                    let views = self.inner.views.read();
7337                    views.get(&tq.table).cloned()
7338                };
7339                let Some(view) = maybe_view else {
7340                    return QueryExpr::Table(tq);
7341                };
7342
7343                // Issue #594 slice 9b — materialized views are read
7344                // from their backing collection, not by substituting
7345                // the body. Returning the TableQuery as-is lets the
7346                // normal table-read path resolve `SELECT FROM v`
7347                // against the collection provisioned at CREATE time.
7348                if view.materialized {
7349                    return QueryExpr::Table(tq);
7350                }
7351
7352                // Recurse into the view body — views may reference other
7353                // views. The recursion yields the final QueryExpr we need
7354                // to merge the outer's filter / limit / offset into.
7355                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
7356
7357                // Phase 5: when the body is a Table we merge the outer
7358                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
7359                // views filter recursively. Non-table bodies (Search,
7360                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
7361                // with an outer Table query today — return the body
7362                // verbatim; outer predicates are lost. Full projection
7363                // merge lands in Phase 5.2.
7364                match inner_expr {
7365                    QueryExpr::Table(mut inner_tq) => {
7366                        if let Some(outer_filter) = tq.filter.take() {
7367                            inner_tq.filter = Some(match inner_tq.filter.take() {
7368                                Some(existing) => {
7369                                    Filter::And(Box::new(existing), Box::new(outer_filter))
7370                                }
7371                                None => outer_filter,
7372                            });
7373                            // Keep the `Expr` form in lock-step with the
7374                            // merged `Filter`. The executor prefers
7375                            // `where_expr` and nulls `filter` when it is
7376                            // present (see `execute_query_inner`), so a
7377                            // stacked view whose outer predicate was only
7378                            // merged into `filter` would silently drop that
7379                            // predicate at eval time (#635).
7380                            inner_tq.where_expr = inner_tq
7381                                .filter
7382                                .as_ref()
7383                                .map(crate::storage::query::sql_lowering::filter_to_expr);
7384                        }
7385                        if let Some(outer_limit) = tq.limit {
7386                            inner_tq.limit = Some(match inner_tq.limit {
7387                                Some(existing) => existing.min(outer_limit),
7388                                None => outer_limit,
7389                            });
7390                        }
7391                        if let Some(outer_offset) = tq.offset {
7392                            inner_tq.offset = Some(match inner_tq.offset {
7393                                Some(existing) => existing + outer_offset,
7394                                None => outer_offset,
7395                            });
7396                        }
7397                        QueryExpr::Table(inner_tq)
7398                    }
7399                    other => other,
7400                }
7401            }
7402            QueryExpr::Join(mut jq) => {
7403                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
7404                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
7405                QueryExpr::Join(jq)
7406            }
7407            // Other variants don't carry nested QueryExpr that can reference
7408            // a view by table name. Return as-is.
7409            other => other,
7410        }
7411    }
7412
7413    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
7414    /// Shared by `execute_query` (after parse/cache) and `execute_query_expr`
7415    /// (direct call from prepared-statement handler).
7416    fn authorize_relational_table_select(
7417        &self,
7418        mut table: TableQuery,
7419        frame: &dyn super::statement_frame::ReadFrame,
7420    ) -> RedDBResult<Option<TableQuery>> {
7421        if let Some(TableSource::Subquery(inner)) = table.source.take() {
7422            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
7423            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
7424            return Ok(Some(table));
7425        }
7426
7427        self.check_table_column_projection_authz(&table, frame)?;
7428
7429        if self.inner.rls_enabled_tables.read().contains(&table.table) {
7430            return Ok(inject_rls_filters(self, frame, table));
7431        }
7432
7433        Ok(Some(table))
7434    }
7435
7436    fn authorize_relational_join_select(
7437        &self,
7438        mut join: JoinQuery,
7439        frame: &dyn super::statement_frame::ReadFrame,
7440    ) -> RedDBResult<Option<JoinQuery>> {
7441        self.check_join_column_projection_authz(&join, frame)?;
7442        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
7443        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
7444        Ok(inject_rls_into_join(self, frame, join))
7445    }
7446
7447    fn authorize_relational_join_child(
7448        &self,
7449        expr: QueryExpr,
7450        frame: &dyn super::statement_frame::ReadFrame,
7451    ) -> RedDBResult<QueryExpr> {
7452        match expr {
7453            QueryExpr::Table(mut table) => {
7454                if let Some(TableSource::Subquery(inner)) = table.source.take() {
7455                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
7456                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
7457                }
7458                Ok(QueryExpr::Table(table))
7459            }
7460            QueryExpr::Join(join) => self
7461                .authorize_relational_join_select(join, frame)?
7462                .map(QueryExpr::Join)
7463                .ok_or_else(|| {
7464                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
7465                }),
7466            other => Ok(other),
7467        }
7468    }
7469
7470    fn authorize_relational_select_expr(
7471        &self,
7472        expr: QueryExpr,
7473        frame: &dyn super::statement_frame::ReadFrame,
7474    ) -> RedDBResult<QueryExpr> {
7475        match expr {
7476            QueryExpr::Table(table) => self
7477                .authorize_relational_table_select(table, frame)?
7478                .map(QueryExpr::Table)
7479                .ok_or_else(|| {
7480                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
7481                }),
7482            QueryExpr::Join(join) => self
7483                .authorize_relational_join_select(join, frame)?
7484                .map(QueryExpr::Join)
7485                .ok_or_else(|| {
7486                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
7487                }),
7488            other => Ok(other),
7489        }
7490    }
7491
7492    fn check_table_column_projection_authz(
7493        &self,
7494        table: &TableQuery,
7495        frame: &dyn super::statement_frame::ReadFrame,
7496    ) -> RedDBResult<()> {
7497        let Some((username, role)) = frame.identity() else {
7498            return Ok(());
7499        };
7500        let Some(auth_store) = self.inner.auth_store.read().clone() else {
7501            return Ok(());
7502        };
7503
7504        let columns = self.resolved_table_projection_columns(table)?;
7505        let request = ColumnAccessRequest::select(table.table.clone(), columns);
7506        let principal = UserId::from_parts(frame.effective_scope(), username);
7507        let ctx = runtime_iam_context(
7508            role,
7509            frame.effective_scope(),
7510            auth_store.principal_is_system_owned(&principal),
7511        );
7512        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
7513        if outcome.allowed() {
7514            return Ok(());
7515        }
7516
7517        if let Some(denied) = outcome.first_denied_column() {
7518            return Err(RedDBError::Query(format!(
7519                "permission denied: principal=`{username}` cannot select column `{}`",
7520                denied.resource.name
7521            )));
7522        }
7523        Err(RedDBError::Query(format!(
7524            "permission denied: principal=`{username}` cannot select table `{}`",
7525            table.table
7526        )))
7527    }
7528
7529    fn check_join_column_projection_authz(
7530        &self,
7531        join: &JoinQuery,
7532        frame: &dyn super::statement_frame::ReadFrame,
7533    ) -> RedDBResult<()> {
7534        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
7535        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
7536        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
7537
7538        for (table, columns) in by_table {
7539            let query = TableQuery {
7540                table,
7541                source: None,
7542                alias: None,
7543                select_items: Vec::new(),
7544                columns: columns.into_iter().map(Projection::Column).collect(),
7545                where_expr: None,
7546                filter: None,
7547                group_by_exprs: Vec::new(),
7548                group_by: Vec::new(),
7549                having_expr: None,
7550                having: None,
7551                order_by: Vec::new(),
7552                limit: None,
7553                limit_param: None,
7554                offset: None,
7555                offset_param: None,
7556                expand: None,
7557                as_of: None,
7558                sessionize: None,
7559            };
7560            self.check_table_column_projection_authz(&query, frame)?;
7561        }
7562        Ok(())
7563    }
7564
7565    fn collect_join_projection_columns(
7566        &self,
7567        join: &JoinQuery,
7568        projections: &[Projection],
7569        out: &mut HashMap<String, BTreeSet<String>>,
7570    ) -> RedDBResult<()> {
7571        let left = table_side_context(join.left.as_ref());
7572        let right = table_side_context(join.right.as_ref());
7573
7574        if projections
7575            .iter()
7576            .any(|projection| matches!(projection, Projection::All))
7577        {
7578            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
7579                out.entry(side.table.clone())
7580                    .or_default()
7581                    .extend(self.table_all_projection_columns(&side.table)?);
7582            }
7583            return Ok(());
7584        }
7585
7586        for projection in projections {
7587            collect_projection_columns_for_join_side(
7588                projection,
7589                left.as_ref(),
7590                right.as_ref(),
7591                out,
7592            )?;
7593        }
7594        Ok(())
7595    }
7596
7597    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
7598        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
7599        if projections
7600            .iter()
7601            .any(|projection| matches!(projection, Projection::All))
7602        {
7603            return self.table_all_projection_columns(&table.table);
7604        }
7605
7606        let mut columns = BTreeSet::new();
7607        for projection in &projections {
7608            collect_projection_columns_for_table(
7609                projection,
7610                &table.table,
7611                table.alias.as_deref(),
7612                &mut columns,
7613            );
7614        }
7615        Ok(columns.into_iter().collect())
7616    }
7617
7618    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
7619        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
7620            let columns: Vec<String> = contract
7621                .declared_columns
7622                .iter()
7623                .map(|column| column.name.clone())
7624                .collect();
7625            if !columns.is_empty() {
7626                return Ok(columns);
7627            }
7628        }
7629
7630        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
7631        Ok(records
7632            .first()
7633            .map(|record| {
7634                record
7635                    .column_names()
7636                    .into_iter()
7637                    .map(|column| column.to_string())
7638                    .collect()
7639            })
7640            .unwrap_or_default())
7641    }
7642
7643    fn resolve_table_expr_subqueries(
7644        &self,
7645        mut table: TableQuery,
7646        frame: &dyn super::statement_frame::ReadFrame,
7647    ) -> RedDBResult<TableQuery> {
7648        if let Some(TableSource::Subquery(inner)) = table.source.take() {
7649            let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
7650            table.source = Some(TableSource::Subquery(Box::new(inner)));
7651        }
7652
7653        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
7654        for item in &mut table.select_items {
7655            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
7656                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
7657            }
7658        }
7659        if let Some(where_expr) = table.where_expr.take() {
7660            table.where_expr =
7661                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
7662            table.filter = None;
7663        }
7664        if let Some(having_expr) = table.having_expr.take() {
7665            table.having_expr =
7666                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
7667            table.having = None;
7668        }
7669        for expr in &mut table.group_by_exprs {
7670            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
7671        }
7672        for clause in &mut table.order_by {
7673            if let Some(expr) = clause.expr.take() {
7674                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
7675            }
7676        }
7677        Ok(table)
7678    }
7679
7680    fn resolve_select_expr_subqueries(
7681        &self,
7682        expr: QueryExpr,
7683        frame: &dyn super::statement_frame::ReadFrame,
7684    ) -> RedDBResult<QueryExpr> {
7685        match expr {
7686            QueryExpr::Table(table) => self
7687                .resolve_table_expr_subqueries(table, frame)
7688                .map(QueryExpr::Table),
7689            QueryExpr::Join(mut join) => {
7690                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
7691                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
7692                Ok(QueryExpr::Join(join))
7693            }
7694            other => Ok(other),
7695        }
7696    }
7697
7698    fn resolve_expr_subqueries(
7699        &self,
7700        expr: crate::storage::query::ast::Expr,
7701        outer_scopes: &[String],
7702        frame: &dyn super::statement_frame::ReadFrame,
7703    ) -> RedDBResult<crate::storage::query::ast::Expr> {
7704        use crate::storage::query::ast::Expr;
7705
7706        match expr {
7707            Expr::Subquery { query, span } => {
7708                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
7709                if values.len() > 1 {
7710                    return Err(RedDBError::Query(
7711                        "scalar subquery returned more than one row".to_string(),
7712                    ));
7713                }
7714                Ok(Expr::Literal {
7715                    value: values.into_iter().next().unwrap_or(Value::Null),
7716                    span,
7717                })
7718            }
7719            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
7720                op,
7721                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
7722                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
7723                span,
7724            }),
7725            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
7726                op,
7727                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
7728                span,
7729            }),
7730            Expr::Cast {
7731                inner,
7732                target,
7733                span,
7734            } => Ok(Expr::Cast {
7735                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
7736                target,
7737                span,
7738            }),
7739            Expr::FunctionCall { name, args, span } => {
7740                let args = args
7741                    .into_iter()
7742                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
7743                    .collect::<RedDBResult<Vec<_>>>()?;
7744                Ok(Expr::FunctionCall { name, args, span })
7745            }
7746            Expr::Case {
7747                branches,
7748                else_,
7749                span,
7750            } => {
7751                let branches = branches
7752                    .into_iter()
7753                    .map(|(cond, value)| {
7754                        Ok((
7755                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
7756                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
7757                        ))
7758                    })
7759                    .collect::<RedDBResult<Vec<_>>>()?;
7760                let else_ = else_
7761                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
7762                    .transpose()?
7763                    .map(Box::new);
7764                Ok(Expr::Case {
7765                    branches,
7766                    else_,
7767                    span,
7768                })
7769            }
7770            Expr::IsNull {
7771                operand,
7772                negated,
7773                span,
7774            } => Ok(Expr::IsNull {
7775                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
7776                negated,
7777                span,
7778            }),
7779            Expr::InList {
7780                target,
7781                values,
7782                negated,
7783                span,
7784            } => {
7785                let target =
7786                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
7787                let mut resolved = Vec::new();
7788                for value in values {
7789                    if let Expr::Subquery { query, .. } = value {
7790                        resolved.extend(
7791                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
7792                                .into_iter()
7793                                .map(Expr::lit),
7794                        );
7795                    } else {
7796                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
7797                    }
7798                }
7799                Ok(Expr::InList {
7800                    target,
7801                    values: resolved,
7802                    negated,
7803                    span,
7804                })
7805            }
7806            Expr::Between {
7807                target,
7808                low,
7809                high,
7810                negated,
7811                span,
7812            } => Ok(Expr::Between {
7813                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
7814                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
7815                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
7816                negated,
7817                span,
7818            }),
7819            other => Ok(other),
7820        }
7821    }
7822
7823    fn execute_expr_subquery_values(
7824        &self,
7825        subquery: crate::storage::query::ast::ExprSubquery,
7826        outer_scopes: &[String],
7827        frame: &dyn super::statement_frame::ReadFrame,
7828    ) -> RedDBResult<Vec<Value>> {
7829        let query = *subquery.query;
7830        if query_references_outer_scope(&query, outer_scopes) {
7831            return Err(RedDBError::Query(
7832                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
7833            ));
7834        }
7835        let query = self.rewrite_view_refs(query);
7836        let query = self.resolve_select_expr_subqueries(query, frame)?;
7837        let query = self.authorize_relational_select_expr(query, frame)?;
7838        let result = match query {
7839            QueryExpr::Table(table) => {
7840                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
7841            }
7842            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
7843            other => {
7844                return Err(RedDBError::Query(format!(
7845                    "expression subquery must be a SELECT query, got {}",
7846                    query_expr_name(&other)
7847                )))
7848            }
7849        };
7850        first_column_values(result)
7851    }
7852
    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
    ///
    /// `query_str` and `mode` are echoed into the returned
    /// `RuntimeQueryResult`; `statement` is derived from the expression via
    /// `query_expr_name`.
    ///
    /// Routing, in match order:
    /// * `Graph` / `Path` — rejected; not usable as prepared statements.
    /// * `Table` — expression subqueries are resolved first; virtual tables
    ///   are served by `red_schema::red_query`; other tables are authorized
    ///   and then executed. If authorization yields `None`, an empty result
    ///   tagged `runtime-table-rls` is returned.
    /// * `Join` — authorized and then executed; a `None` authorization yields
    ///   an empty result tagged `runtime-join-rls`.
    /// * `Vector` / `Hybrid` — executed directly.
    /// * `Insert` / `Update` / `Delete` — rejected with `READ_ONLY_ERROR` on
    ///   virtual tables, otherwise run inside
    ///   `with_deferred_store_wal_for_dml`.
    /// * `SearchCommand` / `Ask` — forwarded to their executors.
    /// * Anything else — rejected as unsupported for prepared statements.
    fn dispatch_expr(
        &self,
        expr: QueryExpr,
        query_str: &str,
        mode: QueryMode,
    ) -> RedDBResult<RuntimeQueryResult> {
        // Capture the statement name before `expr` is moved into the match.
        let statement = query_expr_name(&expr);
        match expr {
            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
                // Graph queries are not cacheable as prepared statements.
                Err(RedDBError::Query(
                    "graph queries cannot be used as prepared statements".to_string(),
                ))
            }
            QueryExpr::Table(table) => {
                let scope = self.ai_scope();
                let table = self.resolve_table_expr_subqueries(
                    table,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?;
                // Virtual tables are answered by `red_schema`; this branch
                // returns before the relational authorization below.
                if super::red_schema::is_virtual_table(&table.table) {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-red-schema",
                        result: super::red_schema::red_query(
                            self,
                            &table.table,
                            &table,
                            &scope as &dyn super::statement_frame::ReadFrame,
                        )?,
                        affected_rows: 0,
                        statement_type: "select",
                    });
                }
                // `None` from authorization is answered with an empty result
                // rather than an error.
                let Some(table_with_rls) = self.authorize_relational_table_select(
                    table,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?
                else {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-table-rls",
                        result: crate::storage::query::unified::UnifiedResult::empty(),
                        affected_rows: 0,
                        statement_type: "select",
                    });
                };
                Ok(RuntimeQueryResult {
                    query: query_str.to_string(),
                    mode,
                    statement,
                    engine: "runtime-table",
                    result: execute_runtime_table_query(
                        &self.inner.db,
                        &table_with_rls,
                        Some(&self.inner.index_store),
                    )?,
                    affected_rows: 0,
                    statement_type: "select",
                })
            }
            QueryExpr::Join(join) => {
                let scope = self.ai_scope();
                // Same contract as the table arm: `None` means an empty result.
                let Some(join_with_rls) = self.authorize_relational_join_select(
                    join,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?
                else {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-join-rls",
                        result: crate::storage::query::unified::UnifiedResult::empty(),
                        affected_rows: 0,
                        statement_type: "select",
                    });
                };
                Ok(RuntimeQueryResult {
                    query: query_str.to_string(),
                    mode,
                    statement,
                    engine: "runtime-join",
                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
                    affected_rows: 0,
                    statement_type: "select",
                })
            }
            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
                query: query_str.to_string(),
                mode,
                statement,
                engine: "runtime-vector",
                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
                affected_rows: 0,
                statement_type: "select",
            }),
            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
                query: query_str.to_string(),
                mode,
                statement,
                engine: "runtime-hybrid",
                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
                affected_rows: 0,
                statement_type: "select",
            }),
            // The guarded DML arms must stay ahead of the unguarded ones below
            // so writes to virtual tables are rejected first.
            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
                Err(RedDBError::Query(
                    super::red_schema::READ_ONLY_ERROR.to_string(),
                ))
            }
            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
                Err(RedDBError::Query(
                    super::red_schema::READ_ONLY_ERROR.to_string(),
                ))
            }
            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
                Err(RedDBError::Query(
                    super::red_schema::READ_ONLY_ERROR.to_string(),
                ))
            }
            QueryExpr::Insert(ref insert) => self
                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
                    self.execute_insert(query_str, insert)
                }),
            QueryExpr::Update(ref update) => self
                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
                    self.execute_update(query_str, update)
                }),
            QueryExpr::Delete(ref delete) => self
                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
                    self.execute_delete(query_str, delete)
                }),
            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
            _ => Err(RedDBError::Query(format!(
                "prepared-statement execution does not support {statement} statements"
            ))),
        }
    }
7997
7998    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
7999    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
8000    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
8001        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
8002        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
8003        let q = query.trim();
8004        if !q.starts_with("SELECT") && !q.starts_with("select") {
8005            return None;
8006        }
8007
8008        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
8009        let where_pos = q
8010            .find("WHERE _entity_id")
8011            .or_else(|| q.find("where _entity_id"))?;
8012        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
8013        let after_eq = after_field.strip_prefix('=')?.trim_start();
8014
8015        // Parse the entity ID number
8016        let id_str = after_eq.trim();
8017        let entity_id: u64 = id_str.parse().ok()?;
8018
8019        // Extract table name: between "FROM " and " WHERE"
8020        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
8021        let table = q[from_pos..where_pos].trim();
8022        if table.is_empty()
8023            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
8024        {
8025            return None; // complex query, fall through
8026        }
8027        let table_name = table.split_whitespace().next()?;
8028
8029        // Direct entity lookup — skips SQL parse, plan cache, result
8030        // cache, view rewriter, RLS gate. Safe because the gating in
8031        // `execute_query` guarantees no scope override / no
8032        // transaction context is active. MVCC visibility is still
8033        // honoured against the current snapshot.
8034        let store = self.inner.db.store();
8035        let entity = store
8036            .get(
8037                table_name,
8038                crate::storage::unified::EntityId::new(entity_id),
8039            )
8040            .filter(entity_visible_under_current_snapshot);
8041
8042        let count = if entity.is_some() { 1u64 } else { 0 };
8043
8044        // Materialize a record so downstream consumers that walk
8045        // `result.records` (embedded runtime API, decrypt pass, CLI)
8046        // see the row. Previously only `pre_serialized_json` was
8047        // filled, which caused those consumers to see zero rows and
8048        // skewed benchmarks.
8049        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
8050            .as_ref()
8051            .and_then(|e| runtime_table_record_from_entity(e.clone()))
8052            .into_iter()
8053            .collect();
8054
8055        let json = match entity {
8056            Some(ref e) => execute_runtime_serialize_single_entity(e),
8057            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
8058                .to_string(),
8059        };
8060
8061        Some(Ok(RuntimeQueryResult {
8062            query: query.to_string(),
8063            mode: crate::storage::query::modes::QueryMode::Sql,
8064            statement: "select",
8065            engine: "fast-entity-lookup",
8066            result: crate::storage::query::unified::UnifiedResult {
8067                columns: Vec::new(),
8068                records,
8069                stats: crate::storage::query::unified::QueryStats {
8070                    rows_scanned: count,
8071                    ..Default::default()
8072                },
8073                pre_serialized_json: Some(json),
8074            },
8075            affected_rows: 0,
8076            statement_type: "select",
8077        }))
8078    }
8079
8080    fn result_cache_backend(&self) -> RuntimeResultCacheBackend {
8081        match self
8082            .config_string(RESULT_CACHE_BACKEND_KEY, RESULT_CACHE_DEFAULT_BACKEND)
8083            .as_str()
8084        {
8085            "blob_cache" => RuntimeResultCacheBackend::BlobCache,
8086            "shadow" => RuntimeResultCacheBackend::Shadow,
8087            _ => RuntimeResultCacheBackend::Legacy,
8088        }
8089    }
8090
8091    pub(super) fn get_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
8092        match self.result_cache_backend() {
8093            RuntimeResultCacheBackend::Legacy => self.get_legacy_result_cache_entry(key),
8094            RuntimeResultCacheBackend::BlobCache => self.get_blob_result_cache_entry(key),
8095            RuntimeResultCacheBackend::Shadow => {
8096                let legacy = self.get_legacy_result_cache_entry(key);
8097                let blob = self.get_blob_result_cache_entry(key);
8098                if let (Some(ref legacy), Some(ref blob)) = (&legacy, &blob) {
8099                    if result_cache_fingerprint(legacy) != result_cache_fingerprint(blob) {
8100                        self.inner
8101                            .result_cache_shadow_divergences
8102                            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
8103                        tracing::warn!(
8104                            key,
8105                            metric = crate::runtime::METRIC_CACHE_SHADOW_DIVERGENCE_TOTAL,
8106                            "result cache shadow backend diverged from legacy"
8107                        );
8108                    }
8109                }
8110                legacy
8111            }
8112        }
8113    }
8114
8115    fn get_legacy_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
8116        let cache = self.inner.result_cache.read();
8117        cache.0.get(key).and_then(|entry| {
8118            if entry.cached_at.elapsed().as_secs() < RESULT_CACHE_TTL_SECS {
8119                Some(entry.result.clone())
8120            } else {
8121                None
8122            }
8123        })
8124    }
8125
8126    fn get_blob_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
8127        let hit = self
8128            .inner
8129            .result_blob_cache
8130            .get(RESULT_CACHE_BLOB_NAMESPACE, key)?;
8131        {
8132            let cache = self.inner.result_blob_entries.read();
8133            if let Some(entry) = cache.0.get(key) {
8134                return Some(entry.result.clone());
8135            }
8136        }
8137
8138        let (result, scopes) = decode_result_cache_payload(hit.value())?;
8139        let mut cache = self.inner.result_blob_entries.write();
8140        let (ref mut map, ref mut order) = *cache;
8141        if !map.contains_key(key) {
8142            order.push_back(key.to_string());
8143        }
8144        map.insert(
8145            key.to_string(),
8146            RuntimeResultCacheEntry {
8147                result: result.clone(),
8148                cached_at: std::time::Instant::now(),
8149                scopes,
8150            },
8151        );
8152        trim_result_cache(map, order);
8153        Some(result)
8154    }
8155
8156    pub(super) fn put_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
8157        match self.result_cache_backend() {
8158            RuntimeResultCacheBackend::Legacy => self.put_legacy_result_cache_entry(key, entry),
8159            RuntimeResultCacheBackend::BlobCache => self.put_blob_result_cache_entry(key, entry),
8160            RuntimeResultCacheBackend::Shadow => {
8161                self.put_legacy_result_cache_entry(key, entry.clone());
8162                self.put_blob_result_cache_entry(key, entry);
8163            }
8164        }
8165    }
8166
8167    fn put_legacy_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
8168        let mut cache = self.inner.result_cache.write();
8169        let (ref mut map, ref mut order) = *cache;
8170        if !map.contains_key(key) {
8171            order.push_back(key.to_string());
8172        }
8173        map.insert(key.to_string(), entry);
8174        trim_result_cache(map, order);
8175    }
8176
8177    fn put_blob_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
8178        let policy = crate::storage::cache::BlobCachePolicy::default()
8179            .ttl_ms(RESULT_CACHE_TTL_SECS * 1000)
8180            .priority(200);
8181        let dependencies = entry.scopes.iter().cloned().collect::<Vec<_>>();
8182        let bytes = encode_result_cache_payload(&entry)
8183            .unwrap_or_else(|| result_cache_fingerprint(&entry.result).into_bytes());
8184        let put = crate::storage::cache::BlobCachePut::new(bytes)
8185            .with_dependencies(dependencies)
8186            .with_policy(policy);
8187        if self
8188            .inner
8189            .result_blob_cache
8190            .put(RESULT_CACHE_BLOB_NAMESPACE, key, put)
8191            .is_err()
8192        {
8193            return;
8194        }
8195
8196        let mut cache = self.inner.result_blob_entries.write();
8197        let (ref mut map, ref mut order) = *cache;
8198        if !map.contains_key(key) {
8199            order.push_back(key.to_string());
8200        }
8201        map.insert(key.to_string(), entry);
8202        trim_result_cache(map, order);
8203    }
8204
8205    pub fn result_cache_shadow_divergences(&self) -> u64 {
8206        self.inner
8207            .result_cache_shadow_divergences
8208            .load(std::sync::atomic::Ordering::Relaxed)
8209    }
8210
8211    /// Invalidate the result cache (call after any write operation).
8212    /// Full clear — use for DDL (DROP TABLE, schema changes) or when table is unknown.
8213    pub fn invalidate_result_cache(&self) {
8214        let mut cache = self.inner.result_cache.write();
8215        cache.0.clear();
8216        cache.1.clear();
8217        let mut blob_entries = self.inner.result_blob_entries.write();
8218        blob_entries.0.clear();
8219        blob_entries.1.clear();
8220        self.inner
8221            .result_blob_cache
8222            .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
8223        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
8224        ask_entries.0.clear();
8225        ask_entries.1.clear();
8226        self.inner
8227            .result_blob_cache
8228            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
8229    }
8230
8231    /// Invalidate only result cache entries that declared a dependency on `table`.
8232    /// Cheaper than a full clear: unrelated tables keep their cached results.
8233    pub(crate) fn invalidate_result_cache_for_table(&self, table: &str) {
8234        // Hot-path probe both backends before taking write locks. The blob
8235        // backend is node-local, same as the legacy result cache.
8236        let legacy_has_match = {
8237            let cache = self.inner.result_cache.read();
8238            let (ref map, _) = *cache;
8239            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
8240        };
8241        let blob_has_match = {
8242            let cache = self.inner.result_blob_entries.read();
8243            let (ref map, _) = *cache;
8244            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
8245        };
8246        if legacy_has_match {
8247            let mut cache = self.inner.result_cache.write();
8248            let (ref mut map, ref mut order) = *cache;
8249            map.retain(|_, entry| !entry.scopes.contains(table));
8250            order.retain(|key| map.contains_key(key));
8251        }
8252
8253        if matches!(
8254            self.result_cache_backend(),
8255            RuntimeResultCacheBackend::BlobCache | RuntimeResultCacheBackend::Shadow
8256        ) {
8257            let mut blob_entries = self.inner.result_blob_entries.write();
8258            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
8259            blob_map.clear();
8260            blob_order.clear();
8261            self.inner
8262                .result_blob_cache
8263                .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
8264        } else if blob_has_match {
8265            let mut blob_entries = self.inner.result_blob_entries.write();
8266            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
8267            blob_map.retain(|_, entry| !entry.scopes.contains(table));
8268            blob_order.retain(|key| blob_map.contains_key(key));
8269        }
8270        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
8271        ask_entries.0.clear();
8272        ask_entries.1.clear();
8273        self.inner
8274            .result_blob_cache
8275            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
8276    }
8277
8278    pub(crate) fn invalidate_plan_cache(&self) {
8279        self.inner.query_cache.write().clear();
8280        self.inner
8281            .ddl_epoch
8282            .fetch_add(1, std::sync::atomic::Ordering::Release);
8283    }
8284
8285    /// Read the monotonic DDL epoch counter. Bumped by every
8286    /// `invalidate_plan_cache` call so prepared-statement holders can
8287    /// detect schema drift between PREPARE and EXECUTE.
8288    pub fn ddl_epoch(&self) -> u64 {
8289        self.inner
8290            .ddl_epoch
8291            .load(std::sync::atomic::Ordering::Acquire)
8292    }
8293
8294    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
8295        let store = self.inner.db.store();
8296        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
8297        self.invalidate_plan_cache();
8298    }
8299
8300    /// Replay `tenant_tables.*.column` keys from red_config at boot so
8301    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
8302    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
8303    /// collection, picks the keys matching the tenant-marker shape,
8304    /// and calls `register_tenant_table` for each.
8305    ///
8306    /// Safe no-op when `red_config` doesn't exist (first boot on a
8307    /// fresh datadir).
8308    pub(crate) fn rehydrate_tenant_tables(&self) {
8309        let store = self.inner.db.store();
8310        let Some(manager) = store.get_collection("red_config") else {
8311            return;
8312        };
8313        // Replay in insertion order (SegmentManager iteration). Multiple
8314        // toggles on the same table leave several rows behind — the
8315        // last one processed wins because each register/unregister
8316        // call overwrites the in-memory state.
8317        for entity in manager.query_all(|_| true) {
8318            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
8319                continue;
8320            };
8321            let Some(named) = &row.named else { continue };
8322            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
8323                continue;
8324            };
8325            // Shape: tenant_tables.{table}.column
8326            let Some(rest) = key.strip_prefix("tenant_tables.") else {
8327                continue;
8328            };
8329            let Some((table, suffix)) = rest.rsplit_once('.') else {
8330                // Issue #205 — a `tenant_tables.*` row that doesn't
8331                // split cleanly is a schema-shape regression: the
8332                // metadata writer must always emit the `.column`
8333                // suffix, so reaching this branch means an upgrade
8334                // with incompatible state or external tampering.
8335                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
8336                    collection: "red_config".to_string(),
8337                    detail: format!("malformed tenant_tables key: {key}"),
8338                }
8339                .emit_global();
8340                continue;
8341            };
8342            if suffix != "column" {
8343                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
8344                    collection: "red_config".to_string(),
8345                    detail: format!("unexpected tenant_tables suffix: {key}"),
8346                }
8347                .emit_global();
8348                continue;
8349            }
8350            match named.get("value") {
8351                Some(crate::storage::schema::Value::Text(column)) => {
8352                    self.register_tenant_table(table, column);
8353                }
8354                // Null / missing value = DISABLE TENANCY marker.
8355                Some(crate::storage::schema::Value::Null) | None => {
8356                    self.unregister_tenant_table(table);
8357                }
8358                _ => {}
8359            }
8360        }
8361    }
8362
8363    /// Replay every persisted `MaterializedViewDescriptor` from the
8364    /// `red_materialized_view_defs` system collection (issue #593
8365    /// slice 9a). For each descriptor, re-parse the original SQL,
8366    /// extract the `QueryExpr::CreateView` it produced, and populate
8367    /// the in-memory registries (`inner.views` and
8368    /// `inner.materialized_views`) directly — no write paths run, so
8369    /// rehydrate does not re-persist what it just read.
8370    ///
8371    /// Malformed rows (missing `name`/`source_sql`, parse errors) are
8372    /// skipped with a `SchemaCorruption` operator event so a single
8373    /// bad entry does not block startup.
8374    pub(crate) fn rehydrate_materialized_view_descriptors(&self) {
8375        let store = self.inner.db.store();
8376        let descriptors = crate::runtime::continuous_materialized_view::load_all(store.as_ref());
8377        for descriptor in descriptors {
8378            let parsed = match crate::storage::query::parser::parse(&descriptor.source_sql) {
8379                Ok(qc) => qc,
8380                Err(err) => {
8381                    crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
8382                        collection:
8383                            crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
8384                                .to_string(),
8385                        detail: format!(
8386                            "failed to re-parse materialized-view source for {}: {err}",
8387                            descriptor.name
8388                        ),
8389                    }
8390                    .emit_global();
8391                    continue;
8392                }
8393            };
8394            let crate::storage::query::ast::QueryExpr::CreateView(create) = parsed.query else {
8395                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
8396                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
8397                        .to_string(),
8398                    detail: format!(
8399                        "materialized-view source for {} did not re-parse as CREATE VIEW",
8400                        descriptor.name
8401                    ),
8402                }
8403                .emit_global();
8404                continue;
8405            };
8406            // Populate in-memory view registry.
8407            let view_name = create.name.clone();
8408            self.inner
8409                .views
8410                .write()
8411                .insert(view_name.clone(), Arc::new(create));
8412            // Materialized cache slot (data empty until next REFRESH).
8413            use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
8414            let refresh = match descriptor.refresh_every_ms {
8415                Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
8416                None => RefreshPolicy::Manual,
8417            };
8418            let def = MaterializedViewDef {
8419                name: view_name.clone(),
8420                query: format!("<parsed view {}>", view_name),
8421                dependencies: descriptor.source_collections.clone(),
8422                refresh,
8423                retention_duration_ms: descriptor.retention_duration_ms,
8424            };
8425            self.inner.materialized_views.write().register(def);
8426        }
8427        // A rehydrated view shape may differ from any plans the cache
8428        // bootstrapped before this method ran — flush to be safe.
8429        self.invalidate_plan_cache();
8430    }
8431
8432    pub(crate) fn rehydrate_declared_column_schemas(&self) {
8433        let store = self.inner.db.store();
8434        for contract in self.inner.db.collection_contracts() {
8435            let columns: Vec<String> = contract
8436                .declared_columns
8437                .iter()
8438                .map(|column| column.name.clone())
8439                .collect();
8440            let Some(manager) = store.get_collection(&contract.name) else {
8441                continue;
8442            };
8443            manager.set_column_schema_if_empty(columns);
8444        }
8445    }
8446
8447    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
8448    /// in-memory column mapping, the implicit RLS policy, and enables
8449    /// row-level security on the table. Idempotent — re-registering
8450    /// the same `(table, column)` replaces the prior auto-policy.
8451    pub fn register_tenant_table(&self, table: &str, column: &str) {
8452        use crate::storage::query::ast::{
8453            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
8454        };
8455        self.inner
8456            .tenant_tables
8457            .write()
8458            .insert(table.to_string(), column.to_string());
8459
8460        // Build the policy: col = CURRENT_TENANT()
8461        // Uses CompareExpr so the comparison happens at runtime against
8462        // the thread-local tenant value read by the CURRENT_TENANT
8463        // scalar. Spans are synthetic — there's no source location for
8464        // an auto-generated policy.
8465        let lhs = Expr::Column {
8466            field: FieldRef::TableColumn {
8467                table: table.to_string(),
8468                column: column.to_string(),
8469            },
8470            span: Span::synthetic(),
8471        };
8472        let rhs = Expr::FunctionCall {
8473            name: "CURRENT_TENANT".to_string(),
8474            args: Vec::new(),
8475            span: Span::synthetic(),
8476        };
8477        let policy_filter = Filter::CompareExpr {
8478            lhs,
8479            op: CompareOp::Eq,
8480            rhs,
8481        };
8482
8483        let policy = CreatePolicyQuery {
8484            name: "__tenant_iso".to_string(),
8485            table: table.to_string(),
8486            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
8487            role: None,   // None = every role
8488            using: Box::new(policy_filter),
8489            // Auto-tenancy defaults to Table targets. Collections of
8490            // other kinds (graph / vector / queue / timeseries) that
8491            // opt in via `ALTER ... ENABLE TENANCY` should use the
8492            // matching kind — but for now we keep the auto-policy
8493            // kind-agnostic so the evaluator can apply it to any
8494            // entity living in the collection.
8495            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
8496        };
8497
8498        // Replace any prior auto-policy for this table (column rename).
8499        self.inner.rls_policies.write().insert(
8500            (table.to_string(), "__tenant_iso".to_string()),
8501            Arc::new(policy),
8502        );
8503        self.inner
8504            .rls_enabled_tables
8505            .write()
8506            .insert(table.to_string());
8507
8508        // Auto-build a hash index on the tenant column. Every read/write
8509        // against a tenant-scoped table carries an implicit
8510        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
8511        // index on that column is on the hot path of every query. Without
8512        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
8513        self.ensure_tenant_index(table, column);
8514    }
8515
8516    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
8517    /// Skipped when:
8518    ///   * the column is dotted (nested path — flat secondary indices
8519    ///     don't cover those today; RLS still works via the policy)
8520    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
8521    ///   * the user already registered an index whose first column matches
8522    ///     (avoids redundant duplicates of a user-defined composite)
8523    fn ensure_tenant_index(&self, table: &str, column: &str) {
8524        if column.contains('.') {
8525            return;
8526        }
8527        let index_name = format!("__tenant_idx_{table}");
8528        let registry = self.inner.index_store.list_indices(table);
8529        if registry.iter().any(|idx| idx.name == index_name) {
8530            return;
8531        }
8532        if registry
8533            .iter()
8534            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
8535        {
8536            return;
8537        }
8538
8539        let store = self.inner.db.store();
8540        let Some(manager) = store.get_collection(table) else {
8541            return;
8542        };
8543        let entities = manager.query_all(|_| true);
8544        let entity_fields: Vec<(
8545            crate::storage::unified::EntityId,
8546            Vec<(String, crate::storage::schema::Value)>,
8547        )> = entities
8548            .iter()
8549            .map(|e| {
8550                let fields = match &e.data {
8551                    crate::storage::EntityData::Row(row) => {
8552                        if let Some(ref named) = row.named {
8553                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
8554                        } else if let Some(ref schema) = row.schema {
8555                            schema
8556                                .iter()
8557                                .zip(row.columns.iter())
8558                                .map(|(k, v)| (k.clone(), v.clone()))
8559                                .collect()
8560                        } else {
8561                            Vec::new()
8562                        }
8563                    }
8564                    crate::storage::EntityData::Node(node) => node
8565                        .properties
8566                        .iter()
8567                        .map(|(k, v)| (k.clone(), v.clone()))
8568                        .collect(),
8569                    _ => Vec::new(),
8570                };
8571                (e.id, fields)
8572            })
8573            .collect();
8574
8575        let columns = vec![column.to_string()];
8576        if self
8577            .inner
8578            .index_store
8579            .create_index(
8580                &index_name,
8581                table,
8582                &columns,
8583                super::index_store::IndexMethodKind::Hash,
8584                false,
8585                &entity_fields,
8586            )
8587            .is_err()
8588        {
8589            return;
8590        }
8591        self.inner
8592            .index_store
8593            .register(super::index_store::RegisteredIndex {
8594                name: index_name,
8595                collection: table.to_string(),
8596                columns,
8597                method: super::index_store::IndexMethodKind::Hash,
8598                unique: false,
8599            });
8600        self.invalidate_plan_cache();
8601    }
8602
8603    /// Drop the auto-generated tenant index, if one exists. Called from
8604    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
8605    fn drop_tenant_index(&self, table: &str) {
8606        let index_name = format!("__tenant_idx_{table}");
8607        self.inner.index_store.drop_index(&index_name, table);
8608    }
8609
8610    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
8611    /// Used by the INSERT auto-fill path to know which column to
8612    /// populate with `current_tenant()` when the user didn't name it.
8613    pub fn tenant_column(&self, table: &str) -> Option<String> {
8614        self.inner.tenant_tables.read().get(table).cloned()
8615    }
8616
8617    /// Remove a table's tenant registration (Phase 2.5.4). Called by
8618    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
8619    /// but leaves any user-installed explicit policies intact.
8620    pub fn unregister_tenant_table(&self, table: &str) {
8621        self.inner.tenant_tables.write().remove(table);
8622        self.inner
8623            .rls_policies
8624            .write()
8625            .remove(&(table.to_string(), "__tenant_iso".to_string()));
8626        self.drop_tenant_index(table);
8627        // Only clear RLS enablement if no other policies remain.
8628        let has_other_policies = self
8629            .inner
8630            .rls_policies
8631            .read()
8632            .keys()
8633            .any(|(t, _)| t == table);
8634        if !has_other_policies {
8635            self.inner.rls_enabled_tables.write().remove(table);
8636        }
8637    }
8638
8639    /// Record that the running transaction has marked `id` in `collection`
8640    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
8641    /// xid that was written into `xmax` — either the parent txn xid or
8642    /// the innermost savepoint sub-xid. Savepoint rollback filters by
8643    /// this xid to revive only its own tombstones.
8644    pub(crate) fn record_pending_tombstone(
8645        &self,
8646        conn_id: u64,
8647        collection: &str,
8648        id: crate::storage::unified::entity::EntityId,
8649        stamper_xid: crate::storage::transaction::snapshot::Xid,
8650        previous_xmax: crate::storage::transaction::snapshot::Xid,
8651    ) {
8652        self.inner
8653            .pending_tombstones
8654            .write()
8655            .entry(conn_id)
8656            .or_default()
8657            .push((collection.to_string(), id, stamper_xid, previous_xmax));
8658    }
8659
8660    pub(crate) fn record_pending_versioned_update(
8661        &self,
8662        conn_id: u64,
8663        collection: &str,
8664        old_id: crate::storage::unified::entity::EntityId,
8665        new_id: crate::storage::unified::entity::EntityId,
8666        stamper_xid: crate::storage::transaction::snapshot::Xid,
8667        previous_xmax: crate::storage::transaction::snapshot::Xid,
8668    ) {
8669        self.inner
8670            .pending_versioned_updates
8671            .write()
8672            .entry(conn_id)
8673            .or_default()
8674            .push((
8675                collection.to_string(),
8676                old_id,
8677                new_id,
8678                stamper_xid,
8679                previous_xmax,
8680            ));
8681    }
8682
8683    fn with_deferred_store_wal_if_transaction<T>(
8684        &self,
8685        f: impl FnOnce() -> RedDBResult<T>,
8686    ) -> RedDBResult<T> {
8687        let conn_id = current_connection_id();
8688        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
8689            return f();
8690        }
8691
8692        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8693        let result = f();
8694        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8695        match result {
8696            Ok(value) => {
8697                self.record_pending_store_wal_actions(conn_id, captured);
8698                Ok(value)
8699            }
8700            Err(err) => Err(err),
8701        }
8702    }
8703
8704    fn with_deferred_store_wal_for_dml<T>(
8705        &self,
8706        capture_autocommit_events: bool,
8707        f: impl FnOnce() -> RedDBResult<T>,
8708    ) -> RedDBResult<T> {
8709        let conn_id = current_connection_id();
8710        if self.inner.tx_contexts.read().contains_key(&conn_id) {
8711            return self.with_deferred_store_wal_if_transaction(f);
8712        }
8713        if !capture_autocommit_events {
8714            return f();
8715        }
8716
8717        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8718        let result = f();
8719        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8720        self.inner
8721            .db
8722            .store()
8723            .append_deferred_store_wal_actions(captured)
8724            .map_err(|err| RedDBError::Internal(err.to_string()))?;
8725        result
8726    }
8727
8728    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
8729        !query.suppress_events
8730            && self.collection_has_event_subscriptions_for_operation(
8731                &query.table,
8732                crate::catalog::SubscriptionOperation::Insert,
8733            )
8734    }
8735
8736    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
8737        !query.suppress_events
8738            && self.collection_has_event_subscriptions_for_operation(
8739                &query.table,
8740                crate::catalog::SubscriptionOperation::Update,
8741            )
8742    }
8743
8744    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
8745        !query.suppress_events
8746            && self.collection_has_event_subscriptions_for_operation(
8747                &query.table,
8748                crate::catalog::SubscriptionOperation::Delete,
8749            )
8750    }
8751
8752    fn collection_has_event_subscriptions_for_operation(
8753        &self,
8754        collection: &str,
8755        operation: crate::catalog::SubscriptionOperation,
8756    ) -> bool {
8757        let Some(contract) = self.db().collection_contract_arc(collection) else {
8758            return false;
8759        };
8760        contract.subscriptions.iter().any(|subscription| {
8761            subscription.enabled
8762                && (subscription.ops_filter.is_empty()
8763                    || subscription.ops_filter.contains(&operation))
8764        })
8765    }
8766
8767    fn record_pending_store_wal_actions(
8768        &self,
8769        conn_id: u64,
8770        actions: crate::storage::unified::DeferredStoreWalActions,
8771    ) {
8772        if actions.is_empty() {
8773            return;
8774        }
8775        let mut guard = self.inner.pending_store_wal_actions.write();
8776        guard.entry(conn_id).or_default().extend(actions);
8777    }
8778
8779    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
8780        let Some(actions) = self
8781            .inner
8782            .pending_store_wal_actions
8783            .write()
8784            .remove(&conn_id)
8785        else {
8786            return Ok(());
8787        };
8788        self.inner
8789            .db
8790            .store()
8791            .append_deferred_store_wal_actions(actions)
8792            .map_err(|err| RedDBError::Internal(err.to_string()))
8793    }
8794
8795    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
8796        self.inner
8797            .pending_store_wal_actions
8798            .write()
8799            .remove(&conn_id);
8800    }
8801
8802    fn xid_conflicts_with_snapshot(
8803        &self,
8804        xid: crate::storage::transaction::snapshot::Xid,
8805        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8806        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8807    ) -> bool {
8808        xid != 0
8809            && !own_xids.contains(&xid)
8810            && !self.inner.snapshot_manager.is_aborted(xid)
8811            && !self.inner.snapshot_manager.is_active(xid)
8812            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
8813    }
8814
8815    fn conflict_error(
8816        collection: &str,
8817        logical_id: crate::storage::unified::entity::EntityId,
8818        xid: crate::storage::transaction::snapshot::Xid,
8819    ) -> RedDBError {
8820        RedDBError::Query(format!(
8821            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
8822            logical_id.raw()
8823        ))
8824    }
8825
8826    fn check_logical_row_conflict(
8827        &self,
8828        collection: &str,
8829        logical_id: crate::storage::unified::entity::EntityId,
8830        excluded_ids: &[crate::storage::unified::entity::EntityId],
8831        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8832        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8833    ) -> RedDBResult<()> {
8834        let store = self.inner.db.store();
8835        let Some(manager) = store.get_collection(collection) else {
8836            return Ok(());
8837        };
8838
8839        for candidate in manager.query_all(|_| true) {
8840            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
8841                continue;
8842            }
8843            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
8844                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
8845            }
8846            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
8847                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
8848            }
8849        }
8850        Ok(())
8851    }
8852
8853    pub(crate) fn check_table_row_write_conflicts(
8854        &self,
8855        conn_id: u64,
8856        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8857        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8858    ) -> RedDBResult<()> {
8859        let versioned_updates = self
8860            .inner
8861            .pending_versioned_updates
8862            .read()
8863            .get(&conn_id)
8864            .cloned()
8865            .unwrap_or_default();
8866        let tombstones = self
8867            .inner
8868            .pending_tombstones
8869            .read()
8870            .get(&conn_id)
8871            .cloned()
8872            .unwrap_or_default();
8873
8874        let store = self.inner.db.store();
8875        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
8876            let Some(manager) = store.get_collection(&collection) else {
8877                continue;
8878            };
8879            let Some(old) = manager.get(old_id) else {
8880                continue;
8881            };
8882            let logical_id = old.logical_id();
8883            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8884                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8885            }
8886            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
8887                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
8888            }
8889            self.check_logical_row_conflict(
8890                &collection,
8891                logical_id,
8892                &[old_id, new_id],
8893                snapshot,
8894                own_xids,
8895            )?;
8896        }
8897
8898        for (collection, id, xid, previous_xmax) in tombstones {
8899            let Some(manager) = store.get_collection(&collection) else {
8900                continue;
8901            };
8902            let Some(entity) = manager.get(id) else {
8903                continue;
8904            };
8905            let logical_id = entity.logical_id();
8906            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8907                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8908            }
8909            if entity.xmax != xid
8910                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
8911            {
8912                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
8913            }
8914            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
8915        }
8916
8917        Ok(())
8918    }
8919
8920    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
8921        let versioned_updates = self
8922            .inner
8923            .pending_versioned_updates
8924            .read()
8925            .get(&conn_id)
8926            .cloned()
8927            .unwrap_or_default();
8928        let tombstones = self
8929            .inner
8930            .pending_tombstones
8931            .read()
8932            .get(&conn_id)
8933            .cloned()
8934            .unwrap_or_default();
8935
8936        let store = self.inner.db.store();
8937        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
8938            if let Some(manager) = store.get_collection(&collection) {
8939                if let Some(mut entity) = manager.get(old_id) {
8940                    entity.set_xmax(xid);
8941                    let _ = manager.update(entity);
8942                }
8943            }
8944        }
8945        for (collection, id, xid, _previous_xmax) in tombstones {
8946            if let Some(manager) = store.get_collection(&collection) {
8947                if let Some(mut entity) = manager.get(id) {
8948                    entity.set_xmax(xid);
8949                    let _ = manager.update(entity);
8950                }
8951            }
8952        }
8953    }
8954
8955    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
8956        self.inner
8957            .pending_versioned_updates
8958            .write()
8959            .remove(&conn_id);
8960    }
8961
8962    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
8963        let Some(pending) = self
8964            .inner
8965            .pending_versioned_updates
8966            .write()
8967            .remove(&conn_id)
8968        else {
8969            return;
8970        };
8971
8972        let store = self.inner.db.store();
8973        for (collection, old_id, new_id, xid, previous_xmax) in pending {
8974            if let Some(manager) = store.get_collection(&collection) {
8975                if let Some(mut old) = manager.get(old_id) {
8976                    if old.xmax == xid {
8977                        old.set_xmax(previous_xmax);
8978                        let _ = manager.update(old);
8979                    }
8980                }
8981            }
8982            let _ = store.delete_batch(&collection, &[new_id]);
8983        }
8984    }
8985
8986    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8987        let mut guard = self.inner.pending_versioned_updates.write();
8988        let Some(pending) = guard.get_mut(&conn_id) else {
8989            return 0;
8990        };
8991
8992        let store = self.inner.db.store();
8993        let mut reverted = 0usize;
8994        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
8995            if *xid < stamper_xid {
8996                return true;
8997            }
8998            if let Some(manager) = store.get_collection(collection) {
8999                if let Some(mut old) = manager.get(*old_id) {
9000                    if old.xmax == *xid {
9001                        old.set_xmax(*previous_xmax);
9002                        let _ = manager.update(old);
9003                    }
9004                }
9005            }
9006            let _ = store.delete_batch(collection, &[*new_id]);
9007            reverted += 1;
9008            false
9009        });
9010        if pending.is_empty() {
9011            guard.remove(&conn_id);
9012        }
9013        reverted
9014    }
9015
9016    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
9017    /// delete marker; commit only drops the rollback journal and emits
9018    /// side effects. Physical reclamation is left for VACUUM so old
9019    /// snapshots can still resolve the pre-delete row version.
9020    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
9021        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
9022            return;
9023        };
9024        if pending.is_empty() {
9025            return;
9026        }
9027
9028        let store = self.inner.db.store();
9029        for (collection, id, _xid, _previous_xmax) in pending {
9030            store.context_index().remove_entity(id);
9031            self.cdc_emit(
9032                crate::replication::cdc::ChangeOperation::Delete,
9033                &collection,
9034                id.raw(),
9035                "entity",
9036            );
9037        }
9038    }
9039
9040    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
9041    /// become visible again to future snapshots. Best-effort: a row
9042    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
9043    /// never reclaims tuples whose xmax is still referenced by any
9044    /// active snapshot, so this case is only reachable via external
9045    /// storage corruption.
9046    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
9047        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
9048            return;
9049        };
9050
9051        let store = self.inner.db.store();
9052        for (collection, id, xid, previous_xmax) in pending {
9053            let Some(manager) = store.get_collection(&collection) else {
9054                continue;
9055            };
9056            if let Some(mut entity) = manager.get(id) {
9057                if entity.xmax == xid {
9058                    entity.set_xmax(previous_xmax);
9059                    let _ = manager.update(entity);
9060                }
9061            }
9062        }
9063    }
9064
9065    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
9066        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
9067            return;
9068        };
9069        for event in pending {
9070            self.cdc_emit_kv(
9071                event.op,
9072                &event.collection,
9073                &event.key,
9074                0,
9075                event.before,
9076                event.after,
9077            );
9078        }
9079    }
9080
9081    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
9082        self.inner.pending_kv_watch_events.write().remove(&conn_id);
9083    }
9084
9085    /// Materialise the entire graph store while applying MVCC visibility
9086    /// AND per-collection RLS to each candidate node and edge. Mirrors
9087    /// `materialize_graph` but routes every entity through the same
9088    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
9089    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
9090    /// edges). Returns the filtered `GraphStore` plus the
9091    /// `node_id → properties` map the executor needs for `RETURN n.*`
9092    /// projections.
9093    fn materialize_graph_with_rls(
9094        &self,
9095    ) -> RedDBResult<(
9096        crate::storage::engine::GraphStore,
9097        std::collections::HashMap<
9098            String,
9099            std::collections::HashMap<String, crate::storage::schema::Value>,
9100        >,
9101        crate::storage::query::unified::EdgeProperties,
9102    )> {
9103        use crate::storage::engine::GraphStore;
9104        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
9105        use crate::storage::unified::entity::{EntityData, EntityKind};
9106        use std::collections::{HashMap, HashSet};
9107
9108        let store = self.inner.db.store();
9109        let snap_ctx = capture_current_snapshot();
9110        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
9111
9112        let graph = GraphStore::new();
9113        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
9114            HashMap::new();
9115        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
9116        let mut allowed_nodes: HashSet<String> = HashSet::new();
9117
9118        // Per-collection cached compiled filters — Nodes-kind for
9119        // first pass, Edges-kind for the second. None entries mean
9120        // "RLS enabled, zero matching policy → deny all of this kind".
9121        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
9122            HashMap::new();
9123        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
9124            HashMap::new();
9125
9126        let collections = store.list_collections();
9127
9128        // First pass — gather nodes.
9129        for collection in &collections {
9130            let Some(manager) = store.get_collection(collection) else {
9131                continue;
9132            };
9133            let entities = manager.query_all(|_| true);
9134            for entity in entities {
9135                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
9136                    continue;
9137                }
9138                let EntityKind::GraphNode(ref node) = entity.kind else {
9139                    continue;
9140                };
9141                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
9142                    continue;
9143                }
9144                let id_str = entity.id.raw().to_string();
9145                graph
9146                    .add_node_with_label(
9147                        &id_str,
9148                        &node.label,
9149                        &super::graph_node_label(&node.node_type),
9150                    )
9151                    .map_err(|err| RedDBError::Query(err.to_string()))?;
9152                allowed_nodes.insert(id_str.clone());
9153                if let EntityData::Node(node_data) = &entity.data {
9154                    node_properties.insert(id_str, node_data.properties.clone());
9155                }
9156            }
9157        }
9158
9159        // Second pass — gather edges. An edge appears only when both
9160        // endpoint nodes survived the RLS pass AND the edge itself
9161        // passes its own RLS gate.
9162        for collection in &collections {
9163            let Some(manager) = store.get_collection(collection) else {
9164                continue;
9165            };
9166            let entities = manager.query_all(|_| true);
9167            for entity in entities {
9168                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
9169                    continue;
9170                }
9171                let EntityKind::GraphEdge(ref edge) = entity.kind else {
9172                    continue;
9173                };
9174                if !allowed_nodes.contains(&edge.from_node)
9175                    || !allowed_nodes.contains(&edge.to_node)
9176                {
9177                    continue;
9178                }
9179                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
9180                    continue;
9181                }
9182                let weight = match &entity.data {
9183                    EntityData::Edge(e) => e.weight,
9184                    _ => edge.weight as f32 / 1000.0,
9185                };
9186                let edge_label = super::graph_edge_label(&edge.label);
9187                graph
9188                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
9189                    .map_err(|err| RedDBError::Query(err.to_string()))?;
9190                if let EntityData::Edge(edge_data) = &entity.data {
9191                    edge_properties.insert(
9192                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
9193                        edge_data.properties.clone(),
9194                    );
9195                }
9196            }
9197        }
9198
9199        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
9200        // are used inside the helper closures via the per-kind helpers
9201        // declared at the bottom of this file.
9202        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
9203
9204        Ok((graph, node_properties, edge_properties))
9205    }
9206
9207    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
9208    /// freshly-inserted entity when the current connection holds an
9209    /// open transaction. Used by graph / vector / queue / timeseries
9210    /// write paths that go through the DevX builder API (`db.node(...)
9211    /// .save()` and friends) — those live in the storage crate and
9212    /// can't reach `current_xid()` without crossing layers, so the
9213    /// application layer calls this helper right after `save()` to
9214    /// finalise the MVCC stamp.
9215    ///
9216    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
9217    /// write, so the non-transactional hot path stays untouched.
9218    ///
9219    /// Best-effort: if the collection or entity disappears between
9220    /// the save and the stamp (concurrent DROP), we silently skip.
9221    pub(crate) fn stamp_xmin_if_in_txn(
9222        &self,
9223        collection: &str,
9224        id: crate::storage::unified::entity::EntityId,
9225    ) {
9226        let Some(xid) = self.current_xid() else {
9227            return;
9228        };
9229        let store = self.inner.db.store();
9230        let Some(manager) = store.get_collection(collection) else {
9231            return;
9232        };
9233        if let Some(mut entity) = manager.get(id) {
9234            entity.set_xmin(xid);
9235            let _ = manager.update(entity);
9236        }
9237    }
9238
9239    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
9240    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
9241    /// pending entries with `xid < stamper_xid` stay queued because
9242    /// they belong to the enclosing scope — they'll either flush on
9243    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
9244    ///
9245    /// Returns the number of tuples whose `xmax` was wiped back to 0.
9246    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
9247        let mut guard = self.inner.pending_tombstones.write();
9248        let Some(pending) = guard.get_mut(&conn_id) else {
9249            return 0;
9250        };
9251
9252        let store = self.inner.db.store();
9253        let mut revived = 0usize;
9254        pending.retain(|(collection, id, xid, previous_xmax)| {
9255            if *xid < stamper_xid {
9256                // Stamped before the savepoint — keep in queue.
9257                return true;
9258            }
9259            if let Some(manager) = store.get_collection(collection) {
9260                if let Some(mut entity) = manager.get(*id) {
9261                    if entity.xmax == *xid {
9262                        entity.set_xmax(*previous_xmax);
9263                        let _ = manager.update(entity);
9264                        revived += 1;
9265                    }
9266                }
9267            }
9268            false
9269        });
9270        if pending.is_empty() {
9271            guard.remove(&conn_id);
9272        }
9273        revived
9274    }
9275
9276    /// Return the snapshot the current connection should use for visibility
9277    /// checks (Phase 2.3 PG parity).
9278    ///
9279    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
9280    ///   the snapshot stored in its `TxnContext`.
9281    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
9282    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
9283    ///   visible so this degrades to "see everything committed".
9284    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
9285        let conn_id = current_connection_id();
9286        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
9287            return ctx.snapshot;
9288        }
9289        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
9290        // every already-committed xid (which is strictly less) passes the
9291        // `xmin <= snap.xid` gate, while concurrently-active xids land in
9292        // the `in_progress` set and stay hidden until they commit. Using
9293        // xid=0 would incorrectly hide every MVCC-stamped tuple.
9294        let high_water = self.inner.snapshot_manager.peek_next_xid();
9295        self.inner.snapshot_manager.snapshot(high_water)
9296    }
9297
9298    /// Xid of the current connection's active transaction, or `None` when
9299    /// running outside a BEGIN/COMMIT block. Write paths call this to
9300    /// decide whether to stamp `xmin`/`xmax` on tuples.
9301    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
9302    /// sub-xid so new writes can be selectively rolled back. Otherwise
9303    /// the parent txn's xid is returned, matching pre-savepoint
9304    /// behaviour. Callers that need the enclosing *transaction* xid
9305    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
9306    /// directly.
9307    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
9308        let conn_id = current_connection_id();
9309        self.inner
9310            .tx_contexts
9311            .read()
9312            .get(&conn_id)
9313            .map(|ctx| ctx.writer_xid())
9314    }
9315
9316    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
9317    /// the oldest-active xid when reclaiming dead tuples.
9318    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
9319        Arc::clone(&self.inner.snapshot_manager)
9320    }
9321
9322    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
9323        let manager = &self.inner.snapshot_manager;
9324        let next_xid = manager.peek_next_xid();
9325        let mut cutoff = next_xid;
9326        if let Some(oldest_active) = manager.oldest_active_xid() {
9327            cutoff = cutoff.min(oldest_active);
9328        }
9329        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
9330            cutoff = cutoff.min(oldest_pinned);
9331        }
9332        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
9333        if retention_xids > 0 {
9334            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
9335        }
9336        cutoff
9337    }
9338
9339    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
9340        let registered = self.inner.index_store.list_indices(table);
9341        if registered.is_empty() {
9342            return Ok(());
9343        }
9344        let store = self.inner.db.store();
9345        let Some(manager) = store.get_collection(table) else {
9346            return Ok(());
9347        };
9348        let entity_fields = manager
9349            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
9350            .into_iter()
9351            .map(|entity| (entity.id, table_row_index_fields(&entity)))
9352            .collect::<Vec<_>>();
9353
9354        for index in registered {
9355            self.inner.index_store.drop_index(&index.name, table);
9356            self.inner
9357                .index_store
9358                .create_index(
9359                    &index.name,
9360                    table,
9361                    &index.columns,
9362                    index.method,
9363                    index.unique,
9364                    &entity_fields,
9365                )
9366                .map_err(RedDBError::Internal)?;
9367            self.inner.index_store.register(index);
9368        }
9369        self.invalidate_plan_cache();
9370        Ok(())
9371    }
9372
9373    /// Own-tx xids (parent + open/released savepoints) for the current
9374    /// connection. Transports + tests that build a `SnapshotContext`
9375    /// manually (outside the `execute_query` scope) need this set so
9376    /// the writer's own uncommitted tuples stay visible to self.
9377    pub fn current_txn_own_xids(
9378        &self,
9379    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
9380        let mut set = std::collections::HashSet::new();
9381        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
9382            set.insert(ctx.xid);
9383            for (_, sub) in &ctx.savepoints {
9384                set.insert(*sub);
9385            }
9386            for sub in &ctx.released_sub_xids {
9387                set.insert(*sub);
9388            }
9389        }
9390        set
9391    }
9392
9393    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
9394    ///
9395    /// Callers use this to check whether a table name is a registered
9396    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
9397    /// scan it (`registry.scan(name)`). The read-path rewriter consults
9398    /// this before dispatching into native-collection lookup.
9399    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
9400        Arc::clone(&self.inner.foreign_tables)
9401    }
9402
9403    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
9404    pub fn is_rls_enabled(&self, table: &str) -> bool {
9405        self.inner.rls_enabled_tables.read().contains(table)
9406    }
9407
9408    /// Collect the USING predicates that apply to this `(table, role, action)`.
9409    ///
9410    /// Returned filters should be OR-combined (a row passes RLS when *any*
9411    /// matching policy accepts it) and then AND-ed into the query's WHERE.
9412    /// When the table has RLS disabled this returns an empty Vec — callers
9413    /// can fast-path back to the unfiltered read.
9414    pub fn matching_rls_policies(
9415        &self,
9416        table: &str,
9417        role: Option<&str>,
9418        action: crate::storage::query::ast::PolicyAction,
9419    ) -> Vec<crate::storage::query::ast::Filter> {
9420        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
9421        // callers that don't name a kind only see Table-scoped
9422        // policies (which is what execute SELECT / UPDATE / DELETE
9423        // expect).
9424        self.matching_rls_policies_for_kind(
9425            table,
9426            role,
9427            action,
9428            crate::storage::query::ast::PolicyTargetKind::Table,
9429        )
9430    }
9431
9432    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
9433    ///
9434    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
9435    /// `Vectors`, queue consumers request `Messages`, and timeseries
9436    /// range scans request `Points`. Policies tagged with a
9437    /// different kind are skipped so a graph-scoped policy doesn't
9438    /// accidentally gate a table SELECT on the same collection.
9439    pub fn matching_rls_policies_for_kind(
9440        &self,
9441        table: &str,
9442        role: Option<&str>,
9443        action: crate::storage::query::ast::PolicyAction,
9444        kind: crate::storage::query::ast::PolicyTargetKind,
9445    ) -> Vec<crate::storage::query::ast::Filter> {
9446        if !self.is_rls_enabled(table) {
9447            return Vec::new();
9448        }
9449        let policies = self.inner.rls_policies.read();
9450        policies
9451            .iter()
9452            .filter_map(|((t, _), p)| {
9453                if t != table {
9454                    return None;
9455                }
9456                // Kind gate — Table policies also apply to every
9457                // other kind *iff* the policy predicate evaluates
9458                // against entity fields that exist uniformly; the
9459                // caller's kind filter is the stricter check, so
9460                // match literally. Auto-tenancy policies stamp
9461                // Table and the caller passes the concrete kind —
9462                // we allow Table policies to apply cross-kind for
9463                // backwards compat.
9464                if p.target_kind != kind
9465                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
9466                {
9467                    return None;
9468                }
9469                // Action gate — `None` means "ALL" actions.
9470                if let Some(a) = p.action {
9471                    if a != action {
9472                        return None;
9473                    }
9474                }
9475                // Role gate — `None` means "any role".
9476                if let Some(p_role) = p.role.as_deref() {
9477                    match role {
9478                        Some(r) if r == p_role => {}
9479                        _ => return None,
9480                    }
9481                }
9482                Some((*p.using).clone())
9483            })
9484            .collect()
9485    }
9486
9487    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
9488        let store = self.inner.db.store();
9489        if let Some(stats) =
9490            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
9491        {
9492            crate::storage::query::planner::stats_catalog::persist_table_stats(
9493                store.as_ref(),
9494                &stats,
9495            );
9496        } else {
9497            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
9498        }
9499        self.invalidate_plan_cache();
9500    }
9501
9502    pub(crate) fn note_table_write(&self, table: &str) {
9503        // Skip the write lock when the table is already marked
9504        // dirty. With single-row UPDATEs in a loop this used to
9505        // grab the planner_dirty_tables write lock N times even
9506        // though the first call already flipped the flag.
9507        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
9508        if !already_dirty {
9509            self.inner
9510                .planner_dirty_tables
9511                .write()
9512                .insert(table.to_string());
9513        }
9514        self.invalidate_result_cache_for_table(table);
9515    }
9516
9517    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
9518    /// `RuntimeQueryResult` so callers over the SQL interface see the
9519    /// plan tree in the same shape a SELECT produces.
9520    ///
9521    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
9522    /// Nodes are walked depth-first; `depth` counts from 0 at the
9523    /// root so a text renderer can indent without re-walking.
9524    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
9525        let explain = self.explain_query(inner_sql)?;
9526
9527        let columns = vec![
9528            "op".to_string(),
9529            "source".to_string(),
9530            "est_rows".to_string(),
9531            "est_cost".to_string(),
9532            "depth".to_string(),
9533        ];
9534
9535        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
9536
9537        // Prepend `CteScan` markers when the query carried a leading
9538        // WITH clause. The CTE bodies are already inlined into the
9539        // main plan tree, but operators reading EXPLAIN need to see
9540        // which named CTEs were resolved — without this row the plan
9541        // would look indistinguishable from a hand-inlined query.
9542        for name in &explain.cte_materializations {
9543            use std::sync::Arc;
9544            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
9545            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
9546            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
9547            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
9548            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
9549            rec.set_arc(Arc::from("depth"), Value::Integer(0));
9550            records.push(rec);
9551        }
9552
9553        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
9554
9555        let result = crate::storage::query::unified::UnifiedResult {
9556            columns,
9557            records,
9558            stats: Default::default(),
9559            pre_serialized_json: None,
9560        };
9561
9562        Ok(RuntimeQueryResult {
9563            query: raw_query.to_string(),
9564            mode: explain.mode,
9565            statement: "explain",
9566            engine: "runtime-explain",
9567            result,
9568            affected_rows: 0,
9569            statement_type: "select",
9570        })
9571    }
9572
9573    // -----------------------------------------------------------------
9574    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
9575    // -----------------------------------------------------------------
9576
9577    /// Project a `QueryExpr` to the (action, resource) pair the
9578    /// privilege engine cares about. Returns `Ok(())` for statements
9579    /// that don't touch user data (transaction control, SHOW, SET, etc.).
9580    pub(super) fn check_query_privilege(
9581        &self,
9582        expr: &crate::storage::query::ast::QueryExpr,
9583    ) -> Result<(), String> {
9584        use crate::auth::privileges::{Action, AuthzContext, Resource};
9585        use crate::auth::UserId;
9586        use crate::storage::query::ast::QueryExpr;
9587
9588        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
9589        // The bootstrap path itself goes through `execute_query` so this
9590        // is the only sensible default; once auth is wired, the gate
9591        // becomes active.
9592        let auth_store = match self.inner.auth_store.read().clone() {
9593            Some(s) => s,
9594            None => return Ok(()),
9595        };
9596
9597        // Resolve principal + role from the thread-local identity.
9598        // Anonymous (no identity) is allowed to read the bootstrap path
9599        // only when auth_store says so; we treat missing identity as
9600        // platform-admin-equivalent here so embedded test harnesses
9601        // continue to work without setting an identity.
9602        let (username, role) = match current_auth_identity() {
9603            Some(p) => p,
9604            None => return Ok(()),
9605        };
9606        let tenant = current_tenant();
9607
9608        let ctx = AuthzContext {
9609            principal: &username,
9610            effective_role: role,
9611            tenant: tenant.as_deref(),
9612        };
9613        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
9614
9615        // Map QueryExpr → (Action, Resource).
9616        let (action, resource) = match expr {
9617            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
9618            QueryExpr::QueueSelect(q) => (Action::Select, Resource::table_from_name(&q.queue)),
9619            QueryExpr::Graph(g) => {
9620                if auth_store.iam_authorization_enabled() {
9621                    self.check_graph_property_projection_privilege(
9622                        &auth_store,
9623                        &principal_id,
9624                        role,
9625                        tenant.as_deref(),
9626                        g,
9627                    )?;
9628                    return Ok(());
9629                }
9630                return Ok(());
9631            }
9632            QueryExpr::Vector(v) => {
9633                if auth_store.iam_authorization_enabled() {
9634                    self.check_table_like_column_projection_privilege(
9635                        &auth_store,
9636                        &principal_id,
9637                        role,
9638                        tenant.as_deref(),
9639                        &v.collection,
9640                        &["content".to_string()],
9641                    )?;
9642                    return Ok(());
9643                }
9644                return Ok(());
9645            }
9646            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
9647            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
9648            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
9649            // Joins inherit the read privilege from any constituent
9650            // table — for now we emit a single Select on the database
9651            // (admins bypass; non-admins need a Database/Schema grant).
9652            QueryExpr::Join(_) => (Action::Select, Resource::Database),
9653            // GRANT / REVOKE / ALTER USER are authority statements;
9654            // require Admin (the helper methods enforce).
9655            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
9656                return if role == crate::auth::Role::Admin {
9657                    Ok(())
9658                } else {
9659                    Err(format!(
9660                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9661                        username, role
9662                    ))
9663                };
9664            }
9665            QueryExpr::CreateIamPolicy { id, .. } => {
9666                return self.check_policy_management_privilege(
9667                    &auth_store,
9668                    &principal_id,
9669                    role,
9670                    tenant.as_deref(),
9671                    "policy:put",
9672                    "policy",
9673                    id,
9674                );
9675            }
9676            QueryExpr::DropIamPolicy { id } => {
9677                return self.check_policy_management_privilege(
9678                    &auth_store,
9679                    &principal_id,
9680                    role,
9681                    tenant.as_deref(),
9682                    "policy:drop",
9683                    "policy",
9684                    id,
9685                );
9686            }
9687            QueryExpr::AttachPolicy { policy_id, .. } => {
9688                return self.check_policy_management_privilege(
9689                    &auth_store,
9690                    &principal_id,
9691                    role,
9692                    tenant.as_deref(),
9693                    "policy:attach",
9694                    "policy",
9695                    policy_id,
9696                );
9697            }
9698            QueryExpr::DetachPolicy { policy_id, .. } => {
9699                return self.check_policy_management_privilege(
9700                    &auth_store,
9701                    &principal_id,
9702                    role,
9703                    tenant.as_deref(),
9704                    "policy:detach",
9705                    "policy",
9706                    policy_id,
9707                );
9708            }
9709            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
9710                return Ok(());
9711            }
9712            QueryExpr::SimulatePolicy { .. } => {
9713                return self.check_policy_management_privilege(
9714                    &auth_store,
9715                    &principal_id,
9716                    role,
9717                    tenant.as_deref(),
9718                    "policy:simulate",
9719                    "policy",
9720                    "*",
9721                );
9722            }
9723            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
9724            // when IAM mode is active. Other DDL stays role-only for now.
9725            QueryExpr::DropTable(q) => {
9726                return self.check_ddl_collection_privilege(
9727                    &auth_store,
9728                    &principal_id,
9729                    role,
9730                    tenant.as_deref(),
9731                    &username,
9732                    "drop",
9733                    &q.name,
9734                );
9735            }
9736            QueryExpr::DropGraph(q) => {
9737                return self.check_ddl_collection_privilege(
9738                    &auth_store,
9739                    &principal_id,
9740                    role,
9741                    tenant.as_deref(),
9742                    &username,
9743                    "drop",
9744                    &q.name,
9745                );
9746            }
9747            QueryExpr::DropVector(q) => {
9748                return self.check_ddl_collection_privilege(
9749                    &auth_store,
9750                    &principal_id,
9751                    role,
9752                    tenant.as_deref(),
9753                    &username,
9754                    "drop",
9755                    &q.name,
9756                );
9757            }
9758            QueryExpr::DropDocument(q) => {
9759                return self.check_ddl_collection_privilege(
9760                    &auth_store,
9761                    &principal_id,
9762                    role,
9763                    tenant.as_deref(),
9764                    &username,
9765                    "drop",
9766                    &q.name,
9767                );
9768            }
9769            QueryExpr::DropKv(q) => {
9770                return self.check_ddl_collection_privilege(
9771                    &auth_store,
9772                    &principal_id,
9773                    role,
9774                    tenant.as_deref(),
9775                    &username,
9776                    "drop",
9777                    &q.name,
9778                );
9779            }
9780            QueryExpr::DropCollection(q) => {
9781                return self.check_ddl_collection_privilege(
9782                    &auth_store,
9783                    &principal_id,
9784                    role,
9785                    tenant.as_deref(),
9786                    &username,
9787                    "drop",
9788                    &q.name,
9789                );
9790            }
9791            QueryExpr::Truncate(q) => {
9792                return self.check_ddl_collection_privilege(
9793                    &auth_store,
9794                    &principal_id,
9795                    role,
9796                    tenant.as_deref(),
9797                    &username,
9798                    "truncate",
9799                    &q.name,
9800                );
9801            }
9802            // Remaining DDL — gate on Write role. Fine-grained grants TBD.
9803            QueryExpr::CreateTable(_)
9804            | QueryExpr::CreateCollection(_)
9805            | QueryExpr::CreateVector(_)
9806            | QueryExpr::AlterTable(_)
9807            | QueryExpr::CreateIndex(_)
9808            | QueryExpr::DropIndex(_)
9809            | QueryExpr::CreateSchema(_)
9810            | QueryExpr::DropSchema(_)
9811            | QueryExpr::CreateSequence(_)
9812            | QueryExpr::DropSequence(_)
9813            | QueryExpr::CreateView(_)
9814            | QueryExpr::DropView(_)
9815            | QueryExpr::RefreshMaterializedView(_)
9816            | QueryExpr::CreatePolicy(_)
9817            | QueryExpr::DropPolicy(_)
9818            | QueryExpr::CreateServer(_)
9819            | QueryExpr::DropServer(_)
9820            | QueryExpr::CreateForeignTable(_)
9821            | QueryExpr::DropForeignTable(_)
9822            | QueryExpr::CreateTimeSeries(_)
9823            | QueryExpr::DropTimeSeries(_)
9824            | QueryExpr::CreateQueue(_)
9825            | QueryExpr::AlterQueue(_)
9826            | QueryExpr::DropQueue(_)
9827            | QueryExpr::CreateTree(_)
9828            | QueryExpr::DropTree(_) => {
9829                return if role >= crate::auth::Role::Write {
9830                    Ok(())
9831                } else {
9832                    Err(format!(
9833                        "principal=`{}` role=`{:?}` cannot issue DDL",
9834                        username, role
9835                    ))
9836                };
9837            }
9838            // Migration DDL — CREATE MIGRATION requires Write role (schema author).
9839            QueryExpr::CreateMigration(_) => {
9840                return if role >= crate::auth::Role::Write {
9841                    Ok(())
9842                } else {
9843                    Err(format!(
9844                        "principal=`{}` role=`{:?}` cannot issue CREATE MIGRATION",
9845                        username, role
9846                    ))
9847                };
9848            }
9849            // APPLY / ROLLBACK change data and schema — require Admin.
9850            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
9851                return if role == crate::auth::Role::Admin {
9852                    Ok(())
9853                } else {
9854                    Err(format!(
9855                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
9856                        username, role
9857                    ))
9858                };
9859            }
9860            // EXPLAIN MIGRATION is read-only — any authenticated principal.
9861            QueryExpr::ExplainMigration(_) => return Ok(()),
9862            // Everything else (SET, SHOW, transaction control, graph
9863            // commands, queue/tree commands, MaintenanceCommand …)
9864            // is allowed for any authenticated principal.
9865            _ => return Ok(()),
9866        };
9867
9868        if auth_store.iam_authorization_enabled() {
9869            let iam_action = legacy_action_to_iam(action);
9870            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
9871            let iam_ctx = runtime_iam_context(
9872                role,
9873                tenant.as_deref(),
9874                auth_store.principal_is_system_owned(&principal_id),
9875            );
9876            if !auth_store.check_policy_authz_with_role(
9877                &principal_id,
9878                iam_action,
9879                &iam_resource,
9880                &iam_ctx,
9881                role,
9882            ) {
9883                return Err(format!(
9884                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
9885                    username, iam_action, iam_resource.kind, iam_resource.name
9886                ));
9887            }
9888
9889            if let QueryExpr::Table(table) = expr {
9890                self.check_table_column_projection_privilege(
9891                    &auth_store,
9892                    &principal_id,
9893                    &iam_ctx,
9894                    table,
9895                )?;
9896            }
9897
9898            if let QueryExpr::Update(update) = expr {
9899                let columns = update_set_target_columns(update);
9900                if !columns.is_empty() {
9901                    let request = column_access_request_for_table_update(&update.table, columns);
9902                    let outcome =
9903                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
9904                    if let Some(denied) = outcome.first_denied_column() {
9905                        return Err(format!(
9906                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
9907                            username, iam_action, denied.resource.kind, denied.resource.name
9908                        ));
9909                    }
9910                    if !outcome.allowed() {
9911                        return Err(format!(
9912                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
9913                            username,
9914                            iam_action,
9915                            outcome.table_resource.kind,
9916                            outcome.table_resource.name
9917                        ));
9918                    }
9919                }
9920
9921                if let Some(columns) = update_returning_columns_for_policy(self, update) {
9922                    let request = column_access_request_for_table_select(&update.table, columns);
9923                    let outcome =
9924                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
9925                    if let Some(denied) = outcome.first_denied_column() {
9926                        return Err(format!(
9927                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
9928                            username, denied.resource.kind, denied.resource.name
9929                        ));
9930                    }
9931                    if !outcome.allowed() {
9932                        return Err(format!(
9933                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9934                            username, outcome.table_resource.kind, outcome.table_resource.name
9935                        ));
9936                    }
9937                }
9938            }
9939
9940            Ok(())
9941        } else {
9942            auth_store
9943                .check_grant(&ctx, action, &resource)
9944                .map_err(|e| e.to_string())
9945        }
9946    }
9947
9948    fn check_table_column_projection_privilege(
9949        &self,
9950        auth_store: &Arc<crate::auth::store::AuthStore>,
9951        principal: &crate::auth::UserId,
9952        ctx: &crate::auth::policies::EvalContext,
9953        table: &crate::storage::query::ast::TableQuery,
9954    ) -> Result<(), String> {
9955        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
9956
9957        let columns = requested_table_columns_for_policy(table);
9958        if columns.is_empty() {
9959            return Ok(());
9960        }
9961
9962        let request = ColumnAccessRequest::select(table.table.clone(), columns);
9963        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
9964        if outcome.allowed() {
9965            return Ok(());
9966        }
9967
9968        if !matches!(
9969            outcome.table_decision,
9970            crate::auth::policies::Decision::Allow { .. }
9971                | crate::auth::policies::Decision::AdminBypass
9972        ) {
9973            return Err(format!(
9974                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9975                principal, outcome.table_resource.kind, outcome.table_resource.name
9976            ));
9977        }
9978
9979        let denied = outcome
9980            .first_denied_column()
9981            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
9982        match denied {
9983            Some(decision) => Err(format!(
9984                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
9985                principal, decision.resource.kind, decision.resource.name
9986            )),
9987            None => Ok(()),
9988        }
9989    }
9990
9991    fn check_graph_property_projection_privilege(
9992        &self,
9993        auth_store: &Arc<crate::auth::store::AuthStore>,
9994        principal: &crate::auth::UserId,
9995        role: crate::auth::Role,
9996        tenant: Option<&str>,
9997        query: &crate::storage::query::ast::GraphQuery,
9998    ) -> Result<(), String> {
9999        let columns = explicit_graph_projection_properties(query);
10000        if columns.is_empty() {
10001            return Ok(());
10002        }
10003        self.check_table_like_column_projection_privilege(
10004            auth_store, principal, role, tenant, "graph", &columns,
10005        )
10006    }
10007
10008    fn check_table_like_column_projection_privilege(
10009        &self,
10010        auth_store: &Arc<crate::auth::store::AuthStore>,
10011        principal: &crate::auth::UserId,
10012        role: crate::auth::Role,
10013        tenant: Option<&str>,
10014        table: &str,
10015        columns: &[String],
10016    ) -> Result<(), String> {
10017        let iam_ctx = runtime_iam_context(
10018            role,
10019            tenant,
10020            auth_store.principal_is_system_owned(principal),
10021        );
10022        let request =
10023            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
10024        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
10025        if outcome.allowed() {
10026            return Ok(());
10027        }
10028        let denied = outcome
10029            .first_denied_column()
10030            .map(|d| d.resource.name.clone())
10031            .unwrap_or_else(|| format!("{table}.<unknown>"));
10032        Err(format!(
10033            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
10034            principal, denied
10035        ))
10036    }
10037
10038    fn check_policy_management_privilege(
10039        &self,
10040        auth_store: &Arc<crate::auth::store::AuthStore>,
10041        principal: &crate::auth::UserId,
10042        role: crate::auth::Role,
10043        tenant: Option<&str>,
10044        action: &str,
10045        resource_kind: &str,
10046        resource_name: &str,
10047    ) -> Result<(), String> {
10048        let ctx = runtime_iam_context(
10049            role,
10050            tenant,
10051            auth_store.principal_is_system_owned(principal),
10052        );
10053
10054        if !auth_store.iam_authorization_enabled() {
10055            return if role == crate::auth::Role::Admin {
10056                Ok(())
10057            } else {
10058                Err(format!(
10059                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
10060                    principal, role
10061                ))
10062            };
10063        }
10064
10065        let mut resource = crate::auth::policies::ResourceRef::new(
10066            resource_kind.to_string(),
10067            resource_name.to_string(),
10068        );
10069        if let Some(t) = tenant {
10070            resource = resource.with_tenant(t.to_string());
10071        }
10072        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10073            Ok(())
10074        } else {
10075            Err(format!(
10076                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10077                principal, action, resource.kind, resource.name
10078            ))
10079        }
10080    }
10081
10082    fn check_managed_config_write_for_set_config(&self, key: &str) -> RedDBResult<()> {
10083        let Some(auth_store) = self.inner.auth_store.read().clone() else {
10084            return Ok(());
10085        };
10086        let (username, role) = current_auth_identity()
10087            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10088        let tenant = current_tenant();
10089        let principal = crate::auth::UserId::from_parts(tenant.as_deref(), &username);
10090        let ctx = runtime_iam_context(
10091            role,
10092            tenant.as_deref(),
10093            auth_store.principal_is_system_owned(&principal),
10094        );
10095        let gate = crate::auth::managed_config::ManagedConfigGate::new(
10096            self.inner.config_registry.as_ref(),
10097        );
10098        match gate.check_write(&auth_store, &principal, &ctx, key) {
10099            crate::auth::managed_config::ManagedConfigDecision::PassThrough { .. }
10100            | crate::auth::managed_config::ManagedConfigDecision::Allow { .. } => Ok(()),
10101            crate::auth::managed_config::ManagedConfigDecision::Deny { reason, .. } => {
10102                Err(RedDBError::Query(format!(
10103                    "permission denied: managed config mutation blocked for `{key}`: {reason}"
10104                )))
10105            }
10106        }
10107    }
10108
10109    /// IAM privilege check for DROP / TRUNCATE on a named collection.
10110    ///
10111    /// In legacy mode (IAM not enabled): requires Write role.
10112    /// In IAM mode: requires an explicit `drop` / `truncate` policy on
10113    /// `collection:<name>`; admin authority allows the action only when no
10114    /// explicit Deny matches.
10115    /// Records an audit log entry for both allow and deny outcomes.
10116    fn check_ddl_collection_privilege(
10117        &self,
10118        auth_store: &Arc<crate::auth::store::AuthStore>,
10119        principal: &crate::auth::UserId,
10120        role: crate::auth::Role,
10121        tenant: Option<&str>,
10122        username: &str,
10123        action: &str,
10124        collection: &str,
10125    ) -> Result<(), String> {
10126        if role < crate::auth::Role::Write {
10127            let msg = format!(
10128                "principal=`{}` role=`{:?}` cannot issue DDL",
10129                username, role
10130            );
10131            self.inner.audit_log.record(
10132                action,
10133                username,
10134                collection,
10135                "denied",
10136                crate::json::Value::Null,
10137            );
10138            return Err(msg);
10139        }
10140
10141        if !auth_store.iam_authorization_enabled() {
10142            self.inner.audit_log.record(
10143                action,
10144                username,
10145                collection,
10146                "ok",
10147                crate::json::Value::Null,
10148            );
10149            return Ok(());
10150        }
10151
10152        let resource_name = collection.to_string();
10153        let mut resource = crate::auth::policies::ResourceRef::new(
10154            "collection".to_string(),
10155            resource_name.clone(),
10156        );
10157        if let Some(t) = tenant {
10158            resource = resource.with_tenant(t.to_string());
10159        }
10160        let ctx = runtime_iam_context(
10161            role,
10162            tenant,
10163            auth_store.principal_is_system_owned(principal),
10164        );
10165        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10166            self.inner.audit_log.record(
10167                action,
10168                username,
10169                &resource_name,
10170                "ok",
10171                crate::json::Value::Null,
10172            );
10173            Ok(())
10174        } else {
10175            self.inner.audit_log.record(
10176                action,
10177                username,
10178                &resource_name,
10179                "denied",
10180                crate::json::Value::Null,
10181            );
10182            Err(format!(
10183                "principal=`{}` action=`{}` resource=`collection:{}` denied by IAM policy",
10184                username, action, resource_name
10185            ))
10186        }
10187    }
10188
10189    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
10190    fn execute_grant_statement(
10191        &self,
10192        query: &str,
10193        stmt: &crate::storage::query::ast::GrantStmt,
10194    ) -> RedDBResult<RuntimeQueryResult> {
10195        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10196        use crate::auth::UserId;
10197        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10198
10199        let auth_store = self
10200            .inner
10201            .auth_store
10202            .read()
10203            .clone()
10204            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10205
10206        // Granter identity + role.
10207        let (gname, grole) = current_auth_identity().ok_or_else(|| {
10208            RedDBError::Query("GRANT requires an authenticated principal".to_string())
10209        })?;
10210        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
10211        let granter_role = grole;
10212
10213        // Build the action set.
10214        let mut actions: Vec<Action> = Vec::new();
10215        if stmt.all {
10216            actions.push(Action::All);
10217        } else {
10218            for kw in &stmt.actions {
10219                let a = Action::from_keyword(kw).ok_or_else(|| {
10220                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
10221                })?;
10222                actions.push(a);
10223            }
10224        }
10225
10226        // Audit emit (printed; structured emission is Agent #4's lane).
10227        let mut applied = 0usize;
10228        for obj in &stmt.objects {
10229            let resource = match stmt.object_kind {
10230                GrantObjectKind::Table => Resource::Table {
10231                    schema: obj.schema.clone(),
10232                    table: obj.name.clone(),
10233                },
10234                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10235                GrantObjectKind::Database => Resource::Database,
10236                GrantObjectKind::Function => Resource::Function {
10237                    schema: obj.schema.clone(),
10238                    name: obj.name.clone(),
10239                },
10240            };
10241            for principal in &stmt.principals {
10242                let p = match principal {
10243                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10244                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10245                    GrantPrincipalRef::User { tenant, name } => {
10246                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10247                    }
10248                };
10249                // Tenant of the grant follows the granter's tenant
10250                // (cross-tenant guard inside `AuthStore::grant`).
10251                let tenant = granter.tenant.clone();
10252                auth_store
10253                    .grant(
10254                        &granter,
10255                        granter_role,
10256                        p.clone(),
10257                        resource.clone(),
10258                        actions.clone(),
10259                        stmt.with_grant_option,
10260                        tenant.clone(),
10261                    )
10262                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10263
10264                // IAM policy translation: every GRANT also lands as a
10265                // synthetic `_grant_<id>` policy attached to the
10266                // principal so the new evaluator sees it.
10267                if let Some(policy) =
10268                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
10269                {
10270                    let pid = policy.id.clone();
10271                    auth_store
10272                        .put_policy_internal(policy)
10273                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10274                    let attachment = match &p {
10275                        GrantPrincipal::User(uid) => {
10276                            crate::auth::store::PrincipalRef::User(uid.clone())
10277                        }
10278                        GrantPrincipal::Group(group) => {
10279                            crate::auth::store::PrincipalRef::Group(group.clone())
10280                        }
10281                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
10282                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
10283                        ),
10284                    };
10285                    auth_store
10286                        .attach_policy(attachment, &pid)
10287                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10288                }
10289                applied += 1;
10290                tracing::info!(
10291                    target: "audit",
10292                    principal = %granter,
10293                    action = "grant",
10294                    "GRANT applied"
10295                );
10296            }
10297        }
10298
10299        self.invalidate_result_cache();
10300        Ok(RuntimeQueryResult::ok_message(
10301            query.to_string(),
10302            &format!("GRANT applied to {} target(s)", applied),
10303            "grant",
10304        ))
10305    }
10306
10307    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
10308    fn execute_revoke_statement(
10309        &self,
10310        query: &str,
10311        stmt: &crate::storage::query::ast::RevokeStmt,
10312    ) -> RedDBResult<RuntimeQueryResult> {
10313        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10314        use crate::auth::UserId;
10315        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10316
10317        let auth_store = self
10318            .inner
10319            .auth_store
10320            .read()
10321            .clone()
10322            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10323
10324        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10325            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
10326        })?;
10327        let granter_role = grole;
10328
10329        let actions: Vec<Action> = if stmt.all {
10330            vec![Action::All]
10331        } else {
10332            stmt.actions
10333                .iter()
10334                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
10335                .collect()
10336        };
10337
10338        let mut total_removed = 0usize;
10339        for obj in &stmt.objects {
10340            let resource = match stmt.object_kind {
10341                GrantObjectKind::Table => Resource::Table {
10342                    schema: obj.schema.clone(),
10343                    table: obj.name.clone(),
10344                },
10345                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10346                GrantObjectKind::Database => Resource::Database,
10347                GrantObjectKind::Function => Resource::Function {
10348                    schema: obj.schema.clone(),
10349                    name: obj.name.clone(),
10350                },
10351            };
10352            for principal in &stmt.principals {
10353                let p = match principal {
10354                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10355                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10356                    GrantPrincipalRef::User { tenant, name } => {
10357                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10358                    }
10359                };
10360                let removed = auth_store
10361                    .revoke(granter_role, &p, &resource, &actions)
10362                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10363                let _removed_policies =
10364                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
10365                total_removed += removed;
10366            }
10367        }
10368
10369        self.invalidate_result_cache();
10370        Ok(RuntimeQueryResult::ok_message(
10371            query.to_string(),
10372            &format!("REVOKE removed {} grant(s)", total_removed),
10373            "revoke",
10374        ))
10375    }
10376
10377    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
10378    fn execute_alter_user_statement(
10379        &self,
10380        query: &str,
10381        stmt: &crate::storage::query::ast::AlterUserStmt,
10382    ) -> RedDBResult<RuntimeQueryResult> {
10383        use crate::auth::privileges::UserAttributes;
10384        use crate::auth::UserId;
10385        use crate::storage::query::ast::AlterUserAttribute;
10386
10387        let auth_store = self
10388            .inner
10389            .auth_store
10390            .read()
10391            .clone()
10392            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10393
10394        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10395            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
10396        })?;
10397        if grole != crate::auth::Role::Admin {
10398            return Err(RedDBError::Query(
10399                "ALTER USER requires Admin role".to_string(),
10400            ));
10401        }
10402
10403        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
10404
10405        // Apply attributes incrementally — each one reads the current
10406        // record, mutates the relevant field, writes back.
10407        let mut attrs = auth_store.user_attributes(&target);
10408        let mut enable_change: Option<bool> = None;
10409
10410        for a in &stmt.attributes {
10411            match a {
10412                AlterUserAttribute::ValidUntil(ts) => {
10413                    // Parse ISO-ish timestamp → ms since epoch. Fall
10414                    // back to integer-ms parsing for callers that pass
10415                    // `'1234567890123'`.
10416                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
10417                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
10418                    })?;
10419                    attrs.valid_until = Some(ms);
10420                }
10421                AlterUserAttribute::ConnectionLimit(n) => {
10422                    if *n < 0 {
10423                        return Err(RedDBError::Query(
10424                            "CONNECTION LIMIT must be non-negative".to_string(),
10425                        ));
10426                    }
10427                    attrs.connection_limit = Some(*n as u32);
10428                }
10429                AlterUserAttribute::SetSearchPath(p) => {
10430                    attrs.search_path = Some(p.clone());
10431                }
10432                AlterUserAttribute::AddGroup(g) => {
10433                    if !attrs.groups.iter().any(|existing| existing == g) {
10434                        attrs.groups.push(g.clone());
10435                        attrs.groups.sort();
10436                    }
10437                }
10438                AlterUserAttribute::DropGroup(g) => {
10439                    attrs.groups.retain(|existing| existing != g);
10440                }
10441                AlterUserAttribute::Enable => enable_change = Some(true),
10442                AlterUserAttribute::Disable => enable_change = Some(false),
10443                AlterUserAttribute::Password(_) => {
10444                    // Out of scope — accept the AST but no-op so the
10445                    // parser stays compatible with future password
10446                    // rotation work.
10447                }
10448            }
10449        }
10450
10451        auth_store
10452            .set_user_attributes(&target, attrs)
10453            .map_err(|e| RedDBError::Query(e.to_string()))?;
10454        if let Some(en) = enable_change {
10455            auth_store
10456                .set_user_enabled(&target, en)
10457                .map_err(|e| RedDBError::Query(e.to_string()))?;
10458        }
10459        self.invalidate_result_cache();
10460        tracing::info!(
10461            target: "audit",
10462            principal = %target,
10463            action = "alter_user",
10464            "ALTER USER applied"
10465        );
10466
10467        Ok(RuntimeQueryResult::ok_message(
10468            query.to_string(),
10469            &format!("ALTER USER {} applied", target),
10470            "alter_user",
10471        ))
10472    }
10473
10474    // -----------------------------------------------------------------
10475    // IAM policy executors
10476    // -----------------------------------------------------------------
10477
10478    fn execute_create_iam_policy(
10479        &self,
10480        query: &str,
10481        id: &str,
10482        json: &str,
10483    ) -> RedDBResult<RuntimeQueryResult> {
10484        use crate::auth::policies::Policy;
10485
10486        let auth_store = self
10487            .inner
10488            .auth_store
10489            .read()
10490            .clone()
10491            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10492
10493        // Parse + validate. The kernel rejects oversize / bad shape /
10494        // bad action keywords. If the supplied id differs from the JSON
10495        // id, override it with the SQL-provided id (the JSON id is
10496        // optional context — the SQL DDL form is authoritative).
10497        let mut policy = Policy::from_json_str(json)
10498            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
10499        if policy.id != id {
10500            policy.id = id.to_string();
10501        }
10502        let pid = policy.id.clone();
10503        let tenant = current_tenant();
10504        let (actor_name, actor_role) = current_auth_identity()
10505            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10506        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
10507        let eval_ctx = runtime_iam_context(
10508            actor_role,
10509            tenant.as_deref(),
10510            auth_store.principal_is_system_owned(&actor),
10511        );
10512        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
10513        let ledger = self.inner.control_event_ledger.read();
10514        let control = crate::auth::store::PolicyMutationControl {
10515            ctx: &event_ctx,
10516            ledger: ledger.as_ref(),
10517            config: self.inner.control_event_config,
10518            registry: Some(self.inner.config_registry.as_ref()),
10519            actor: &actor,
10520            eval_ctx: &eval_ctx,
10521        };
10522        auth_store
10523            .put_policy_with_control_events(policy, &control)
10524            .map_err(|e| RedDBError::Query(e.to_string()))?;
10525
10526        let principal = actor_name;
10527        tracing::info!(
10528            target: "audit",
10529            principal = %principal,
10530            action = "iam:policy.put",
10531            matched_policy_id = %pid,
10532            "CREATE POLICY applied"
10533        );
10534        self.inner.audit_log.record(
10535            "iam/policy.put",
10536            &principal,
10537            &pid,
10538            "ok",
10539            crate::json::Value::Null,
10540        );
10541
10542        self.invalidate_result_cache();
10543        Ok(RuntimeQueryResult::ok_message(
10544            query.to_string(),
10545            &format!("policy `{pid}` stored"),
10546            "create_iam_policy",
10547        ))
10548    }
10549
10550    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
10551        let auth_store = self
10552            .inner
10553            .auth_store
10554            .read()
10555            .clone()
10556            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10557        let tenant = current_tenant();
10558        let (actor_name, actor_role) = current_auth_identity()
10559            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10560        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
10561        let eval_ctx = runtime_iam_context(
10562            actor_role,
10563            tenant.as_deref(),
10564            auth_store.principal_is_system_owned(&actor),
10565        );
10566        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
10567        let ledger = self.inner.control_event_ledger.read();
10568        let control = crate::auth::store::PolicyMutationControl {
10569            ctx: &event_ctx,
10570            ledger: ledger.as_ref(),
10571            config: self.inner.control_event_config,
10572            registry: Some(self.inner.config_registry.as_ref()),
10573            actor: &actor,
10574            eval_ctx: &eval_ctx,
10575        };
10576        auth_store
10577            .delete_policy_with_control_events(id, &control)
10578            .map_err(|e| RedDBError::Query(e.to_string()))?;
10579
10580        let principal = actor_name;
10581        tracing::info!(
10582            target: "audit",
10583            principal = %principal,
10584            action = "iam:policy.drop",
10585            matched_policy_id = %id,
10586            "DROP POLICY applied"
10587        );
10588        self.inner.audit_log.record(
10589            "iam/policy.drop",
10590            &principal,
10591            id,
10592            "ok",
10593            crate::json::Value::Null,
10594        );
10595
10596        self.invalidate_result_cache();
10597        Ok(RuntimeQueryResult::ok_message(
10598            query.to_string(),
10599            &format!("policy `{id}` dropped"),
10600            "drop_iam_policy",
10601        ))
10602    }
10603
10604    fn execute_attach_policy(
10605        &self,
10606        query: &str,
10607        policy_id: &str,
10608        principal: &crate::storage::query::ast::PolicyPrincipalRef,
10609    ) -> RedDBResult<RuntimeQueryResult> {
10610        use crate::auth::store::PrincipalRef;
10611        use crate::auth::UserId;
10612        use crate::storage::query::ast::PolicyPrincipalRef;
10613
10614        let auth_store = self
10615            .inner
10616            .auth_store
10617            .read()
10618            .clone()
10619            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10620        let p = match principal {
10621            PolicyPrincipalRef::User(u) => {
10622                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
10623            }
10624            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
10625        };
10626        let pretty_target = principal_label(principal);
10627        let tenant = current_tenant();
10628        let (actor_name, actor_role) = current_auth_identity()
10629            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10630        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
10631        let eval_ctx = runtime_iam_context(
10632            actor_role,
10633            tenant.as_deref(),
10634            auth_store.principal_is_system_owned(&actor),
10635        );
10636        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
10637        let ledger = self.inner.control_event_ledger.read();
10638        let control = crate::auth::store::PolicyMutationControl {
10639            ctx: &event_ctx,
10640            ledger: ledger.as_ref(),
10641            config: self.inner.control_event_config,
10642            registry: Some(self.inner.config_registry.as_ref()),
10643            actor: &actor,
10644            eval_ctx: &eval_ctx,
10645        };
10646        auth_store
10647            .attach_policy_with_control_events(p, policy_id, &control)
10648            .map_err(|e| RedDBError::Query(e.to_string()))?;
10649
10650        let principal_str = actor_name;
10651        tracing::info!(
10652            target: "audit",
10653            principal = %principal_str,
10654            action = "iam:policy.attach",
10655            matched_policy_id = %policy_id,
10656            target = %pretty_target,
10657            "ATTACH POLICY applied"
10658        );
10659        self.inner.audit_log.record(
10660            "iam/policy.attach",
10661            &principal_str,
10662            &pretty_target,
10663            "ok",
10664            crate::json::Value::Null,
10665        );
10666
10667        self.invalidate_result_cache();
10668        Ok(RuntimeQueryResult::ok_message(
10669            query.to_string(),
10670            &format!("policy `{policy_id}` attached to {pretty_target}"),
10671            "attach_policy",
10672        ))
10673    }
10674
10675    fn execute_detach_policy(
10676        &self,
10677        query: &str,
10678        policy_id: &str,
10679        principal: &crate::storage::query::ast::PolicyPrincipalRef,
10680    ) -> RedDBResult<RuntimeQueryResult> {
10681        use crate::auth::store::PrincipalRef;
10682        use crate::auth::UserId;
10683        use crate::storage::query::ast::PolicyPrincipalRef;
10684
10685        let auth_store = self
10686            .inner
10687            .auth_store
10688            .read()
10689            .clone()
10690            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10691        let p = match principal {
10692            PolicyPrincipalRef::User(u) => {
10693                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
10694            }
10695            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
10696        };
10697        let pretty_target = principal_label(principal);
10698        let tenant = current_tenant();
10699        let (actor_name, actor_role) = current_auth_identity()
10700            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10701        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
10702        let eval_ctx = runtime_iam_context(
10703            actor_role,
10704            tenant.as_deref(),
10705            auth_store.principal_is_system_owned(&actor),
10706        );
10707        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
10708        let ledger = self.inner.control_event_ledger.read();
10709        let control = crate::auth::store::PolicyMutationControl {
10710            ctx: &event_ctx,
10711            ledger: ledger.as_ref(),
10712            config: self.inner.control_event_config,
10713            registry: Some(self.inner.config_registry.as_ref()),
10714            actor: &actor,
10715            eval_ctx: &eval_ctx,
10716        };
10717        auth_store
10718            .detach_policy_with_control_events(p, policy_id, &control)
10719            .map_err(|e| RedDBError::Query(e.to_string()))?;
10720
10721        let principal_str = actor_name;
10722        tracing::info!(
10723            target: "audit",
10724            principal = %principal_str,
10725            action = "iam:policy.detach",
10726            matched_policy_id = %policy_id,
10727            target = %pretty_target,
10728            "DETACH POLICY applied"
10729        );
10730        self.inner.audit_log.record(
10731            "iam/policy.detach",
10732            &principal_str,
10733            &pretty_target,
10734            "ok",
10735            crate::json::Value::Null,
10736        );
10737
10738        self.invalidate_result_cache();
10739        Ok(RuntimeQueryResult::ok_message(
10740            query.to_string(),
10741            &format!("policy `{policy_id}` detached from {pretty_target}"),
10742            "detach_policy",
10743        ))
10744    }
10745
10746    fn execute_show_policies(
10747        &self,
10748        query: &str,
10749        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
10750    ) -> RedDBResult<RuntimeQueryResult> {
10751        use crate::auth::UserId;
10752        use crate::storage::query::ast::PolicyPrincipalRef;
10753        use crate::storage::query::unified::UnifiedRecord;
10754        use crate::storage::schema::Value as SchemaValue;
10755        use std::sync::Arc;
10756
10757        let auth_store = self
10758            .inner
10759            .auth_store
10760            .read()
10761            .clone()
10762            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10763
10764        let pols = match filter {
10765            None => auth_store.list_policies(),
10766            Some(PolicyPrincipalRef::User(u)) => {
10767                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
10768                auth_store.effective_policies(&id)
10769            }
10770            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
10771        };
10772
10773        let mut records = Vec::with_capacity(pols.len() + 1);
10774
10775        // Header row (#712 / S5A): synthetic record at index 0 that
10776        // reports the active PolicyEnforcementMode and the hard-cutover
10777        // version, so an operator running SHOW POLICIES can see the
10778        // current posture without a separate command.
10779        let mode = auth_store.enforcement_mode();
10780        let mut header = UnifiedRecord::default();
10781        header.set_arc(
10782            Arc::from("id"),
10783            SchemaValue::text("<enforcement_mode>".to_string()),
10784        );
10785        header.set_arc(Arc::from("statements"), SchemaValue::Integer(0));
10786        header.set_arc(Arc::from("tenant"), SchemaValue::Null);
10787        let header_json = format!(
10788            r#"{{"enforcement_mode":"{}","policy_only_hard_version":"{}"}}"#,
10789            mode.as_str(),
10790            crate::auth::enforcement_mode::POLICY_ONLY_HARD_VERSION
10791        );
10792        header.set_arc(Arc::from("json"), SchemaValue::text(header_json));
10793        records.push(header);
10794
10795        for p in pols.iter() {
10796            let mut rec = UnifiedRecord::default();
10797            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
10798            rec.set_arc(
10799                Arc::from("statements"),
10800                SchemaValue::Integer(p.statements.len() as i64),
10801            );
10802            rec.set_arc(
10803                Arc::from("tenant"),
10804                p.tenant
10805                    .as_deref()
10806                    .map(|t| SchemaValue::text(t.to_string()))
10807                    .unwrap_or(SchemaValue::Null),
10808            );
10809            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
10810            records.push(rec);
10811        }
10812        let mut result = crate::storage::query::unified::UnifiedResult::empty();
10813        result.records = records;
10814        Ok(RuntimeQueryResult {
10815            query: query.to_string(),
10816            mode: crate::storage::query::modes::QueryMode::Sql,
10817            statement: "show_policies",
10818            engine: "iam-policies",
10819            result,
10820            affected_rows: 0,
10821            statement_type: "select",
10822        })
10823    }
10824
10825    fn execute_show_effective_permissions(
10826        &self,
10827        query: &str,
10828        user: &crate::storage::query::ast::PolicyUserRef,
10829        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
10830    ) -> RedDBResult<RuntimeQueryResult> {
10831        use crate::auth::UserId;
10832        use crate::storage::query::unified::UnifiedRecord;
10833        use crate::storage::schema::Value as SchemaValue;
10834        use std::sync::Arc;
10835
10836        let auth_store = self
10837            .inner
10838            .auth_store
10839            .read()
10840            .clone()
10841            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10842        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
10843        let pols = auth_store.effective_policies(&id);
10844
10845        // Show one row per (policy, statement) tuple, plus any
10846        // resource-level filter passed by the caller.
10847        let mut records = Vec::new();
10848        for p in pols.iter() {
10849            for (idx, st) in p.statements.iter().enumerate() {
10850                if let Some(_r) = resource {
10851                    // Naive filter: render statement targets to strings
10852                    // and skip if no match. Conservative default = include
10853                    // (the simulator handles fine-grained matching).
10854                }
10855                let mut rec = UnifiedRecord::default();
10856                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
10857                rec.set_arc(
10858                    Arc::from("statement_index"),
10859                    SchemaValue::Integer(idx as i64),
10860                );
10861                rec.set_arc(
10862                    Arc::from("sid"),
10863                    st.sid
10864                        .as_deref()
10865                        .map(|s| SchemaValue::text(s.to_string()))
10866                        .unwrap_or(SchemaValue::Null),
10867                );
10868                rec.set_arc(
10869                    Arc::from("effect"),
10870                    SchemaValue::text(match st.effect {
10871                        crate::auth::policies::Effect::Allow => "allow",
10872                        crate::auth::policies::Effect::Deny => "deny",
10873                    }),
10874                );
10875                rec.set_arc(
10876                    Arc::from("actions"),
10877                    SchemaValue::Integer(st.actions.len() as i64),
10878                );
10879                rec.set_arc(
10880                    Arc::from("resources"),
10881                    SchemaValue::Integer(st.resources.len() as i64),
10882                );
10883                records.push(rec);
10884            }
10885        }
10886        let mut result = crate::storage::query::unified::UnifiedResult::empty();
10887        result.records = records;
10888        Ok(RuntimeQueryResult {
10889            query: query.to_string(),
10890            mode: crate::storage::query::modes::QueryMode::Sql,
10891            statement: "show_effective_permissions",
10892            engine: "iam-policies",
10893            result,
10894            affected_rows: 0,
10895            statement_type: "select",
10896        })
10897    }
10898
10899    fn execute_simulate_policy(
10900        &self,
10901        query: &str,
10902        user: &crate::storage::query::ast::PolicyUserRef,
10903        action: &str,
10904        resource: &crate::storage::query::ast::PolicyResourceRef,
10905    ) -> RedDBResult<RuntimeQueryResult> {
10906        use crate::auth::policies::ResourceRef;
10907        use crate::auth::store::SimCtx;
10908        use crate::auth::UserId;
10909        use crate::storage::query::unified::UnifiedRecord;
10910        use crate::storage::schema::Value as SchemaValue;
10911        use std::sync::Arc;
10912
10913        let auth_store = self
10914            .inner
10915            .auth_store
10916            .read()
10917            .clone()
10918            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10919        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
10920        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
10921        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
10922
10923        let principal_str = current_auth_identity()
10924            .map(|(u, _)| u)
10925            .unwrap_or_else(|| "anonymous".into());
10926        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
10927        tracing::info!(
10928            target: "audit",
10929            principal = %principal_str,
10930            action = "iam:policy.simulate",
10931            decision = %decision_str,
10932            matched_policy_id = ?matched_pid,
10933            matched_sid = ?matched_sid,
10934            "SIMULATE issued"
10935        );
10936        self.inner.audit_log.record(
10937            "iam/policy.simulate",
10938            &principal_str,
10939            &id.to_string(),
10940            "ok",
10941            crate::json::Value::Null,
10942        );
10943
10944        let mut rec = UnifiedRecord::default();
10945        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
10946        rec.set_arc(
10947            Arc::from("matched_policy_id"),
10948            matched_pid
10949                .map(SchemaValue::text)
10950                .unwrap_or(SchemaValue::Null),
10951        );
10952        rec.set_arc(
10953            Arc::from("matched_sid"),
10954            matched_sid
10955                .map(SchemaValue::text)
10956                .unwrap_or(SchemaValue::Null),
10957        );
10958        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
10959        rec.set_arc(
10960            Arc::from("trail_len"),
10961            SchemaValue::Integer(outcome.trail.len() as i64),
10962        );
10963        let mut result = crate::storage::query::unified::UnifiedResult::empty();
10964        result.records = vec![rec];
10965        Ok(RuntimeQueryResult {
10966            query: query.to_string(),
10967            mode: crate::storage::query::modes::QueryMode::Sql,
10968            statement: "simulate_policy",
10969            engine: "iam-policies",
10970            result,
10971            affected_rows: 0,
10972            statement_type: "select",
10973        })
10974    }
10975}
10976
10977/// Translate a parsed GRANT into a synthetic IAM policy whose id
10978/// starts with `_grant_<unique>`. PUBLIC is represented as an
10979/// implicit IAM group; legacy GROUP grants are still rejected by the
10980/// grant store and are not translated here.
10981fn grant_to_iam_policy(
10982    principal: &crate::auth::privileges::GrantPrincipal,
10983    resource: &crate::auth::privileges::Resource,
10984    actions: &[crate::auth::privileges::Action],
10985    tenant: Option<&str>,
10986) -> Option<crate::auth::policies::Policy> {
10987    use crate::auth::policies::{
10988        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
10989    };
10990    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10991
10992    if matches!(principal, GrantPrincipal::Group(_)) {
10993        return None;
10994    }
10995
10996    let now = crate::auth::now_ms();
10997    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
10998
10999    let resource_str = match resource {
11000        Resource::Database => "table:*".to_string(),
11001        Resource::Schema(s) => format!("table:{s}.*"),
11002        Resource::Table { schema, table } => match schema {
11003            Some(s) => format!("table:{s}.{table}"),
11004            None => format!("table:{table}"),
11005        },
11006        Resource::Function { schema, name } => match schema {
11007            Some(s) => format!("function:{s}.{name}"),
11008            None => format!("function:{name}"),
11009        },
11010    };
11011
11012    // Compile actions — fall back to `*` only when the grant included
11013    // `Action::All`. Map every other action keyword to its lowercase
11014    // form so it lines up with the kernel's allowlist.
11015    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
11016        vec![ActionPattern::Wildcard]
11017    } else {
11018        actions
11019            .iter()
11020            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
11021            .collect()
11022    };
11023    if action_patterns.is_empty() {
11024        return None;
11025    }
11026
11027    // Inline resource compilation matching the kernel's `compile_resource`:
11028    //   * `*` → wildcard
11029    //   * contains `*` → glob
11030    //   * `kind:name` → exact
11031    let resource_patterns = if resource_str == "*" {
11032        vec![ResourcePattern::Wildcard]
11033    } else if resource_str.contains('*') {
11034        vec![ResourcePattern::Glob(resource_str.clone())]
11035    } else if let Some((kind, name)) = resource_str.split_once(':') {
11036        vec![ResourcePattern::Exact {
11037            kind: kind.to_string(),
11038            name: name.to_string(),
11039        }]
11040    } else {
11041        vec![ResourcePattern::Wildcard]
11042    };
11043
11044    let policy = Policy {
11045        id,
11046        version: 1,
11047        tenant: tenant.map(|t| t.to_string()),
11048        created_at: now,
11049        updated_at: now,
11050        statements: vec![Statement {
11051            sid: None,
11052            effect: Effect::Allow,
11053            actions: action_patterns,
11054            resources: resource_patterns,
11055            condition: None,
11056        }],
11057    };
11058    if policy.validate().is_err() {
11059        return None;
11060    }
11061    Some(policy)
11062}
11063
11064fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
11065    use crate::auth::privileges::Action;
11066    match action {
11067        Action::Select => "select",
11068        Action::Insert => "insert",
11069        Action::Update => "update",
11070        Action::Delete => "delete",
11071        Action::Truncate => "truncate",
11072        Action::References => "references",
11073        Action::Execute => "execute",
11074        Action::Usage => "usage",
11075        Action::All => "*",
11076    }
11077}
11078
11079fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
11080    let mut columns = Vec::new();
11081    for (column, _) in &query.assignment_exprs {
11082        if !columns.iter().any(|seen| seen == column) {
11083            columns.push(column.clone());
11084        }
11085    }
11086    columns
11087}
11088
11089fn column_access_request_for_table_update(
11090    table_name: &str,
11091    columns: Vec<String>,
11092) -> crate::auth::ColumnAccessRequest {
11093    match table_name.split_once('.') {
11094        Some((schema, table)) => {
11095            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
11096                .with_schema(schema.to_string())
11097        }
11098        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
11099    }
11100}
11101
11102fn column_access_request_for_table_select(
11103    table_name: &str,
11104    columns: Vec<String>,
11105) -> crate::auth::ColumnAccessRequest {
11106    match table_name.split_once('.') {
11107        Some((schema, table)) => {
11108            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
11109                .with_schema(schema.to_string())
11110        }
11111        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
11112    }
11113}
11114
11115fn update_returning_columns_for_policy(
11116    runtime: &RedDBRuntime,
11117    query: &crate::storage::query::ast::UpdateQuery,
11118) -> Option<Vec<String>> {
11119    let items = query.returning.as_ref()?;
11120    let mut columns = Vec::new();
11121    let project_all = items
11122        .iter()
11123        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
11124    if project_all {
11125        collect_returning_star_columns(runtime, query, &mut columns);
11126    } else {
11127        for item in items {
11128            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
11129                continue;
11130            };
11131            push_returning_policy_column(&mut columns, column);
11132        }
11133    }
11134    (!columns.is_empty()).then_some(columns)
11135}
11136
11137fn collect_returning_star_columns(
11138    runtime: &RedDBRuntime,
11139    query: &crate::storage::query::ast::UpdateQuery,
11140    columns: &mut Vec<String>,
11141) {
11142    let store = runtime.db().store();
11143    let Some(manager) = store.get_collection(&query.table) else {
11144        return;
11145    };
11146    if let Some(schema) = manager.column_schema() {
11147        for column in schema.iter() {
11148            push_returning_policy_column(columns, column);
11149        }
11150    }
11151    for entity in manager.query_all(|_| true) {
11152        if !returning_entity_matches_update_target(&entity, query.target) {
11153            continue;
11154        }
11155        match &entity.data {
11156            crate::storage::EntityData::Row(row) => {
11157                for (column, _) in row.iter_fields() {
11158                    push_returning_policy_column(columns, column);
11159                }
11160            }
11161            crate::storage::EntityData::Node(node) => {
11162                push_returning_policy_column(columns, "label");
11163                push_returning_policy_column(columns, "node_type");
11164                for column in node.properties.keys() {
11165                    push_returning_policy_column(columns, column);
11166                }
11167            }
11168            crate::storage::EntityData::Edge(edge) => {
11169                push_returning_policy_column(columns, "label");
11170                push_returning_policy_column(columns, "from_rid");
11171                push_returning_policy_column(columns, "to_rid");
11172                push_returning_policy_column(columns, "weight");
11173                for column in edge.properties.keys() {
11174                    push_returning_policy_column(columns, column);
11175                }
11176            }
11177            _ => {}
11178        }
11179    }
11180}
11181
11182fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
11183    if returning_public_envelope_column(column) {
11184        return;
11185    }
11186    if !columns.iter().any(|seen| seen == column) {
11187        columns.push(column.to_string());
11188    }
11189}
11190
/// Reports whether `column` is one of the fixed envelope column names.
///
/// The comparison ignores ASCII case.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 7] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
        "red_entity_id",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|name| column.eq_ignore_ascii_case(name))
}
11197
11198fn returning_entity_matches_update_target(
11199    entity: &crate::storage::UnifiedEntity,
11200    target: crate::storage::query::ast::UpdateTarget,
11201) -> bool {
11202    use crate::storage::query::ast::UpdateTarget;
11203    match target {
11204        UpdateTarget::Rows => {
11205            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
11206        }
11207        UpdateTarget::Documents => {
11208            matches!(
11209                returning_row_item_kind(entity),
11210                Some(ReturningRowKind::Document)
11211            )
11212        }
11213        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
11214        UpdateTarget::Nodes => matches!(
11215            (&entity.kind, &entity.data),
11216            (
11217                crate::storage::EntityKind::GraphNode(_),
11218                crate::storage::EntityData::Node(_)
11219            )
11220        ),
11221        UpdateTarget::Edges => matches!(
11222            (&entity.kind, &entity.data),
11223            (
11224                crate::storage::EntityKind::GraphEdge(_),
11225                crate::storage::EntityData::Edge(_)
11226            )
11227        ),
11228    }
11229}
11230
/// Classification of a row-shaped entity, as computed by
/// `returning_row_item_kind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReturningRowKind {
    /// A row that is neither `Kv` nor `Document`.
    Row,
    /// A row with at least one JSON-valued field (and not classified as `Kv`).
    Document,
    /// A row whose fields are all named `key` or `value` (ASCII case-insensitive).
    Kv,
}
11237
11238fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
11239    let row = entity.data.as_row()?;
11240    let is_kv = row.iter_fields().all(|(column, _)| {
11241        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
11242    });
11243    if is_kv {
11244        return Some(ReturningRowKind::Kv);
11245    }
11246    let is_document = row
11247        .iter_fields()
11248        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
11249    if is_document {
11250        Some(ReturningRowKind::Document)
11251    } else {
11252        Some(ReturningRowKind::Row)
11253    }
11254}
11255
11256fn requested_table_columns_for_policy(
11257    table: &crate::storage::query::ast::TableQuery,
11258) -> Vec<String> {
11259    use crate::storage::query::sql_lowering::{
11260        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
11261        effective_table_projections,
11262    };
11263
11264    let table_name = table.table.as_str();
11265    let table_alias = table.alias.as_deref();
11266    let mut columns = std::collections::BTreeSet::new();
11267
11268    for projection in effective_table_projections(table) {
11269        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
11270    }
11271    if let Some(filter) = effective_table_filter(table) {
11272        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
11273    }
11274    for expr in effective_table_group_by_exprs(table) {
11275        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
11276    }
11277    if let Some(filter) = effective_table_having_filter(table) {
11278        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
11279    }
11280    for order in &table.order_by {
11281        if let Some(expr) = order.expr.as_ref() {
11282            collect_expr_columns(expr, table_name, table_alias, &mut columns);
11283        } else {
11284            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
11285        }
11286    }
11287
11288    columns.into_iter().collect()
11289}
11290
11291fn collect_projection_columns(
11292    projection: &crate::storage::query::ast::Projection,
11293    table_name: &str,
11294    table_alias: Option<&str>,
11295    columns: &mut std::collections::BTreeSet<String>,
11296) {
11297    use crate::storage::query::ast::Projection;
11298    match projection {
11299        Projection::All => {
11300            columns.insert("*".to_string());
11301        }
11302        Projection::Column(column) | Projection::Alias(column, _) => {
11303            if column != "*" {
11304                columns.insert(column.clone());
11305            }
11306        }
11307        Projection::Function(_, args) => {
11308            for arg in args {
11309                collect_projection_columns(arg, table_name, table_alias, columns);
11310            }
11311        }
11312        Projection::Expression(filter, _) => {
11313            collect_filter_columns(filter, table_name, table_alias, columns);
11314        }
11315        Projection::Field(field, _) => {
11316            collect_field_ref_column(field, table_name, table_alias, columns);
11317        }
11318        // Slice 7a (#589): no runtime support yet; recurse into args so
11319        // any column references are still tracked in case a future
11320        // executor needs the column set.
11321        Projection::Window { args, .. } => {
11322            for arg in args {
11323                collect_projection_columns(arg, table_name, table_alias, columns);
11324            }
11325        }
11326    }
11327}
11328
11329fn collect_filter_columns(
11330    filter: &crate::storage::query::ast::Filter,
11331    table_name: &str,
11332    table_alias: Option<&str>,
11333    columns: &mut std::collections::BTreeSet<String>,
11334) {
11335    use crate::storage::query::ast::Filter;
11336    match filter {
11337        Filter::Compare { field, .. }
11338        | Filter::IsNull(field)
11339        | Filter::IsNotNull(field)
11340        | Filter::In { field, .. }
11341        | Filter::Between { field, .. }
11342        | Filter::Like { field, .. }
11343        | Filter::StartsWith { field, .. }
11344        | Filter::EndsWith { field, .. }
11345        | Filter::Contains { field, .. } => {
11346            collect_field_ref_column(field, table_name, table_alias, columns);
11347        }
11348        Filter::CompareFields { left, right, .. } => {
11349            collect_field_ref_column(left, table_name, table_alias, columns);
11350            collect_field_ref_column(right, table_name, table_alias, columns);
11351        }
11352        Filter::CompareExpr { lhs, rhs, .. } => {
11353            collect_expr_columns(lhs, table_name, table_alias, columns);
11354            collect_expr_columns(rhs, table_name, table_alias, columns);
11355        }
11356        Filter::And(left, right) | Filter::Or(left, right) => {
11357            collect_filter_columns(left, table_name, table_alias, columns);
11358            collect_filter_columns(right, table_name, table_alias, columns);
11359        }
11360        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
11361    }
11362}
11363
11364fn collect_expr_columns(
11365    expr: &crate::storage::query::ast::Expr,
11366    table_name: &str,
11367    table_alias: Option<&str>,
11368    columns: &mut std::collections::BTreeSet<String>,
11369) {
11370    use crate::storage::query::ast::Expr;
11371    match expr {
11372        Expr::Column { field, .. } => {
11373            collect_field_ref_column(field, table_name, table_alias, columns);
11374        }
11375        Expr::Literal { .. } | Expr::Parameter { .. } => {}
11376        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
11377            collect_expr_columns(operand, table_name, table_alias, columns);
11378        }
11379        Expr::BinaryOp { lhs, rhs, .. } => {
11380            collect_expr_columns(lhs, table_name, table_alias, columns);
11381            collect_expr_columns(rhs, table_name, table_alias, columns);
11382        }
11383        Expr::FunctionCall { args, .. } => {
11384            for arg in args {
11385                collect_expr_columns(arg, table_name, table_alias, columns);
11386            }
11387        }
11388        Expr::Case {
11389            branches, else_, ..
11390        } => {
11391            for (condition, value) in branches {
11392                collect_expr_columns(condition, table_name, table_alias, columns);
11393                collect_expr_columns(value, table_name, table_alias, columns);
11394            }
11395            if let Some(value) = else_ {
11396                collect_expr_columns(value, table_name, table_alias, columns);
11397            }
11398        }
11399        Expr::IsNull { operand, .. } => {
11400            collect_expr_columns(operand, table_name, table_alias, columns);
11401        }
11402        Expr::InList { target, values, .. } => {
11403            collect_expr_columns(target, table_name, table_alias, columns);
11404            for value in values {
11405                collect_expr_columns(value, table_name, table_alias, columns);
11406            }
11407        }
11408        Expr::Between {
11409            target, low, high, ..
11410        } => {
11411            collect_expr_columns(target, table_name, table_alias, columns);
11412            collect_expr_columns(low, table_name, table_alias, columns);
11413            collect_expr_columns(high, table_name, table_alias, columns);
11414        }
11415        Expr::Subquery { .. } => {}
11416        Expr::WindowFunctionCall { args, window, .. } => {
11417            for arg in args {
11418                collect_expr_columns(arg, table_name, table_alias, columns);
11419            }
11420            for e in &window.partition_by {
11421                collect_expr_columns(e, table_name, table_alias, columns);
11422            }
11423            for o in &window.order_by {
11424                collect_expr_columns(&o.expr, table_name, table_alias, columns);
11425            }
11426        }
11427    }
11428}
11429
11430fn collect_field_ref_column(
11431    field: &crate::storage::query::ast::FieldRef,
11432    table_name: &str,
11433    table_alias: Option<&str>,
11434    columns: &mut std::collections::BTreeSet<String>,
11435) {
11436    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
11437        if column != "*" {
11438            columns.insert(column);
11439        }
11440    }
11441}
11442
11443fn policy_column_name_from_field_ref(
11444    field: &crate::storage::query::ast::FieldRef,
11445    table_name: &str,
11446    table_alias: Option<&str>,
11447) -> Option<String> {
11448    match field {
11449        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
11450            if column == "*" {
11451                return Some("*".to_string());
11452            }
11453            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
11454                Some(column.clone())
11455            } else {
11456                Some(format!("{table}.{column}"))
11457            }
11458        }
11459        _ => None,
11460    }
11461}
11462
11463fn legacy_resource_to_iam(
11464    resource: &crate::auth::privileges::Resource,
11465    tenant: Option<&str>,
11466) -> crate::auth::policies::ResourceRef {
11467    use crate::auth::privileges::Resource;
11468
11469    let (kind, name) = match resource {
11470        Resource::Database => ("database".to_string(), "*".to_string()),
11471        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
11472        Resource::Table { schema, table } => (
11473            "table".to_string(),
11474            match schema {
11475                Some(s) => format!("{s}.{table}"),
11476                None => table.clone(),
11477            },
11478        ),
11479        Resource::Function { schema, name } => (
11480            "function".to_string(),
11481            match schema {
11482                Some(s) => format!("{s}.{name}"),
11483                None => name.clone(),
11484            },
11485        ),
11486    };
11487
11488    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
11489    if let Some(t) = tenant {
11490        out = out.with_tenant(t.to_string());
11491    }
11492    out
11493}
11494
/// Name and alias of one table operand of a join, as produced by
/// `table_side_context`.
#[derive(Debug)]
struct JoinTableSide {
    /// The table's name.
    table: String,
    /// The table's alias; `table_side_context` falls back to the table
    /// name when the query gives no alias.
    alias: String,
}
11500
11501fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
11502    match expr {
11503        QueryExpr::Table(table) => Some(JoinTableSide {
11504            table: table.table.clone(),
11505            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
11506        }),
11507        _ => None,
11508    }
11509}
11510
11511fn collect_projection_columns_for_table(
11512    projection: &Projection,
11513    table: &str,
11514    alias: Option<&str>,
11515    out: &mut BTreeSet<String>,
11516) {
11517    match projection {
11518        Projection::Column(column) | Projection::Alias(column, _) => {
11519            match split_qualified_column(column) {
11520                Some((qualifier, column))
11521                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
11522                {
11523                    push_policy_column(column, out);
11524                }
11525                Some(_) => {}
11526                None => push_policy_column(column, out),
11527            }
11528        }
11529        Projection::Field(
11530            FieldRef::TableColumn {
11531                table: qualifier,
11532                column,
11533            },
11534            _,
11535        ) => {
11536            if qualifier.is_empty()
11537                || qualifier == table
11538                || alias.is_some_and(|alias| qualifier == alias)
11539            {
11540                push_policy_column(column, out);
11541            }
11542        }
11543        Projection::Field(
11544            FieldRef::NodeProperty {
11545                alias: qualifier,
11546                property,
11547            },
11548            _,
11549        )
11550        | Projection::Field(
11551            FieldRef::EdgeProperty {
11552                alias: qualifier,
11553                property,
11554            },
11555            _,
11556        ) => {
11557            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
11558                push_policy_column(property, out);
11559            }
11560        }
11561        Projection::Function(_, args) => {
11562            for arg in args {
11563                collect_projection_columns_for_table(arg, table, alias, out);
11564            }
11565        }
11566        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
11567        Projection::Window { args, .. } => {
11568            for arg in args {
11569                collect_projection_columns_for_table(arg, table, alias, out);
11570            }
11571        }
11572    }
11573}
11574
11575fn collect_projection_columns_for_join_side(
11576    projection: &Projection,
11577    left: Option<&JoinTableSide>,
11578    right: Option<&JoinTableSide>,
11579    out: &mut HashMap<String, BTreeSet<String>>,
11580) -> RedDBResult<()> {
11581    match projection {
11582        Projection::Column(column) | Projection::Alias(column, _) => {
11583            if let Some((qualifier, column)) = split_qualified_column(column) {
11584                push_qualified_join_column(qualifier, column, left, right, out);
11585            } else {
11586                push_unqualified_join_column(column, left, right, out);
11587            }
11588        }
11589        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
11590            if table.is_empty() {
11591                push_unqualified_join_column(column, left, right, out);
11592            } else if let Some(side) = [left, right]
11593                .into_iter()
11594                .flatten()
11595                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
11596            {
11597                push_join_column(&side.table, column, out);
11598            }
11599        }
11600        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
11601        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
11602            push_qualified_join_column(alias, property, left, right, out);
11603        }
11604        Projection::Function(_, args) => {
11605            for arg in args {
11606                collect_projection_columns_for_join_side(arg, left, right, out)?;
11607            }
11608        }
11609        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
11610        Projection::Window { args, .. } => {
11611            for arg in args {
11612                collect_projection_columns_for_join_side(arg, left, right, out)?;
11613            }
11614        }
11615    }
11616    Ok(())
11617}
11618
/// Splits `qualifier.column` into its two parts.
///
/// Returns `None` when there is no `.`, when either part is empty, or when
/// the part after the first `.` contains another `.`.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    column.split_once('.').filter(|(qualifier, name)| {
        !qualifier.is_empty() && !name.is_empty() && !name.contains('.')
    })
}
11626
11627fn push_qualified_join_column(
11628    qualifier: &str,
11629    column: &str,
11630    left: Option<&JoinTableSide>,
11631    right: Option<&JoinTableSide>,
11632    out: &mut HashMap<String, BTreeSet<String>>,
11633) {
11634    if let Some(side) = [left, right]
11635        .into_iter()
11636        .flatten()
11637        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
11638    {
11639        push_join_column(&side.table, column, out);
11640    }
11641}
11642
11643fn push_unqualified_join_column(
11644    column: &str,
11645    left: Option<&JoinTableSide>,
11646    right: Option<&JoinTableSide>,
11647    out: &mut HashMap<String, BTreeSet<String>>,
11648) {
11649    for side in [left, right].into_iter().flatten() {
11650        push_join_column(&side.table, column, out);
11651    }
11652}
11653
11654fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
11655    if is_policy_column_name(column) {
11656        out.entry(table.to_string())
11657            .or_default()
11658            .insert(column.to_string());
11659    }
11660}
11661
11662fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
11663    if is_policy_column_name(column) {
11664        out.insert(column.to_string());
11665    }
11666}
11667
/// Reports whether `column` is a name that should be tracked for policy
/// checks.
///
/// Rejects the empty string, the `"*"` wildcard, and names starting with
/// the `LIT:` or `TYPE:` prefixes (case-sensitive).
fn is_policy_column_name(column: &str) -> bool {
    if column.is_empty() || column == "*" {
        return false;
    }
    !["LIT:", "TYPE:"]
        .iter()
        .any(|prefix| column.starts_with(*prefix))
}
11674
11675fn runtime_iam_context(
11676    role: crate::auth::Role,
11677    tenant: Option<&str>,
11678    principal_is_system_owned: bool,
11679) -> crate::auth::policies::EvalContext {
11680    crate::auth::policies::EvalContext {
11681        principal_tenant: tenant.map(|t| t.to_string()),
11682        current_tenant: tenant.map(|t| t.to_string()),
11683        peer_ip: None,
11684        mfa_present: false,
11685        now_ms: crate::auth::now_ms(),
11686        principal_is_admin_role: role == crate::auth::Role::Admin,
11687        principal_is_system_owned,
11688        principal_is_platform_scoped: tenant.is_none(),
11689    }
11690}
11691
11692fn explicit_table_projection_columns(
11693    query: &crate::storage::query::ast::TableQuery,
11694) -> Vec<String> {
11695    use crate::storage::query::ast::{FieldRef, Projection};
11696
11697    let mut columns = Vec::new();
11698    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
11699        match projection {
11700            Projection::Column(column) | Projection::Alias(column, _) => {
11701                push_unique(&mut columns, column)
11702            }
11703            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
11704                push_unique(&mut columns, column)
11705            }
11706            // SELECT * and expression/function projections need the
11707            // executor-wide column-policy context mapped in
11708            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
11709            _ => {}
11710        }
11711    }
11712    columns
11713}
11714
11715fn explicit_graph_projection_properties(
11716    query: &crate::storage::query::ast::GraphQuery,
11717) -> Vec<String> {
11718    use crate::storage::query::ast::{FieldRef, Projection};
11719
11720    let mut columns = Vec::new();
11721    for projection in &query.return_ {
11722        match projection {
11723            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
11724            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
11725                push_unique(&mut columns, property.clone())
11726            }
11727            _ => {}
11728        }
11729    }
11730    columns
11731}
11732
/// Appends `column` to `columns` unless an equal entry is already present.
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
11738
11739fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
11740    use crate::storage::query::ast::PolicyPrincipalRef;
11741    match p {
11742        PolicyPrincipalRef::User(u) => match &u.tenant {
11743            Some(t) => format!("user:{t}/{}", u.username),
11744            None => format!("user:{}", u.username),
11745        },
11746        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
11747    }
11748}
11749
11750/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
11751/// shape used by every audit emit + the simulator response.
11752pub(crate) fn decision_to_strings(
11753    d: &crate::auth::policies::Decision,
11754) -> (String, Option<String>, Option<String>) {
11755    use crate::auth::policies::Decision;
11756    match d {
11757        Decision::Allow {
11758            matched_policy_id,
11759            matched_sid,
11760        } => (
11761            "allow".into(),
11762            Some(matched_policy_id.clone()),
11763            matched_sid.clone(),
11764        ),
11765        Decision::Deny {
11766            matched_policy_id,
11767            matched_sid,
11768        } => (
11769            "deny".into(),
11770            Some(matched_policy_id.clone()),
11771            matched_sid.clone(),
11772        ),
11773        Decision::DefaultDeny => ("default_deny".into(), None, None),
11774        Decision::AdminBypass => ("admin_bypass".into(), None, None),
11775    }
11776}
11777
11778fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
11779    let mut scopes = Vec::new();
11780    collect_relation_scopes(query, &mut scopes);
11781    scopes.sort();
11782    scopes.dedup();
11783    scopes
11784}
11785
11786fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
11787    match query {
11788        QueryExpr::Table(table) => {
11789            if !table.table.is_empty() {
11790                scopes.push(table.table.clone());
11791            }
11792            if let Some(alias) = &table.alias {
11793                scopes.push(alias.clone());
11794            }
11795        }
11796        QueryExpr::Join(join) => {
11797            collect_relation_scopes(&join.left, scopes);
11798            collect_relation_scopes(&join.right, scopes);
11799        }
11800        _ => {}
11801    }
11802}
11803
11804fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
11805    let inner_scopes = relation_scopes_for_query(query);
11806    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
11807}
11808
11809fn query_expr_references_outer_scope(
11810    query: &QueryExpr,
11811    outer_scopes: &[String],
11812    inner_scopes: &[String],
11813) -> bool {
11814    match query {
11815        QueryExpr::Table(table) => {
11816            table.select_items.iter().any(|item| match item {
11817                crate::storage::query::ast::SelectItem::Wildcard => false,
11818                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
11819                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11820                }
11821            }) || table
11822                .where_expr
11823                .as_ref()
11824                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
11825                || table.filter.as_ref().is_some_and(|filter| {
11826                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
11827                })
11828                || table.having_expr.as_ref().is_some_and(|expr| {
11829                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11830                })
11831                || table.having.as_ref().is_some_and(|filter| {
11832                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
11833                })
11834                || table
11835                    .group_by_exprs
11836                    .iter()
11837                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
11838                || table.order_by.iter().any(|clause| {
11839                    clause.expr.as_ref().is_some_and(|expr| {
11840                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11841                    })
11842                })
11843        }
11844        QueryExpr::Join(join) => {
11845            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
11846                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
11847                || join.filter.as_ref().is_some_and(|filter| {
11848                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
11849                })
11850                || join.return_items.iter().any(|item| match item {
11851                    crate::storage::query::ast::SelectItem::Wildcard => false,
11852                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
11853                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
11854                    }
11855                })
11856        }
11857        _ => false,
11858    }
11859}
11860
11861fn filter_references_outer_scope(
11862    filter: &crate::storage::query::ast::Filter,
11863    outer_scopes: &[String],
11864    inner_scopes: &[String],
11865) -> bool {
11866    use crate::storage::query::ast::Filter;
11867    match filter {
11868        Filter::Compare { field, .. }
11869        | Filter::IsNull(field)
11870        | Filter::IsNotNull(field)
11871        | Filter::In { field, .. }
11872        | Filter::Between { field, .. }
11873        | Filter::Like { field, .. }
11874        | Filter::StartsWith { field, .. }
11875        | Filter::EndsWith { field, .. }
11876        | Filter::Contains { field, .. } => {
11877            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
11878        }
11879        Filter::CompareFields { left, right, .. } => {
11880            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
11881                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
11882        }
11883        Filter::CompareExpr { lhs, rhs, .. } => {
11884            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
11885                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
11886        }
11887        Filter::And(left, right) | Filter::Or(left, right) => {
11888            filter_references_outer_scope(left, outer_scopes, inner_scopes)
11889                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
11890        }
11891        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
11892    }
11893}
11894
11895fn expr_references_outer_scope(
11896    expr: &crate::storage::query::ast::Expr,
11897    outer_scopes: &[String],
11898    inner_scopes: &[String],
11899) -> bool {
11900    use crate::storage::query::ast::Expr;
11901    match expr {
11902        Expr::Column { field, .. } => {
11903            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
11904        }
11905        Expr::BinaryOp { lhs, rhs, .. } => {
11906            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
11907                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
11908        }
11909        Expr::UnaryOp { operand, .. }
11910        | Expr::Cast { inner: operand, .. }
11911        | Expr::IsNull { operand, .. } => {
11912            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
11913        }
11914        Expr::FunctionCall { args, .. } => args
11915            .iter()
11916            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
11917        Expr::Case {
11918            branches, else_, ..
11919        } => {
11920            branches.iter().any(|(cond, value)| {
11921                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
11922                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
11923            }) || else_
11924                .as_ref()
11925                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
11926        }
11927        Expr::InList { target, values, .. } => {
11928            expr_references_outer_scope(target, outer_scopes, inner_scopes)
11929                || values
11930                    .iter()
11931                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
11932        }
11933        Expr::Between {
11934            target, low, high, ..
11935        } => {
11936            expr_references_outer_scope(target, outer_scopes, inner_scopes)
11937                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
11938                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
11939        }
11940        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
11941        Expr::Literal { .. } | Expr::Parameter { .. } => false,
11942        Expr::WindowFunctionCall { args, window, .. } => {
11943            args.iter()
11944                .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes))
11945                || window
11946                    .partition_by
11947                    .iter()
11948                    .any(|e| expr_references_outer_scope(e, outer_scopes, inner_scopes))
11949                || window
11950                    .order_by
11951                    .iter()
11952                    .any(|o| expr_references_outer_scope(&o.expr, outer_scopes, inner_scopes))
11953        }
11954    }
11955}
11956
11957fn field_ref_references_outer_scope(
11958    field: &crate::storage::query::ast::FieldRef,
11959    outer_scopes: &[String],
11960    inner_scopes: &[String],
11961) -> bool {
11962    match field {
11963        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
11964            outer_scopes.iter().any(|scope| scope == table)
11965                && !inner_scopes.iter().any(|scope| scope == table)
11966        }
11967        _ => false,
11968    }
11969}
11970
11971fn first_column_values(
11972    result: crate::storage::query::unified::UnifiedResult,
11973) -> RedDBResult<Vec<Value>> {
11974    if result.columns.len() > 1 {
11975        return Err(RedDBError::Query(
11976            "expression subquery must return exactly one column".to_string(),
11977        ));
11978    }
11979    let fallback_column = result
11980        .records
11981        .first()
11982        .and_then(|record| record.column_names().into_iter().next())
11983        .map(|name| name.to_string());
11984    let column = result.columns.first().cloned().or(fallback_column);
11985    let Some(column) = column else {
11986        return Ok(Vec::new());
11987    };
11988    Ok(result
11989        .records
11990        .iter()
11991        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
11992        .collect())
11993}
11994
11995fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
11996    // Bare integer ms.
11997    if let Ok(n) = s.parse::<u128>() {
11998        return Some(n);
11999    }
12000    // Fallback: ISO-8601 like 2030-01-02 03:04:05 — accept the date
12001    // portion only (midnight UTC). Full RFC3339 parsing is a stretch
12002    // goal; the common case is `'2030-01-01'`.
12003    if let Some(date) = s.split_whitespace().next() {
12004        let parts: Vec<&str> = date.split('-').collect();
12005        if parts.len() == 3 {
12006            let (y, m, d) = (parts[0], parts[1], parts[2]);
12007            if let (Ok(y), Ok(m), Ok(d)) = (y.parse::<i64>(), m.parse::<u32>(), d.parse::<u32>()) {
12008                // Days since 1970-01-01 — simple Julian arithmetic
12009                // suitable for years 1970-2100. Good enough for test
12010                // fixtures; precise parsing lands when we wire chrono.
12011                let days_in = days_from_civil(y, m, d);
12012                return Some((days_in as u128) * 86_400_000u128);
12013            }
12014        }
12015    }
12016    None
12017}
12018
/// Days between 1970-01-01 and the proleptic Gregorian date `y`-`m`-`d`,
/// computed with H. Hinnant's `days_from_civil` algorithm. Negative for
/// dates before the epoch. Used by `parse_timestamp_to_ms`.
///
/// `m` is expected to be in `1..=12` and is not validated. `d` is not
/// validated either: values past the end of the month roll forward, and
/// `d == 0` denotes the day before the first of the month.
///
/// All arithmetic is signed so that `d == 0` in March cannot underflow.
/// Years of astronomically large magnitude can still overflow `i64`.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    // Treat the year as starting on 1 March so the leap day is its last day.
    let y = if m <= 2 { y - 1 } else { y };
    // 400-year Gregorian cycle, floor-divided so negative years work.
    let era = y.div_euclid(400);
    let yoe = y.rem_euclid(400); // [0, 399]
    // Month index counted from March: Mar = 0, ..., Feb = 11.
    let mp = i64::from(if m > 2 { m - 3 } else { m + 9 });
    let doy = (153 * mp + 2) / 5 + i64::from(d) - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    // 719468 = days from 0000-03-01 to 1970-01-01.
    era * 146097 + doe - 719468
}
12029
12030fn walk_plan_node(
12031    node: &crate::storage::query::planner::CanonicalLogicalNode,
12032    depth: usize,
12033    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
12034) {
12035    use std::sync::Arc;
12036    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
12037    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
12038    rec.set_arc(
12039        Arc::from("source"),
12040        node.source.clone().map(Value::text).unwrap_or(Value::Null),
12041    );
12042    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
12043    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
12044    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
12045    out.push(rec);
12046    for child in &node.children {
12047        walk_plan_node(child, depth + 1, out);
12048    }
12049}