Skip to main content

reddb_server/runtime/
impl_core.rs

1use super::*;
2use crate::application::entity::metadata_to_json;
3use crate::auth::column_policy_gate::ColumnAccessRequest;
4use crate::auth::UserId;
5use crate::replication::cdc::ChangeRecord;
6use crate::replication::logical::{ApplyMode, LogicalChangeApplier};
7use crate::storage::query::ast::TableSource;
8
9thread_local! {
10    /// Current connection id for the executing statement. Set by the
11    /// per-connection wrapper (stdio/gRPC handlers) before dispatching
12    /// into `execute_query`; falls back to `0` for embedded callers.
13    static CURRENT_CONN_ID: std::cell::Cell<u64> = const { std::cell::Cell::new(0) };
14
15    /// Authenticated user + role for the executing statement (Phase 2.5.2
16    /// RLS enforcement). Set by the transport middleware after validating
17    /// credentials (password / cert / oauth); unset means "anonymous" /
18    /// "embedded" — RLS policies degrade to the role-agnostic subset.
19    ///
20    /// `None` skips RLS injection entirely; `Some((username, role))`
21    /// passes `role` to `matching_rls_policies(table, Some(role), action)`.
22    static CURRENT_AUTH_IDENTITY: std::cell::RefCell<Option<(String, crate::auth::Role)>> =
23        const { std::cell::RefCell::new(None) };
24
25    /// MVCC snapshot scoped to the currently-executing statement (Phase
26    /// 2.3.2d PG parity). `execute_query` captures it on entry and drops
27    /// it on exit; every scan consults it via
28    /// `entity_visible_under_current_snapshot` to hide tuples whose xmin
29    /// hasn't committed or whose xmax already has.
30    ///
31    /// `None` means "pre-MVCC semantics" — the read path returns every
32    /// tuple regardless of xmin/xmax. All embedded callers that bypass
33    /// `execute_query` see this default.
34    static CURRENT_SNAPSHOT: std::cell::RefCell<Option<SnapshotContext>> =
35        const { std::cell::RefCell::new(None) };
36
37    /// Cheap presence flag for `CURRENT_SNAPSHOT`. Scan hot paths
38    /// poll this instead of `borrow()`-ing the RefCell on every
39    /// row — the common case (autocommit / no MVCC session) reads
40    /// one atomic `Cell<bool>` and short-circuits, saving ~10ns × N
41    /// rows on aggregate_group / select_range scans.
42    static HAS_SNAPSHOT: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };
43
44    /// Session-scoped tenant id for the current connection (Phase 2.5.3
45    /// multi-tenancy). Populated by `SET TENANT 'id'` or by transport
46    /// middleware after resolving tenant from auth claims. Read by the
47    /// `CURRENT_TENANT()` scalar function — RLS policies typically
48    /// combine it as `USING (tenant_id = CURRENT_TENANT())` to scope
49    /// every query to one tenant.
50    ///
51    /// `None` means "no tenant bound" — `CURRENT_TENANT()` returns
52    /// NULL, and RLS policies that gate on it hide every row.
53    static CURRENT_TENANT_ID: std::cell::RefCell<Option<String>> =
54        const { std::cell::RefCell::new(None) };
55
56    /// Statement-local config resolver. SQL expressions materialize the
57    /// `red_config` snapshot lazily on the first `$config.*`/`CONFIG()`
58    /// access, keeping ordinary statements on the zero-scan path.
59    static CURRENT_CONFIG_RESOLVER: std::cell::RefCell<Option<ConfigResolver>> =
60        const { std::cell::RefCell::new(None) };
61
62    /// Statement-local secret resolver. SQL expressions materialize the
63    /// vault KV snapshot lazily on first `$secret.*` access, then use
64    /// lock-free map reads for the rest of the statement.
65    static CURRENT_SECRET_RESOLVER: std::cell::RefCell<Option<SecretResolver>> =
66        const { std::cell::RefCell::new(None) };
67}
68
69fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
70    match value {
71        Value::Text(s) => Ok(s.to_string()),
72        Value::Integer(n) => Ok(n.to_string()),
73        Value::UnsignedInteger(n) => Ok(n.to_string()),
74        Value::Float(n) => Ok(n.to_string()),
75        Value::Boolean(b) => Ok(b.to_string()),
76        Value::Null => Err(RedDBError::Query(
77            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
78                .to_string(),
79        )),
80        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
81            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
82                .to_string(),
83        )),
84        _ => Err(RedDBError::Query(format!(
85            "SET SECRET does not support value type {:?} yet",
86            value.data_type()
87        ))),
88    }
89}
90
91#[derive(Clone)]
92struct QueryControlEventSpec {
93    kind: crate::runtime::control_events::EventKind,
94    action: &'static str,
95    resource: Option<String>,
96    fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
97}
98
/// What `query_audit_plan` extracted from a statement for auditing.
#[derive(Clone)]
struct QueryAuditPlan {
    /// One of `"select"`, `"insert"`, `"update"` or `"delete"`.
    statement_kind: &'static str,
    /// De-duplicated user collections the statement touches, in first-seen
    /// order; internal (`red*` / `__red_schema_*`) names are excluded.
    collections: Vec<String>,
}
104
105fn query_audit_plan(expr: &QueryExpr) -> Option<QueryAuditPlan> {
106    let mut collections = Vec::new();
107    let statement_kind = match expr {
108        QueryExpr::Table(table) => {
109            push_query_audit_collection(&mut collections, &table.table);
110            "select"
111        }
112        QueryExpr::Join(join) => {
113            collect_query_audit_collections(&join.left, &mut collections);
114            collect_query_audit_collections(&join.right, &mut collections);
115            "select"
116        }
117        QueryExpr::Insert(insert) => {
118            push_query_audit_collection(&mut collections, &insert.table);
119            "insert"
120        }
121        QueryExpr::Update(update) => {
122            push_query_audit_collection(&mut collections, &update.table);
123            "update"
124        }
125        QueryExpr::Delete(delete) => {
126            push_query_audit_collection(&mut collections, &delete.table);
127            "delete"
128        }
129        _ => return None,
130    };
131    if collections.is_empty() {
132        None
133    } else {
134        Some(QueryAuditPlan {
135            statement_kind,
136            collections,
137        })
138    }
139}
140
141fn collect_query_audit_collections(expr: &QueryExpr, collections: &mut Vec<String>) {
142    match expr {
143        QueryExpr::Table(table) => push_query_audit_collection(collections, &table.table),
144        QueryExpr::Join(join) => {
145            collect_query_audit_collections(&join.left, collections);
146            collect_query_audit_collections(&join.right, collections);
147        }
148        _ => {}
149    }
150}
151
/// Record `name` in `collections` unless it is an internal collection
/// (`red`, `red.*`, `__red_schema_*`) or is already present.
fn push_query_audit_collection(collections: &mut Vec<String>, name: &str) {
    let is_internal =
        name == "red" || name.starts_with("red.") || name.starts_with("__red_schema_");
    let already_listed = || collections.iter().any(|existing| existing == name);

    if is_internal || already_listed() {
        return;
    }
    collections.push(name.to_owned());
}
160
161impl RedDBRuntime {
162    fn execute_create_metric(
163        &self,
164        raw_query: &str,
165        query: &crate::storage::query::ast::CreateMetricQuery,
166    ) -> RedDBResult<RuntimeQueryResult> {
167        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
168        let store = self.inner.db.store();
169        super::metric_descriptor_catalog::create(
170            store.as_ref(),
171            &query.path,
172            &query.kind,
173            &query.role,
174            super::metric_descriptor_catalog::DerivedSpec {
175                source: query.source.clone(),
176                query: query.query.clone(),
177                window_ms: query.window_ms,
178                time_field: query.time_field.clone(),
179            },
180        )?;
181        self.invalidate_result_cache();
182        Ok(RuntimeQueryResult::ok_message(
183            raw_query.to_string(),
184            &format!("metric descriptor '{}' created", query.path),
185            "create",
186        ))
187    }
188
189    fn execute_alter_metric(
190        &self,
191        raw_query: &str,
192        query: &crate::storage::query::ast::AlterMetricQuery,
193    ) -> RedDBResult<RuntimeQueryResult> {
194        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
195        let store = self.inner.db.store();
196        super::metric_descriptor_catalog::update(
197            store.as_ref(),
198            &query.path,
199            query.set_role.as_deref(),
200            query.attempted_kind.as_deref(),
201            query.attempted_path.as_deref(),
202        )?;
203        self.invalidate_result_cache();
204        Ok(RuntimeQueryResult::ok_message(
205            raw_query.to_string(),
206            &format!("metric descriptor '{}' updated", query.path),
207            "alter",
208        ))
209    }
210
211    fn execute_create_slo(
212        &self,
213        raw_query: &str,
214        query: &crate::storage::query::ast::CreateSloQuery,
215    ) -> RedDBResult<RuntimeQueryResult> {
216        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
217        let store = self.inner.db.store();
218        super::slo_descriptor_catalog::create(
219            store.as_ref(),
220            &query.path,
221            &query.metric_path,
222            query.target,
223            query.window_ms,
224        )?;
225        self.invalidate_result_cache();
226        Ok(RuntimeQueryResult::ok_message(
227            raw_query.to_string(),
228            &format!("SLO descriptor '{}' created", query.path),
229            "create",
230        ))
231    }
232
233    fn execute_create_analytics_source(
234        &self,
235        raw_query: &str,
236        query: super::analytics_source_catalog::CreateAnalyticsSourceProfile,
237    ) -> RedDBResult<RuntimeQueryResult> {
238        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
239        let store = self.inner.db.store();
240        let profile = super::analytics_source_catalog::create(
241            store.as_ref(),
242            &self.inner.db.collection_contracts(),
243            query,
244        )?;
245        self.invalidate_result_cache();
246        Ok(RuntimeQueryResult::ok_message(
247            raw_query.to_string(),
248            &format!("analytics source '{}' created", profile.name),
249            "create",
250        ))
251    }
252}
253
254fn query_control_event_specs(expr: &QueryExpr) -> Vec<QueryControlEventSpec> {
255    use crate::runtime::control_events::{EventKind, Sensitivity};
256
257    let mut specs = Vec::new();
258    let mut schema = |action: &'static str, resource: Option<String>| {
259        specs.push(QueryControlEventSpec {
260            kind: EventKind::SchemaDdl,
261            action,
262            resource,
263            fields: Vec::new(),
264        });
265    };
266    match expr {
267        QueryExpr::CreateTable(q) => {
268            schema("create_table", Some(format!("table:{}", q.name)));
269            if let Some(column) = &q.tenant_by {
270                specs.push(QueryControlEventSpec {
271                    kind: EventKind::TenantGovernance,
272                    action: "create_table_tenant_by",
273                    resource: Some(format!("table:{}", q.name)),
274                    fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
275                });
276            }
277        }
278        QueryExpr::CreateCollection(q) => {
279            schema("create_collection", Some(format!("collection:{}", q.name)));
280        }
281        QueryExpr::CreateVector(q) => schema("create_vector", Some(format!("vector:{}", q.name))),
282        QueryExpr::DropTable(q) => schema("drop_table", Some(format!("table:{}", q.name))),
283        QueryExpr::DropGraph(q) => schema("drop_graph", Some(format!("graph:{}", q.name))),
284        QueryExpr::DropVector(q) => schema("drop_vector", Some(format!("vector:{}", q.name))),
285        QueryExpr::DropDocument(q) => {
286            schema("drop_document", Some(format!("document:{}", q.name)));
287        }
288        QueryExpr::DropKv(q) => schema("drop_kv", Some(format!("kv:{}", q.name))),
289        QueryExpr::DropCollection(q) => {
290            schema("drop_collection", Some(format!("collection:{}", q.name)));
291        }
292        QueryExpr::Truncate(q) => schema("truncate", Some(format!("collection:{}", q.name))),
293        QueryExpr::AlterTable(q) => {
294            schema("alter_table", Some(format!("table:{}", q.name)));
295            for op in &q.operations {
296                match op {
297                    crate::storage::query::ast::AlterOperation::EnableRowLevelSecurity => {
298                        specs.push(QueryControlEventSpec {
299                            kind: EventKind::RlsGovernance,
300                            action: "enable_rls",
301                            resource: Some(format!("table:{}", q.name)),
302                            fields: Vec::new(),
303                        });
304                    }
305                    crate::storage::query::ast::AlterOperation::DisableRowLevelSecurity => {
306                        specs.push(QueryControlEventSpec {
307                            kind: EventKind::RlsGovernance,
308                            action: "disable_rls",
309                            resource: Some(format!("table:{}", q.name)),
310                            fields: Vec::new(),
311                        });
312                    }
313                    crate::storage::query::ast::AlterOperation::EnableTenancy { column } => {
314                        specs.push(QueryControlEventSpec {
315                            kind: EventKind::TenantGovernance,
316                            action: "enable_tenancy",
317                            resource: Some(format!("table:{}", q.name)),
318                            fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
319                        });
320                    }
321                    crate::storage::query::ast::AlterOperation::DisableTenancy => {
322                        specs.push(QueryControlEventSpec {
323                            kind: EventKind::TenantGovernance,
324                            action: "disable_tenancy",
325                            resource: Some(format!("table:{}", q.name)),
326                            fields: Vec::new(),
327                        });
328                    }
329                    _ => {}
330                }
331            }
332        }
333        QueryExpr::CreateIndex(q) => {
334            schema(
335                "create_index",
336                Some(format!("index:{}:{}", q.table, q.name)),
337            );
338        }
339        QueryExpr::DropIndex(q) => {
340            schema("drop_index", Some(format!("index:{}:{}", q.table, q.name)));
341        }
342        QueryExpr::CreateTimeSeries(q) => {
343            schema("create_timeseries", Some(format!("timeseries:{}", q.name)));
344        }
345        QueryExpr::CreateMetric(q) => {
346            schema("create_metric", Some(format!("metric:{}", q.path)));
347        }
348        QueryExpr::AlterMetric(q) => {
349            schema("alter_metric", Some(format!("metric:{}", q.path)));
350        }
351        QueryExpr::CreateSlo(q) => {
352            schema("create_slo", Some(format!("slo:{}", q.path)));
353        }
354        QueryExpr::DropTimeSeries(q) => {
355            schema("drop_timeseries", Some(format!("timeseries:{}", q.name)));
356        }
357        QueryExpr::CreateQueue(q) => schema("create_queue", Some(format!("queue:{}", q.name))),
358        QueryExpr::AlterQueue(q) => schema("alter_queue", Some(format!("queue:{}", q.name))),
359        QueryExpr::DropQueue(q) => schema("drop_queue", Some(format!("queue:{}", q.name))),
360        QueryExpr::CreateTree(q) => {
361            schema(
362                "create_tree",
363                Some(format!("tree:{}:{}", q.collection, q.name)),
364            );
365        }
366        QueryExpr::DropTree(q) => {
367            schema(
368                "drop_tree",
369                Some(format!("tree:{}:{}", q.collection, q.name)),
370            );
371        }
372        QueryExpr::CreateSchema(q) => schema("create_schema", Some(format!("schema:{}", q.name))),
373        QueryExpr::DropSchema(q) => schema("drop_schema", Some(format!("schema:{}", q.name))),
374        QueryExpr::CreateSequence(q) => {
375            schema("create_sequence", Some(format!("sequence:{}", q.name)));
376        }
377        QueryExpr::DropSequence(q) => schema("drop_sequence", Some(format!("sequence:{}", q.name))),
378        QueryExpr::CreateView(q) => schema("create_view", Some(format!("view:{}", q.name))),
379        QueryExpr::DropView(q) => schema("drop_view", Some(format!("view:{}", q.name))),
380        QueryExpr::RefreshMaterializedView(q) => {
381            schema(
382                "refresh_materialized_view",
383                Some(format!("view:{}", q.name)),
384            );
385        }
386        QueryExpr::CreatePolicy(q) => {
387            specs.push(QueryControlEventSpec {
388                kind: EventKind::RlsGovernance,
389                action: "create_policy",
390                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
391                fields: vec![(
392                    "target_kind".to_string(),
393                    Sensitivity::raw(q.target_kind.as_ident()),
394                )],
395            });
396        }
397        QueryExpr::DropPolicy(q) => {
398            specs.push(QueryControlEventSpec {
399                kind: EventKind::RlsGovernance,
400                action: "drop_policy",
401                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
402                fields: Vec::new(),
403            });
404        }
405        QueryExpr::SetTenant(value) => {
406            let mut fields = Vec::new();
407            if let Some(value) = value {
408                fields.push(("tenant".to_string(), Sensitivity::raw(value)));
409            }
410            specs.push(QueryControlEventSpec {
411                kind: EventKind::TenantGovernance,
412                action: "set_tenant",
413                resource: Some("tenant:session".to_string()),
414                fields,
415            });
416        }
417        QueryExpr::SetConfig { key, .. } => {
418            specs.push(QueryControlEventSpec {
419                kind: EventKind::ConfigWrite,
420                action: "config:write",
421                resource: Some(format!("config:{key}")),
422                fields: vec![("key".to_string(), Sensitivity::raw(key))],
423            });
424        }
425        QueryExpr::ConfigCommand(cmd) => match cmd {
426            crate::storage::query::ast::ConfigCommand::Put {
427                collection, key, ..
428            }
429            | crate::storage::query::ast::ConfigCommand::Rotate {
430                collection, key, ..
431            } => {
432                let target = format!("{collection}/{key}");
433                specs.push(QueryControlEventSpec {
434                    kind: EventKind::ConfigWrite,
435                    action: "config:write",
436                    resource: Some(format!("config:{target}")),
437                    fields: vec![
438                        ("collection".to_string(), Sensitivity::raw(collection)),
439                        ("key".to_string(), Sensitivity::raw(key)),
440                    ],
441                });
442            }
443            crate::storage::query::ast::ConfigCommand::Delete { collection, key } => {
444                let target = format!("{collection}/{key}");
445                specs.push(QueryControlEventSpec {
446                    kind: EventKind::ConfigDelete,
447                    action: "config:write",
448                    resource: Some(format!("config:{target}")),
449                    fields: vec![
450                        ("collection".to_string(), Sensitivity::raw(collection)),
451                        ("key".to_string(), Sensitivity::raw(key)),
452                    ],
453                });
454            }
455            _ => {}
456        },
457        QueryExpr::AlterUser(stmt) => {
458            let disables = stmt.attributes.iter().any(|attr| {
459                matches!(
460                    attr,
461                    crate::storage::query::ast::AlterUserAttribute::Disable
462                )
463            });
464            specs.push(QueryControlEventSpec {
465                kind: if disables {
466                    EventKind::UserDisable
467                } else {
468                    EventKind::UserUpdate
469                },
470                action: "alter_user",
471                resource: Some(format!("user:{}", stmt.username)),
472                fields: Vec::new(),
473            });
474        }
475        _ => {}
476    }
477    specs
478}
479
480fn control_event_outcome_for_error(err: &RedDBError) -> crate::runtime::control_events::Outcome {
481    match err {
482        RedDBError::ReadOnly(_) => crate::runtime::control_events::Outcome::Denied,
483        RedDBError::Query(msg)
484            if msg.contains("permission denied")
485                || msg.contains("cannot issue")
486                || msg.contains("lacks") =>
487        {
488            crate::runtime::control_events::Outcome::Denied
489        }
490        _ => crate::runtime::control_events::Outcome::Error,
491    }
492}
493
494/// Convert the rows produced by a materialized-view body into
495/// `UnifiedEntity` table rows targeting the backing collection.
496/// Issue #595 slice 9c — feeds `UnifiedStore::refresh_collection`.
497///
498/// Graph fragments and vector hits are ignored: a materialized view
499/// is a relational result set (SELECT-shaped); slices 11+ may extend
500/// this once we have a richer view body shape. Each row materialises
501/// the union of its schema-bound columns + overflow.
502fn view_records_to_entities(
503    table: &str,
504    records: &[crate::storage::query::unified::UnifiedRecord],
505) -> Vec<crate::storage::UnifiedEntity> {
506    use std::collections::HashMap;
507    let table_arc: std::sync::Arc<str> = std::sync::Arc::from(table);
508    let mut out = Vec::with_capacity(records.len());
509    for record in records {
510        let mut named: HashMap<String, crate::storage::schema::Value> = HashMap::new();
511        for (name, value) in record.iter_fields() {
512            named.insert(name.to_string(), value.clone());
513        }
514        let entity = crate::storage::UnifiedEntity::new(
515            crate::storage::EntityId::new(0),
516            crate::storage::EntityKind::TableRow {
517                table: std::sync::Arc::clone(&table_arc),
518                row_id: 0,
519            },
520            crate::storage::EntityData::Row(crate::storage::RowData {
521                columns: Vec::new(),
522                named: Some(named),
523                schema: None,
524            }),
525        );
526        out.push(entity);
527    }
528    out
529}
530
531fn system_keyed_collection_contract(
532    name: &str,
533    model: crate::catalog::CollectionModel,
534) -> crate::physical::CollectionContract {
535    let now = crate::utils::now_unix_millis() as u128;
536    crate::physical::CollectionContract {
537        name: name.to_string(),
538        declared_model: model,
539        schema_mode: crate::catalog::SchemaMode::Dynamic,
540        origin: crate::physical::ContractOrigin::Implicit,
541        version: 1,
542        created_at_unix_ms: now,
543        updated_at_unix_ms: now,
544        default_ttl_ms: None,
545        vector_dimension: None,
546        vector_metric: None,
547        context_index_fields: Vec::new(),
548        declared_columns: Vec::new(),
549        table_def: None,
550        timestamps_enabled: false,
551        context_index_enabled: false,
552        metrics_raw_retention_ms: None,
553        metrics_rollup_policies: Vec::new(),
554        metrics_tenant_identity: None,
555        metrics_namespace: None,
556        append_only: false,
557        subscriptions: Vec::new(),
558        analytics_config: Vec::new(),
559        session_key: None,
560        session_gap_ms: None,
561        retention_duration_ms: None,
562    }
563}
564
565/// Snapshot + manager pair used for read-path visibility checks.
566///
567/// The manager is needed in addition to the snapshot because `aborted`
568/// state mutates after the snapshot is captured — a ROLLBACK by a
569/// committed-at-capture-time writer must still hide its tuples. Keeping
570/// the Arc around is O(pointer) and the RwLock reads on `is_aborted`
571/// are cheap (HashSet lookup under a parking_lot read guard).
572///
573/// `own_xids` (Phase 2.3.2e) lists the xids belonging to the current
574/// connection's transaction — the parent xid plus open and released
575/// savepoint sub-xids. The visibility rule promotes rows stamped with
576/// these xids to "always visible (unless aborted)" so the writer sees
577/// its own nested-savepoint writes even though their xids exceed
578/// `snapshot.xid`.
579#[derive(Clone)]
580pub struct SnapshotContext {
581    pub snapshot: crate::storage::transaction::snapshot::Snapshot,
582    pub manager: Arc<crate::storage::transaction::snapshot::SnapshotManager>,
583    pub own_xids: std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
584    pub requires_index_fallback: bool,
585}
586
587/// Install a connection id on the current thread for the duration of a
588/// statement. Transaction state (`RuntimeInner::tx_contexts`) is keyed
589/// by this id so different connections can hold independent BEGINs.
590///
591/// Pub so transports (PG wire, gRPC, HTTP per-request spawners) and
592/// tests can emulate per-connection isolation. Call it once when
593/// binding the connection's worker thread; pair with
594/// `clear_current_connection_id` on teardown.
595pub fn set_current_connection_id(id: u64) {
596    CURRENT_CONN_ID.with(|c| c.set(id));
597}
598
599/// Reset the thread's connection id back to `0` (autocommit).
600pub fn clear_current_connection_id() {
601    CURRENT_CONN_ID.with(|c| c.set(0));
602}
603
604/// Read the connection id set by `set_current_connection_id`. Returns
605/// `0` when no wrapper installed one — auto-commit path.
606pub fn current_connection_id() -> u64 {
607    CURRENT_CONN_ID.with(|c| c.get())
608}
609
610/// Install the authenticated identity for the current thread (Phase 2.5.2
611/// RLS enforcement). Transport layers call this right after resolving
612/// auth so the query dispatch can fold RLS policies into the filter.
613pub fn set_current_auth_identity(username: String, role: crate::auth::Role) {
614    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = Some((username, role)));
615}
616
617/// Clear the thread-local auth identity. Transports call this after the
618/// statement completes so pooled threads don't leak identities across
619/// requests.
620pub fn clear_current_auth_identity() {
621    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = None);
622}
623
624/// Read the current-thread auth identity. `None` when no transport
625/// installed one (embedded mode / anonymous access).
626pub(crate) fn current_auth_identity() -> Option<(String, crate::auth::Role)> {
627    CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone())
628}
629
630/// Public probe of the thread-local auth identity for callers outside
631/// the `runtime` module (e.g. the AI credential resolver, which audits
632/// who triggered a secret read on behalf of a query).
633pub fn current_auth_identity_for_audit() -> Option<(String, crate::auth::Role)> {
634    current_auth_identity()
635}
636
637/// Install the session tenant id for the current thread (Phase 2.5.3
638/// multi-tenancy). Called by `SET TENANT 'id'` dispatch and by
639/// transport middleware that resolves tenant from auth claims (e.g.
640/// JWT `tenant` claim, HTTP header, subdomain).
641pub fn set_current_tenant(tenant_id: String) {
642    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = Some(tenant_id));
643}
644
645/// Clear the current-thread tenant — `CURRENT_TENANT()` will then
646/// return NULL and any RLS policy gated on it will hide every row.
647pub fn clear_current_tenant() {
648    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = None);
649}
650
651/// Read the current-thread tenant id, applying overrides in priority order:
652///   1. `WITHIN TENANT '<id>' …` per-statement override (highest)
653///   2. `SET LOCAL TENANT '<id>'` transaction-local override (consulted
654///      only when the current connection has an open transaction)
655///   3. `SET TENANT '<id>'` session-level thread-local
656///   4. `None` (deny-default for RLS).
657///
658/// The transaction-local layer is read through the runtime; an embedded
659/// helper crate that has no `RedDBRuntime` access still gets correct
660/// behaviour for layers 1, 3, and 4.
661pub fn current_tenant() -> Option<String> {
662    let inherited = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
663    if let Some(over) = current_scope_override() {
664        if over.tenant.is_active() {
665            return over.tenant.resolve(inherited);
666        }
667    }
668    if let Some(tx_local) = current_tx_local_tenant() {
669        return tx_local;
670    }
671    inherited
672}
673
thread_local! {
    /// Per-call copy of the active connection's `tx_local_tenants`
    /// entry. The outer `Option` says whether a transaction-local tenant
    /// override applies to this `execute_query` call at all; the inner
    /// one is the override itself (`Some(s)` forces tenant `s`, `None`
    /// forces NULL / cleared). It is refreshed at the top of every
    /// `execute_query` invocation and reset by the RAII guard on return,
    /// so a pooled connection cannot carry the override beyond the
    /// statement that owns it.
    static TX_LOCAL_TENANT: std::cell::RefCell<Option<Option<String>>> =
        const { std::cell::RefCell::new(None) };
}
686
687fn current_tx_local_tenant() -> Option<Option<String>> {
688    TX_LOCAL_TENANT.with(|cell| cell.borrow().clone())
689}
690
691/// Recognise `SET LOCAL TENANT '<id>'` / `SET LOCAL TENANT NULL` —
692/// returns `Ok(Some(Some(id)))` for an explicit value, `Ok(Some(None))`
693/// for an explicit NULL clear, `Ok(None)` when the input is not a
694/// `SET LOCAL TENANT` statement at all, and `Err` when the prefix
695/// matches but the value is malformed.
696fn parse_set_local_tenant(query: &str) -> RedDBResult<Option<Option<String>>> {
697    let mut tokens = query.split_ascii_whitespace();
698    let Some(w1) = tokens.next() else {
699        return Ok(None);
700    };
701    if !w1.eq_ignore_ascii_case("SET") {
702        return Ok(None);
703    }
704    let Some(w2) = tokens.next() else {
705        return Ok(None);
706    };
707    if !w2.eq_ignore_ascii_case("LOCAL") {
708        return Ok(None);
709    }
710    let Some(w3) = tokens.next() else {
711        return Ok(None);
712    };
713    if !w3.eq_ignore_ascii_case("TENANT") {
714        return Ok(None);
715    }
716    let rest: String = tokens.collect::<Vec<_>>().join(" ");
717    let rest = rest.trim().trim_end_matches(';').trim();
718    let value_str = rest.strip_prefix('=').map(|s| s.trim()).unwrap_or(rest);
719    if value_str.is_empty() {
720        return Err(RedDBError::Query(
721            "SET LOCAL TENANT expects a string literal or NULL".to_string(),
722        ));
723    }
724    if value_str.eq_ignore_ascii_case("NULL") {
725        return Ok(Some(None));
726    }
727    if value_str.starts_with('\'') && value_str.ends_with('\'') && value_str.len() >= 2 {
728        let inner = &value_str[1..value_str.len() - 1];
729        return Ok(Some(Some(inner.to_string())));
730    }
731    Err(RedDBError::Query(format!(
732        "SET LOCAL TENANT expects a string literal or NULL, got `{value_str}`"
733    )))
734}
735
736pub(crate) struct TxLocalTenantGuard;
737
738impl TxLocalTenantGuard {
739    pub fn install(value: Option<Option<String>>) -> Self {
740        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = value);
741        Self
742    }
743}
744
745impl Drop for TxLocalTenantGuard {
746    fn drop(&mut self) {
747        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = None);
748    }
749}
750
thread_local! {
    /// Stack of `WITHIN ... <stmt>` overrides active on the current
    /// thread. Every entry corresponds to one in-flight `execute_query`
    /// call that started with a `WITHIN` prefix; the entry is pushed
    /// before dispatch and popped before the call returns. The stack
    /// shape supports nested invocations (e.g. a view body that itself
    /// re-enters execute_query).
    ///
    /// The last element is the innermost override: it is the one
    /// `current_scope_override` clones and returns.
    static SCOPE_OVERRIDES: std::cell::RefCell<Vec<crate::runtime::within_clause::ScopeOverride>> =
        const { std::cell::RefCell::new(Vec::new()) };
}
761
762pub(crate) fn push_scope_override(over: crate::runtime::within_clause::ScopeOverride) {
763    SCOPE_OVERRIDES.with(|cell| cell.borrow_mut().push(over));
764}
765
766pub(crate) fn pop_scope_override() {
767    SCOPE_OVERRIDES.with(|cell| {
768        cell.borrow_mut().pop();
769    });
770}
771
772pub(crate) fn current_scope_override() -> Option<crate::runtime::within_clause::ScopeOverride> {
773    SCOPE_OVERRIDES.with(|cell| cell.borrow().last().cloned())
774}
775
776/// Cheap probe: is any `WITHIN …` scope override active on this
777/// thread? The fast-path needs to know without paying for the full
778/// `.last().cloned()` allocation — just peek at stack length.
779pub(crate) fn has_scope_override_active() -> bool {
780    SCOPE_OVERRIDES.with(|cell| !cell.borrow().is_empty())
781}
782
783/// RAII guard pairing `push_scope_override` with the matching pop, so
784/// the stack stays balanced even when the inner `execute_query` returns
785/// early via `?`.
786pub(crate) struct ScopeOverrideGuard;
787
788impl ScopeOverrideGuard {
789    pub fn install(over: crate::runtime::within_clause::ScopeOverride) -> Self {
790        push_scope_override(over);
791        Self
792    }
793}
794
795impl Drop for ScopeOverrideGuard {
796    fn drop(&mut self) {
797        pop_scope_override();
798    }
799}
800
801/// Read the current-thread auth identity, honouring per-statement
802/// `WITHIN ... USER '<u>' AS ROLE '<r>'` overrides. The override only
803/// supplies projected strings — it never grants additional privilege —
804/// so callers that need to make authorisation decisions must read from
805/// the underlying `current_auth_identity()` directly.
806pub(crate) fn current_user_projected() -> Option<String> {
807    let inherited = current_auth_identity().map(|(u, _)| u);
808    if let Some(over) = current_scope_override() {
809        if over.user.is_active() {
810            return over.user.resolve(inherited);
811        }
812    }
813    inherited
814}
815
816pub(crate) fn current_role_projected() -> Option<String> {
817    let inherited = current_auth_identity().map(|(_, r)| format!("{r:?}").to_lowercase());
818    if let Some(over) = current_scope_override() {
819        if over.role.is_active() {
820            return over.role.resolve(inherited);
821        }
822    }
823    inherited
824}
825
826pub(crate) fn current_secret_value(path: &str) -> Option<String> {
827    let key = path.to_ascii_lowercase();
828    CURRENT_SECRET_RESOLVER.with(|cell| {
829        let mut resolver = cell.borrow_mut();
830        let resolver = resolver.as_mut()?;
831        if resolver.values.is_none() {
832            resolver.values = resolver
833                .store
834                .as_ref()
835                .map(|store| store.vault_kv_snapshot());
836        }
837        let values = resolver.values.as_ref()?;
838        values.get(&key).cloned().or_else(|| {
839            key.strip_prefix("red.vault/").and_then(|rest| {
840                values
841                    .get(rest)
842                    .cloned()
843                    .or_else(|| values.get(&format!("red.secret.{rest}")).cloned())
844            })
845        })
846    })
847}
848
/// Per-thread state behind `current_secret_value`: where secrets come from
/// plus a lazily-filled cache of their values.
struct SecretResolver {
    /// Source of the secret snapshot; with `None` no snapshot is ever built
    /// and every lookup misses.
    store: Option<Arc<crate::auth::store::AuthStore>>,
    /// Cached key → secret map. `None` until the first lookup fills it from
    /// `store.vault_kv_snapshot()`.
    values: Option<HashMap<String, String>>,
}
853
854pub(super) struct SecretStoreGuard {
855    previous: Option<SecretResolver>,
856}
857
858impl SecretStoreGuard {
859    pub(super) fn install(store: Option<Arc<crate::auth::store::AuthStore>>) -> Self {
860        let previous = CURRENT_SECRET_RESOLVER.with(|cell| {
861            cell.replace(Some(SecretResolver {
862                store,
863                values: None,
864            }))
865        });
866        Self { previous }
867    }
868}
869
870impl Drop for SecretStoreGuard {
871    fn drop(&mut self) {
872        let previous = self.previous.take();
873        CURRENT_SECRET_RESOLVER.with(|cell| {
874            cell.replace(previous);
875        });
876    }
877}
878
879pub(crate) fn current_config_value(path: &str) -> Option<Value> {
880    let key = path.to_ascii_lowercase();
881    CURRENT_CONFIG_RESOLVER.with(|cell| {
882        let mut resolver = cell.borrow_mut();
883        let resolver = resolver.as_mut()?;
884        if resolver.values.is_none() {
885            resolver.values = Some(latest_config_snapshot(&resolver.db));
886        }
887        let values = resolver.values.as_ref()?;
888        values.get(&key).cloned().or_else(|| {
889            key.strip_prefix("red.config/")
890                .and_then(|rest| values.get(&format!("red.config.{rest}")).cloned())
891        })
892    })
893}
894
895fn update_current_config_value(path: &str, value: Value) {
896    let key = path.to_ascii_lowercase();
897    CURRENT_CONFIG_RESOLVER.with(|cell| {
898        if let Some(resolver) = cell.borrow_mut().as_mut() {
899            if let Some(values) = resolver.values.as_mut() {
900                values.insert(key, value);
901            }
902        }
903    });
904}
905
906fn update_current_secret_value(path: &str, value: Option<String>) {
907    let key = path.to_ascii_lowercase();
908    CURRENT_SECRET_RESOLVER.with(|cell| {
909        if let Some(resolver) = cell.borrow_mut().as_mut() {
910            let Some(values) = resolver.values.as_mut() else {
911                return;
912            };
913            match value {
914                Some(value) => {
915                    values.insert(key, value);
916                }
917                None => {
918                    values.remove(&key);
919                }
920            }
921        }
922    });
923}
924
925fn latest_config_snapshot(db: &RedDB) -> HashMap<String, Value> {
926    let mut latest: HashMap<String, (u64, Value)> = HashMap::new();
927
928    if let Some(manager) = db.store().get_collection("red_config") {
929        manager.for_each_entity(|entity| {
930            let Some(row) = entity.data.as_row() else {
931                return true;
932            };
933            let Some(Value::Text(key)) = row.get_field("key") else {
934                return true;
935            };
936            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
937            let id = entity.id.raw();
938            let key = key.to_ascii_lowercase();
939            insert_latest_config_value(&mut latest, key.clone(), id, value.clone());
940            if let Some(rest) = key.strip_prefix("red.config.") {
941                insert_latest_config_value(&mut latest, format!("red.config/{rest}"), id, value);
942            }
943            true
944        });
945    }
946
947    if let Some(manager) = db.store().get_collection("red.config") {
948        manager.for_each_entity(|entity| {
949            let Some(row) = entity.data.as_row() else {
950                return true;
951            };
952            if matches!(row.get_field("tombstone"), Some(Value::Boolean(true))) {
953                return true;
954            }
955            let Some(Value::Text(key)) = row.get_field("key") else {
956                return true;
957            };
958            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
959            insert_latest_config_value(
960                &mut latest,
961                format!("red.config/{}", key.to_ascii_lowercase()),
962                entity.id.raw(),
963                value,
964            );
965            true
966        });
967    }
968
969    latest
970        .into_iter()
971        .map(|(key, (_, value))| (key, value))
972        .collect()
973}
974
975fn insert_latest_config_value(
976    latest: &mut HashMap<String, (u64, Value)>,
977    key: String,
978    id: u64,
979    value: Value,
980) {
981    match latest.get(&key) {
982        Some((prev_id, _)) if *prev_id > id => {}
983        _ => {
984            latest.insert(key, (id, value));
985        }
986    }
987}
988
/// Per-thread state behind `current_config_value`: the database to read
/// config from plus a lazily-filled cache of the values.
struct ConfigResolver {
    /// Database handed to `latest_config_snapshot` when the cache is built.
    db: Arc<RedDB>,
    /// Cached key → value map. `None` until the first lookup fills it.
    values: Option<HashMap<String, Value>>,
}
993
994pub(super) struct ConfigSnapshotGuard {
995    previous: Option<ConfigResolver>,
996}
997
998impl ConfigSnapshotGuard {
999    pub(super) fn install(db: Arc<RedDB>) -> Self {
1000        let previous = CURRENT_CONFIG_RESOLVER
1001            .with(|cell| cell.replace(Some(ConfigResolver { db, values: None })));
1002        Self { previous }
1003    }
1004}
1005
1006impl Drop for ConfigSnapshotGuard {
1007    fn drop(&mut self) {
1008        let previous = self.previous.take();
1009        CURRENT_CONFIG_RESOLVER.with(|cell| {
1010            cell.replace(previous);
1011        });
1012    }
1013}
1014
1015/// Install the MVCC snapshot used by the current thread for the duration
1016/// of one statement. Paired with `clear_current_snapshot()` — callers
1017/// should prefer the `CurrentSnapshotGuard` RAII wrapper so early returns
1018/// still clean up.
1019pub fn set_current_snapshot(ctx: SnapshotContext) {
1020    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = Some(ctx));
1021    HAS_SNAPSHOT.with(|c| c.set(true));
1022}
1023
1024pub fn clear_current_snapshot() {
1025    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = None);
1026    HAS_SNAPSHOT.with(|c| c.set(false));
1027}
1028
1029/// Drop-guard that restores the previous snapshot on scope exit. Safe to
1030/// nest — each statement saves the caller's snapshot and puts it back
1031/// instead of blindly clearing, so a top-level `execute_query` called
1032/// from inside another statement dispatch (e.g. vector source subqueries)
1033/// doesn't strip visibility from the outer scan.
1034pub(crate) struct CurrentSnapshotGuard {
1035    previous: Option<SnapshotContext>,
1036}
1037
1038impl CurrentSnapshotGuard {
1039    pub(crate) fn install(ctx: SnapshotContext) -> Self {
1040        let previous = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
1041        set_current_snapshot(ctx);
1042        Self { previous }
1043    }
1044}
1045
1046impl Drop for CurrentSnapshotGuard {
1047    fn drop(&mut self) {
1048        let prev = self.previous.take();
1049        let has = prev.is_some();
1050        CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = prev);
1051        HAS_SNAPSHOT.with(|c| c.set(has));
1052    }
1053}
1054
1055/// Is this entity visible under the current thread's MVCC snapshot?
1056///
1057/// Returns `true` (no filtering) when no snapshot is installed — that
1058/// path is used by embedded callers and by operations that intentionally
1059/// bypass MVCC (VACUUM, snapshot export, admin introspection).
1060///
1061/// When a snapshot is installed the result is
1062///   `snapshot.sees(xmin, xmax) && !mgr.is_aborted(xmin) && !xmax_half_abort`
1063/// where `xmax_half_abort` re-grants visibility for tuples whose
1064/// deleting transaction rolled back.
1065#[inline]
1066pub fn entity_visible_under_current_snapshot(
1067    entity: &crate::storage::unified::entity::UnifiedEntity,
1068) -> bool {
1069    // Fast path — one `Cell<bool>` read, no RefCell borrow. Autocommit
1070    // reads (no active MVCC transaction) still hide superseded physical
1071    // versions while avoiding a full snapshot-context lookup.
1072    // This runs on every row of every scan; the slow path only fires
1073    // inside an explicit transaction.
1074    if !HAS_SNAPSHOT.with(|c| c.get()) {
1075        return entity.xmax == 0;
1076    }
1077    CURRENT_SNAPSHOT.with(|cell| {
1078        let guard = cell.borrow();
1079        let Some(ctx) = guard.as_ref() else {
1080            return true;
1081        };
1082        visibility_check(ctx, entity.xmin, entity.xmax)
1083    })
1084}
1085
1086/// Direct visibility check from raw `(xmin, xmax)` — bypasses the
1087/// entity borrow for callers that already decomposed the tuple (e.g.
1088/// pre-materialized scan caches). Same semantics as
1089/// `entity_visible_under_current_snapshot`.
1090#[inline]
1091pub(crate) fn xids_visible_under_current_snapshot(xmin: u64, xmax: u64) -> bool {
1092    if !HAS_SNAPSHOT.with(|c| c.get()) {
1093        return true;
1094    }
1095    CURRENT_SNAPSHOT.with(|cell| {
1096        let guard = cell.borrow();
1097        let Some(ctx) = guard.as_ref() else {
1098            return true;
1099        };
1100        visibility_check(ctx, xmin, xmax)
1101    })
1102}
1103
1104/// Clone the current thread's snapshot context. Parallel scan paths
1105/// (`query_all_zoned` with `std::thread::scope`) call this on the main
1106/// thread *before* spawning workers so the captured `SnapshotContext`
1107/// can be moved into every worker closure. Worker threads do not
1108/// inherit thread-locals, so calling `entity_visible_under_current_snapshot`
1109/// from inside a spawned closure would silently skip the filter.
1110pub fn capture_current_snapshot() -> Option<SnapshotContext> {
1111    CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone())
1112}
1113
1114/// Whether the active read snapshot may need historical tuple versions
1115/// that the current secondary indexes cannot prove. Index paths can still
1116/// recheck visible candidates, but only a heap scan can discover versions
1117/// whose indexed value was changed or deleted after this snapshot.
1118pub(crate) fn current_snapshot_requires_index_fallback() -> bool {
1119    if !HAS_SNAPSHOT.with(|c| c.get()) {
1120        return false;
1121    }
1122    CURRENT_SNAPSHOT.with(|cell| {
1123        cell.borrow()
1124            .as_ref()
1125            .is_some_and(|ctx| ctx.requires_index_fallback)
1126    })
1127}
1128
/// Frozen MVCC + identity context for callers that need to reinstall
/// the same view across thread-local boundaries — long-lived cursors,
/// background batchers, anything that detaches from the dispatch path
/// and re-enters later.
///
/// The bundle bakes in the three thread-locals every read path
/// consults: `SnapshotContext` (MVCC visibility), the auth identity
/// (RLS policy gate), and the tenant id (RLS scope). A FETCH that
/// reinstalls the bundle sees exactly the same rows as the DECLARE
/// would have, regardless of writes that landed in between.
///
/// Cheap to clone — `SnapshotContext` is a clone of three
/// `Arc`-backed fields, identity is a `(String, Role)`, tenant is a
/// `String`. None of these contend with the read path.
///
/// Produced by `snapshot_bundle()` and consumed by
/// `with_snapshot_bundle()`. The derived `Default` is the all-`None`
/// bundle.
#[derive(Clone, Default)]
pub struct SnapshotBundle {
    /// Captured MVCC snapshot; `None` means no snapshot was installed at
    /// capture time, and reinstalling the bundle clears the snapshot.
    pub snapshot: Option<SnapshotContext>,
    /// Captured `(username, role)` identity, if any.
    pub auth: Option<(String, crate::auth::Role)>,
    /// Captured tenant id, if any.
    pub tenant: Option<String>,
}
1149
1150/// Capture the three read-path thread-locals into a `SnapshotBundle`.
1151/// Pairs with `with_snapshot_bundle` for re-entry.
1152pub fn snapshot_bundle() -> SnapshotBundle {
1153    SnapshotBundle {
1154        snapshot: capture_current_snapshot(),
1155        auth: current_auth_identity(),
1156        tenant: CURRENT_TENANT_ID.with(|cell| cell.borrow().clone()),
1157    }
1158}
1159
1160/// Reinstall a captured `SnapshotBundle` for the duration of `f`.
1161/// Restores the caller's previous thread-locals on exit (panic-safe via
1162/// the explicit guard struct so a panic in `f` cannot leak the
1163/// installed identity into the worker's next request).
1164pub fn with_snapshot_bundle<R>(bundle: &SnapshotBundle, f: impl FnOnce() -> R) -> R {
1165    struct Guard {
1166        prev_snapshot: Option<SnapshotContext>,
1167        prev_auth: Option<(String, crate::auth::Role)>,
1168        prev_tenant: Option<String>,
1169    }
1170    impl Drop for Guard {
1171        fn drop(&mut self) {
1172            let snap = self.prev_snapshot.take();
1173            let has = snap.is_some();
1174            CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = snap);
1175            HAS_SNAPSHOT.with(|c| c.set(has));
1176            CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = self.prev_auth.take());
1177            CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = self.prev_tenant.take());
1178        }
1179    }
1180
1181    let _guard = {
1182        let prev_snapshot = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
1183        let prev_auth = CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone());
1184        let prev_tenant = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
1185
1186        match bundle.snapshot.clone() {
1187            Some(ctx) => set_current_snapshot(ctx),
1188            None => clear_current_snapshot(),
1189        }
1190        CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = bundle.auth.clone());
1191        CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = bundle.tenant.clone());
1192
1193        Guard {
1194            prev_snapshot,
1195            prev_auth,
1196            prev_tenant,
1197        }
1198    };
1199    f()
1200}
1201
1202/// Apply the same visibility rules used by the thread-local helpers
1203/// against a caller-provided context. Intended for parallel workers
1204/// that captured the snapshot with `capture_current_snapshot()`.
1205#[inline]
1206pub fn entity_visible_with_context(
1207    ctx: Option<&SnapshotContext>,
1208    entity: &crate::storage::unified::entity::UnifiedEntity,
1209) -> bool {
1210    match ctx {
1211        Some(ctx) => visibility_check(ctx, entity.xmin, entity.xmax),
1212        None => true,
1213    }
1214}
1215
1216fn table_row_index_fields(
1217    entity: &crate::storage::unified::entity::UnifiedEntity,
1218) -> Vec<(String, crate::storage::schema::Value)> {
1219    let crate::storage::EntityData::Row(row) = &entity.data else {
1220        return Vec::new();
1221    };
1222    if let Some(named) = &row.named {
1223        return named
1224            .iter()
1225            .map(|(name, value)| (name.clone(), value.clone()))
1226            .collect();
1227    }
1228    if let Some(schema) = &row.schema {
1229        return schema
1230            .iter()
1231            .zip(row.columns.iter())
1232            .map(|(name, value)| (name.clone(), value.clone()))
1233            .collect();
1234    }
1235    Vec::new()
1236}
1237
1238#[inline]
1239fn visibility_check(ctx: &SnapshotContext, xmin: u64, xmax: u64) -> bool {
1240    // Writer aborted → tuple never existed from any future reader's view.
1241    // Checked *before* the own-xids fast path so an aborted own-sub-xid
1242    // (rolled-back savepoint) stays hidden from the parent.
1243    if xmin != 0 && ctx.manager.is_aborted(xmin) {
1244        return false;
1245    }
1246    // Deleter aborted → treat xmax as unset; fall back to xmin-only check.
1247    let effective_xmax = if xmax != 0 && ctx.manager.is_aborted(xmax) {
1248        0
1249    } else {
1250        xmax
1251    };
1252    // Phase 2.3.2e: own-tx writes are always visible to the connection
1253    // that stamped them, even when xmin/xmax exceed `snapshot.xid` (as
1254    // happens for sub-xids allocated by SAVEPOINT after BEGIN).
1255    let own_xmin = xmin != 0 && ctx.own_xids.contains(&xmin);
1256    let own_xmax = effective_xmax != 0 && ctx.own_xids.contains(&effective_xmax);
1257    if own_xmax {
1258        // This connection deleted the row via this xid — hide it from self.
1259        return false;
1260    }
1261    if own_xmin {
1262        return true;
1263    }
1264    ctx.snapshot.sees(xmin, effective_xmax)
1265}
1266
1267fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
1268    runtime
1269        .inner
1270        .pool
1271        .lock()
1272        .unwrap_or_else(|poisoned| poisoned.into_inner())
1273}
1274
/// Is `name` (ASCII case-insensitive) one of the graph-analytics
/// table-valued functions recognized in FROM position? Both the
/// graph-collection form and the inline `nodes => / edges =>` form
/// (issue #799) accept these names.
fn is_graph_tvf_name(name: &str) -> bool {
    const GRAPH_TVF_NAMES: [&str; 7] = [
        "components",
        "louvain",
        "degree_centrality",
        "shortest_path",
        "betweenness",
        "eigenvector",
        "pagerank",
    ];
    GRAPH_TVF_NAMES
        .iter()
        .any(|known| name.eq_ignore_ascii_case(known))
}
1287
1288/// Map a declared `WITH ANALYTICS` view to the concrete graph algorithm name
1289/// and named-argument list that [`RedDBRuntime::dispatch_graph_algorithm`]
1290/// consumes (issue #800). The `using` option selects the algorithm inside the
1291/// output family; unsupported algorithms and the options that do not apply to
1292/// the chosen algorithm are rejected so a view never silently ignores a
1293/// declared parameter.
1294fn analytics_view_algorithm(
1295    graph: &str,
1296    view: &crate::catalog::AnalyticsViewDescriptor,
1297) -> RedDBResult<(String, Vec<(String, f64)>)> {
1298    use crate::catalog::AnalyticsOutput;
1299
1300    let mut named_args: Vec<(String, f64)> = Vec::new();
1301    let algorithm = match view.output {
1302        AnalyticsOutput::Communities => {
1303            let algo = view.algorithm.as_deref().unwrap_or("louvain");
1304            if !algo.eq_ignore_ascii_case("louvain") {
1305                return Err(RedDBError::Query(format!(
1306                    "analytics output 'communities' on graph '{graph}' has unsupported algorithm '{algo}' (expected louvain)"
1307                )));
1308            }
1309            if let Some(resolution) = view.resolution {
1310                named_args.push(("resolution".to_string(), resolution));
1311            }
1312            "louvain".to_string()
1313        }
1314        AnalyticsOutput::Components => {
1315            if let Some(algo) = view.algorithm.as_deref() {
1316                if !algo.eq_ignore_ascii_case("components")
1317                    && !algo.eq_ignore_ascii_case("connected_components")
1318                {
1319                    return Err(RedDBError::Query(format!(
1320                        "analytics output 'components' on graph '{graph}' has unsupported algorithm '{algo}' (expected connected_components)"
1321                    )));
1322                }
1323            }
1324            "components".to_string()
1325        }
1326        AnalyticsOutput::Centrality => {
1327            let algo = view
1328                .algorithm
1329                .as_deref()
1330                .unwrap_or("pagerank")
1331                .to_ascii_lowercase();
1332            match algo.as_str() {
1333                "pagerank" => {
1334                    if let Some(max_iterations) = view.max_iterations {
1335                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1336                    }
1337                }
1338                "eigenvector" => {
1339                    if let Some(max_iterations) = view.max_iterations {
1340                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1341                    }
1342                    if let Some(tolerance) = view.tolerance {
1343                        named_args.push(("tolerance".to_string(), tolerance));
1344                    }
1345                }
1346                "betweenness" => {}
1347                other => {
1348                    return Err(RedDBError::Query(format!(
1349                        "analytics output 'centrality' on graph '{graph}' has unsupported algorithm '{other}' (expected pagerank, betweenness, or eigenvector)"
1350                    )));
1351                }
1352            }
1353            algo
1354        }
1355    };
1356    Ok((algorithm, named_args))
1357}
1358
1359/// Reject any named arguments for a TVF that accepts none.
1360fn reject_named_args(name: &str, named_args: &[(String, f64)]) -> RedDBResult<()> {
1361    if let Some((key, _)) = named_args.first() {
1362        return Err(RedDBError::Query(format!(
1363            "table function '{name}' has no named argument '{key}'"
1364        )));
1365    }
1366    Ok(())
1367}
1368
1369/// Resolve louvain's optional `resolution` named arg (γ, default 1.0). Any
1370/// other named key, or a non-finite / non-positive resolution, is rejected.
1371fn louvain_resolution(named_args: &[(String, f64)]) -> RedDBResult<f64> {
1372    let mut resolution = 1.0_f64;
1373    for (key, value) in named_args {
1374        if key.eq_ignore_ascii_case("resolution") {
1375            if !value.is_finite() || *value <= 0.0 {
1376                return Err(RedDBError::Query(format!(
1377                    "table function 'louvain' resolution must be > 0, got {value}"
1378                )));
1379            }
1380            resolution = *value;
1381        } else {
1382            return Err(RedDBError::Query(format!(
1383                "table function 'louvain' has no named argument '{key}' (expected 'resolution')"
1384            )));
1385        }
1386    }
1387    Ok(resolution)
1388}
1389
1390/// Undirected degree centrality over abstract inputs: each edge contributes
1391/// 1 to both of its endpoints. Returns `(node_id, degree)` deterministically
1392/// in ascending node-id order, so identical input always yields identical
1393/// rows.
1394fn abstract_degree_centrality(
1395    nodes: &[String],
1396    edges: &[(
1397        String,
1398        String,
1399        crate::storage::engine::graph_algorithms::Weight,
1400    )],
1401) -> Vec<(String, usize)> {
1402    let mut degree: std::collections::BTreeMap<String, usize> = std::collections::BTreeMap::new();
1403    for n in nodes {
1404        degree.entry(n.clone()).or_insert(0);
1405    }
1406    for (a, b, _w) in edges {
1407        *degree.entry(a.clone()).or_insert(0) += 1;
1408        *degree.entry(b.clone()).or_insert(0) += 1;
1409    }
1410    degree.into_iter().collect()
1411}
1412
1413/// Ordered column names for a materialized subquery result: the projection
1414/// columns when present, else the first record's field order.
1415fn ordered_result_columns(result: &crate::storage::query::unified::UnifiedResult) -> Vec<String> {
1416    if !result.columns.is_empty() {
1417        return result.columns.clone();
1418    }
1419    result
1420        .records
1421        .first()
1422        .map(|record| {
1423            record
1424                .column_names()
1425                .iter()
1426                .map(|column| column.to_string())
1427                .collect()
1428        })
1429        .unwrap_or_default()
1430}
1431
1432/// Canonical node-id string for a cell value, so the node universe (from the
1433/// `nodes` subquery) and the edge endpoints (from the `edges` subquery)
1434/// compare equal regardless of integer-vs-text typing. `Null` is not a node.
1435fn value_to_node_id(value: &crate::storage::schema::Value) -> Option<String> {
1436    use crate::storage::schema::Value;
1437    match value {
1438        Value::Null => None,
1439        Value::Text(s) => Some(s.to_string()),
1440        Value::Integer(n) => Some(n.to_string()),
1441        Value::UnsignedInteger(n) => Some(n.to_string()),
1442        Value::NodeRef(s) => Some(s.clone()),
1443        other => Some(other.to_string()),
1444    }
1445}
1446
1447/// Numeric edge weight from a cell value (the optional third `edges` column).
1448fn value_to_weight(value: &crate::storage::schema::Value) -> Option<f32> {
1449    use crate::storage::schema::Value;
1450    match value {
1451        Value::Float(f) => Some(*f as f32),
1452        Value::Integer(n) => Some(*n as f32),
1453        Value::UnsignedInteger(n) => Some(*n as f32),
1454        _ => None,
1455    }
1456}
1457
1458/// Build the node universe from a materialized `nodes` subquery result: the
1459/// first projected column of each row is the node id (issue #799). Zero rows
1460/// is a valid empty node set; a row set with no columns is a shape error.
1461fn inline_node_ids(
1462    name: &str,
1463    result: &crate::storage::query::unified::UnifiedResult,
1464) -> RedDBResult<Vec<String>> {
1465    if result.records.is_empty() {
1466        return Ok(Vec::new());
1467    }
1468    let columns = ordered_result_columns(result);
1469    let Some(first_col) = columns.first() else {
1470        return Err(RedDBError::Query(format!(
1471            "table function '{name}' inline form: `nodes` subquery must project at least one column (the node id)"
1472        )));
1473    };
1474    let mut ids = Vec::with_capacity(result.records.len());
1475    for record in &result.records {
1476        if let Some(id) = record.get(first_col).and_then(value_to_node_id) {
1477            ids.push(id);
1478        }
1479    }
1480    Ok(ids)
1481}
1482
1483/// Build the edge list from a materialized `edges` subquery result: the first
1484/// two projected columns are `(source, target)` and an optional third column
1485/// is the numeric weight (defaulting to 1.0). Fewer than two columns is a
1486/// shape error (issue #799).
1487fn inline_edges(
1488    name: &str,
1489    result: &crate::storage::query::unified::UnifiedResult,
1490) -> RedDBResult<
1491    Vec<(
1492        String,
1493        String,
1494        crate::storage::engine::graph_algorithms::Weight,
1495    )>,
1496> {
1497    if result.records.is_empty() {
1498        return Ok(Vec::new());
1499    }
1500    let columns = ordered_result_columns(result);
1501    if columns.len() < 2 {
1502        return Err(RedDBError::Query(format!(
1503            "table function '{name}' inline form: `edges` subquery must project at least two columns (source, target), got {}",
1504            columns.len()
1505        )));
1506    }
1507    let src_col = &columns[0];
1508    let dst_col = &columns[1];
1509    let weight_col = columns.get(2);
1510    let mut edges = Vec::with_capacity(result.records.len());
1511    for record in &result.records {
1512        let (Some(src), Some(dst)) = (
1513            record.get(src_col).and_then(value_to_node_id),
1514            record.get(dst_col).and_then(value_to_node_id),
1515        ) else {
1516            // A null/absent endpoint is not a valid edge; skip it.
1517            continue;
1518        };
1519        let weight = match weight_col {
1520            Some(col) => match record.get(col) {
1521                None | Some(crate::storage::schema::Value::Null) => 1.0,
1522                Some(value) => value_to_weight(value).ok_or_else(|| {
1523                    RedDBError::Query(format!(
1524                        "table function '{name}' inline form: `edges` weight column must be numeric"
1525                    ))
1526                })?,
1527            },
1528            None => 1.0,
1529        };
1530        edges.push((src, dst, weight));
1531    }
1532    Ok(edges)
1533}
1534
1535fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
1536    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
1537        return;
1538    }
1539    scopes.insert(name.to_string());
1540}
1541
1542fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
1543    match query.source.as_ref() {
1544        Some(crate::storage::query::ast::TableSource::Name(name)) => {
1545            cache_scope_insert(scopes, name)
1546        }
1547        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
1548            collect_query_expr_result_cache_scopes(scopes, subquery);
1549        }
1550        // Graph-collection TVFs (e.g. `louvain(g)`) read the graph store
1551        // read-only. The result is now cached (issue #802) and scoped to the
1552        // graph collection named in the first argument, so any mutation on
1553        // that collection (`INSERT INTO g NODE/EDGE …`) invalidates the
1554        // entry via `invalidate_result_cache_for_table`. Non-graph or
1555        // zero-arg functions contribute no scope.
1556        Some(crate::storage::query::ast::TableSource::Function { name, args, .. }) => {
1557            if is_graph_tvf_name(name) {
1558                if let Some(graph) = args.first() {
1559                    cache_scope_insert(scopes, graph);
1560                }
1561            }
1562        }
1563        // The inline-graph form reads ordinary tables/docs through its
1564        // `nodes`/`edges` subqueries, so its result cache must be scoped to
1565        // those source collections — mutating any of them invalidates the
1566        // cached result (issue #799).
1567        Some(crate::storage::query::ast::TableSource::InlineGraphFunction {
1568            nodes, edges, ..
1569        }) => {
1570            collect_query_expr_result_cache_scopes(scopes, nodes);
1571            collect_query_expr_result_cache_scopes(scopes, edges);
1572        }
1573        None => cache_scope_insert(scopes, &query.table),
1574    }
1575}
1576
1577fn collect_vector_source_scopes(
1578    scopes: &mut HashSet<String>,
1579    source: &crate::storage::query::ast::VectorSource,
1580) {
1581    match source {
1582        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
1583            cache_scope_insert(scopes, collection);
1584        }
1585        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
1586            collect_query_expr_result_cache_scopes(scopes, subquery);
1587        }
1588        crate::storage::query::ast::VectorSource::Literal(_)
1589        | crate::storage::query::ast::VectorSource::Text(_) => {}
1590    }
1591}
1592
1593fn collect_path_selector_scopes(
1594    scopes: &mut HashSet<String>,
1595    selector: &crate::storage::query::ast::NodeSelector,
1596) {
1597    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
1598        cache_scope_insert(scopes, table);
1599    }
1600}
1601
/// Walk `expr` and add to `scopes` every collection name the statement
/// reads from or targets.
///
/// Names are always added through `cache_scope_insert`, so empty names,
/// `__subq_`-prefixed names and universal query sources never appear in
/// the set. The match is exhaustive on purpose: adding a `QueryExpr`
/// variant forces a decision here about which scopes (if any) it touches.
fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
    match expr {
        // Read paths: recurse into the sources the query pulls from.
        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
        QueryExpr::Join(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.left);
            collect_query_expr_result_cache_scopes(scopes, &query.right);
        }
        QueryExpr::Path(query) => {
            collect_path_selector_scopes(scopes, &query.from);
            collect_path_selector_scopes(scopes, &query.to);
        }
        QueryExpr::Vector(query) => {
            cache_scope_insert(scopes, &query.collection);
            collect_vector_source_scopes(scopes, &query.query_vector);
        }
        QueryExpr::Hybrid(query) => {
            collect_query_expr_result_cache_scopes(scopes, &query.structured);
            cache_scope_insert(scopes, &query.vector.collection);
            collect_vector_source_scopes(scopes, &query.vector.query_vector);
        }
        // DML / DDL: the scope is the single collection the statement names.
        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        // Metrics and SLOs are scoped by their `path` field.
        QueryExpr::CreateMetric(query) => cache_scope_insert(scopes, &query.path),
        QueryExpr::AlterMetric(query) => cache_scope_insert(scopes, &query.path),
        QueryExpr::CreateSlo(query) => cache_scope_insert(scopes, &query.path),
        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
        QueryExpr::QueueCommand(query) => match query {
            QueueCommand::Push { queue, .. }
            | QueueCommand::Pop { queue, .. }
            | QueueCommand::Peek { queue, .. }
            | QueueCommand::Len { queue }
            | QueueCommand::Purge { queue }
            | QueueCommand::GroupCreate { queue, .. }
            | QueueCommand::GroupRead { queue, .. }
            | QueueCommand::Pending { queue, .. }
            | QueueCommand::Claim { queue, .. }
            | QueueCommand::Ack { queue, .. }
            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
            // A move touches both ends, so both queues become scopes.
            QueueCommand::Move {
                source,
                destination,
                ..
            } => {
                cache_scope_insert(scopes, source);
                cache_scope_insert(scopes, destination);
            }
        },
        // Backfill names both the source collection and the target queue.
        QueryExpr::EventsBackfill(query) => {
            cache_scope_insert(scopes, &query.collection);
            cache_scope_insert(scopes, &query.target_queue);
        }
        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
        QueryExpr::TreeCommand(query) => match query {
            TreeCommand::Insert { collection, .. }
            | TreeCommand::Move { collection, .. }
            | TreeCommand::Delete { collection, .. }
            | TreeCommand::Validate { collection, .. }
            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
        },
        QueryExpr::SearchCommand(query) => match query {
            // These search forms always name a collection.
            SearchCommand::Similar { collection, .. }
            | SearchCommand::Hybrid { collection, .. }
            | SearchCommand::SpatialRadius { collection, .. }
            | SearchCommand::SpatialBbox { collection, .. }
            | SearchCommand::SpatialNearest { collection, .. } => {
                cache_scope_insert(scopes, collection);
            }
            // These carry an optional collection; without one no scope is
            // recorded.
            SearchCommand::Text { collection, .. }
            | SearchCommand::Multimodal { collection, .. }
            | SearchCommand::Index { collection, .. }
            | SearchCommand::Context { collection, .. } => {
                if let Some(collection) = collection.as_deref() {
                    cache_scope_insert(scopes, collection);
                }
            }
        },
        QueryExpr::Ask(query) => {
            if let Some(collection) = query.collection.as_deref() {
                cache_scope_insert(scopes, collection);
            }
        }
        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
        QueryExpr::MaintenanceCommand(cmd) => match cmd {
            // An untargeted VACUUM / ANALYZE records no scope.
            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
                if let Some(t) = target {
                    cache_scope_insert(scopes, t);
                }
            }
        },
        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::CreateView(cmd) => {
            cache_scope_insert(scopes, &cmd.name);
            // Invalidating the view should also invalidate its dependencies.
            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
        }
        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
        // Statements that record no scope here.
        // NOTE(review): `Graph` / `GraphCommand` are in this list, so graph
        // queries contribute no scopes from this walker — confirm that is
        // intended for result-cache invalidation.
        QueryExpr::Graph(_)
        | QueryExpr::GraphCommand(_)
        | QueryExpr::ProbabilisticCommand(_)
        | QueryExpr::SetConfig { .. }
        | QueryExpr::ShowConfig { .. }
        | QueryExpr::SetSecret { .. }
        | QueryExpr::DeleteSecret { .. }
        | QueryExpr::ShowSecrets { .. }
        | QueryExpr::SetTenant(_)
        | QueryExpr::ShowTenant
        | QueryExpr::TransactionControl(_)
        | QueryExpr::CreateSchema(_)
        | QueryExpr::DropSchema(_)
        | QueryExpr::CreateSequence(_)
        | QueryExpr::DropSequence(_)
        | QueryExpr::Grant(_)
        | QueryExpr::Revoke(_)
        | QueryExpr::AlterUser(_)
        | QueryExpr::CreateIamPolicy { .. }
        | QueryExpr::DropIamPolicy { .. }
        | QueryExpr::AttachPolicy { .. }
        | QueryExpr::DetachPolicy { .. }
        | QueryExpr::ShowPolicies { .. }
        | QueryExpr::ShowEffectivePermissions { .. }
        | QueryExpr::SimulatePolicy { .. }
        | QueryExpr::LintPolicy { .. }
        | QueryExpr::MigratePolicyMode { .. }
        | QueryExpr::CreateMigration(_)
        | QueryExpr::ApplyMigration(_)
        | QueryExpr::RollbackMigration(_)
        | QueryExpr::ExplainMigration(_)
        | QueryExpr::EventsBackfillStatus { .. } => {}
        // Every KV command is scoped to the collection it names.
        QueryExpr::KvCommand(cmd) => {
            use crate::storage::query::ast::KvCommand;
            match cmd {
                KvCommand::Put { collection, .. }
                | KvCommand::InvalidateTags { collection, .. }
                | KvCommand::Get { collection, .. }
                | KvCommand::Unseal { collection, .. }
                | KvCommand::Rotate { collection, .. }
                | KvCommand::History { collection, .. }
                | KvCommand::List { collection, .. }
                | KvCommand::Purge { collection, .. }
                | KvCommand::Watch { collection, .. }
                | KvCommand::Delete { collection, .. }
                | KvCommand::Incr { collection, .. }
                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
            }
        }
        // Likewise for every config command.
        QueryExpr::ConfigCommand(cmd) => {
            use crate::storage::query::ast::ConfigCommand;
            match cmd {
                ConfigCommand::Put { collection, .. }
                | ConfigCommand::Get { collection, .. }
                | ConfigCommand::Resolve { collection, .. }
                | ConfigCommand::Rotate { collection, .. }
                | ConfigCommand::Delete { collection, .. }
                | ConfigCommand::History { collection, .. }
                | ConfigCommand::List { collection, .. }
                | ConfigCommand::Watch { collection, .. }
                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
                    cache_scope_insert(scopes, collection)
                }
            }
        }
    }
}
1792
1793/// Combine matching RLS policies for a table + action into a single
1794/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1795///
1796/// Returns `None` when RLS is disabled or no policy admits the caller's
1797/// role — callers use that to short-circuit the mutation (for DELETE /
1798/// UPDATE we simply skip the operation, which PG expresses as "no rows
1799/// match the policy + predicate combination").
1800pub(crate) fn rls_policy_filter(
1801    runtime: &RedDBRuntime,
1802    table: &str,
1803    action: crate::storage::query::ast::PolicyAction,
1804) -> Option<crate::storage::query::ast::Filter> {
1805    rls_policy_filter_for_kind(
1806        runtime,
1807        table,
1808        action,
1809        crate::storage::query::ast::PolicyTargetKind::Table,
1810    )
1811}
1812
1813/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1814/// Graph / vector / queue / timeseries scans pass the concrete kind;
1815/// policies targeting other kinds are ignored. Legacy Table-scoped
1816/// policies still apply cross-kind — callers register auto-tenancy
1817/// policies as Table today.
1818pub(crate) fn rls_policy_filter_for_kind(
1819    runtime: &RedDBRuntime,
1820    table: &str,
1821    action: crate::storage::query::ast::PolicyAction,
1822    kind: crate::storage::query::ast::PolicyTargetKind,
1823) -> Option<crate::storage::query::ast::Filter> {
1824    use crate::storage::query::ast::Filter;
1825
1826    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1827        return None;
1828    }
1829    let role = current_auth_identity().map(|(_, role)| role);
1830    let role_str = role.map(|r| r.as_str().to_string());
1831    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1832    if policies.is_empty() {
1833        return None;
1834    }
1835    policies
1836        .into_iter()
1837        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1838}
1839
1840/// Returns true when the table has RLS enforcement enabled. Convenience
1841/// shortcut so DML paths can gate the AND-combine work without reaching
1842/// into `runtime.inner.rls_enabled_tables` directly.
1843pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1844    runtime.inner.rls_enabled_tables.read().contains(table)
1845}
1846
1847/// Per-entity gate used by the graph materialiser for `GraphNode`
1848/// entities. RLS is checked against the source collection with
1849/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1850/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1851/// (for back-compat with auto-tenancy declarations). Cached per
1852/// collection so big graphs only resolve the policy chain once.
1853fn node_passes_rls(
1854    runtime: &RedDBRuntime,
1855    collection: &str,
1856    role: Option<&str>,
1857    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1858    entity: &crate::storage::unified::entity::UnifiedEntity,
1859) -> bool {
1860    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1861
1862    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1863        return true;
1864    }
1865    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1866        let policies = runtime.matching_rls_policies_for_kind(
1867            collection,
1868            role,
1869            PolicyAction::Select,
1870            PolicyTargetKind::Nodes,
1871        );
1872        if policies.is_empty() {
1873            None
1874        } else {
1875            policies
1876                .into_iter()
1877                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1878        }
1879    });
1880    let Some(filter) = filter else {
1881        return false;
1882    };
1883    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1884        Some(&runtime.inner.db),
1885        entity,
1886        filter,
1887        collection,
1888        collection,
1889    )
1890}
1891
1892/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1893/// `kind = Edges`.
1894fn edge_passes_rls(
1895    runtime: &RedDBRuntime,
1896    collection: &str,
1897    role: Option<&str>,
1898    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1899    entity: &crate::storage::unified::entity::UnifiedEntity,
1900) -> bool {
1901    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1902
1903    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1904        return true;
1905    }
1906    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1907        let policies = runtime.matching_rls_policies_for_kind(
1908            collection,
1909            role,
1910            PolicyAction::Select,
1911            PolicyTargetKind::Edges,
1912        );
1913        if policies.is_empty() {
1914            None
1915        } else {
1916            policies
1917                .into_iter()
1918                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1919        }
1920    });
1921    let Some(filter) = filter else {
1922        return false;
1923    };
1924    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1925        Some(&runtime.inner.db),
1926        entity,
1927        filter,
1928        collection,
1929        collection,
1930    )
1931}
1932
1933/// RLS policy injection (Phase 2.5.2 PG parity).
1934///
1935/// Fetch every matching policy for the current thread-local role and
1936/// fold them into the query's filter. Semantics mirror PostgreSQL:
1937///
1938/// * Multiple policies on the same table combine with **OR** — a row is
1939///   visible if *any* policy admits it.
1940/// * The combined policy predicate is **AND**-ed into the caller's
1941///   existing `WHERE` clause so explicit predicates continue to trim
1942///   the policy-allowed set.
1943/// * No matching policies + RLS enabled = zero rows (PG's
1944///   restrictive-default). Callers get `None` and return an empty
1945///   `UnifiedResult` without ever dispatching the scan.
1946///
1947/// This runs only when `RuntimeInner::rls_enabled_tables` already
1948/// contains the table name — callers gate the hot path upfront to
1949/// avoid the lock acquisition on tables without RLS.
1950///
1951/// Returns `None` when no policy admits the current role; returns
1952/// `Some(mutated_table)` with policy filters folded in otherwise.
1953fn inject_rls_filters(
1954    runtime: &RedDBRuntime,
1955    frame: &dyn super::statement_frame::ReadFrame,
1956    mut table: crate::storage::query::ast::TableQuery,
1957) -> Option<crate::storage::query::ast::TableQuery> {
1958    use crate::storage::query::ast::{Filter, PolicyAction};
1959
1960    // `None` role falls through to policies with no `TO role` clause.
1961    let role = frame.identity().map(|(_, role)| role);
1962    let role_str = role.map(|r| r.as_str().to_string());
1963    let policies =
1964        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1965
1966    if policies.is_empty() {
1967        // RLS enabled + no policy match = deny everything. Signal the
1968        // caller to short-circuit with an empty result set.
1969        return None;
1970    }
1971
1972    // Combine policy predicates with OR (PG's permissive default).
1973    let combined = policies
1974        .into_iter()
1975        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1976        .expect("policies non-empty");
1977
1978    // AND into the caller's existing predicate. The predicate may live
1979    // in `where_expr` rather than `filter`: `resolve_table_expr_subqueries`
1980    // nulls `filter` whenever `where_expr` is present (the case for a
1981    // view body rewritten into `SELECT … WHERE …`). Folding only into
1982    // `filter` here would silently drop that `where_expr` predicate at
1983    // eval time because `effective_table_filter` prefers `filter` —
1984    // e.g. `WITHIN TENANT … SELECT * FROM <view>` would apply the
1985    // tenant policy but lose the view's own WHERE (#635).
1986    use crate::storage::query::sql_lowering::{expr_to_filter, filter_to_expr};
1987    let had_where_expr = table.where_expr.is_some();
1988    let existing = table
1989        .filter
1990        .take()
1991        .or_else(|| table.where_expr.as_ref().map(expr_to_filter));
1992    let new_filter = match existing {
1993        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1994        None => combined,
1995    };
1996    // Keep `where_expr` in lock-step with the merged `filter` so
1997    // whichever the executor consults sees the full predicate.
1998    if had_where_expr {
1999        table.where_expr = Some(filter_to_expr(&new_filter));
2000    }
2001    table.filter = Some(new_filter);
2002    Some(table)
2003}
2004
2005/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
2006/// predicate into the join's outer filter. Walking the merged record
2007/// at the join layer (rather than mutating the per-side scan filter)
2008/// keeps the planner's strategy choice and per-side index selection
2009/// undisturbed — the policy predicate uses the qualified `t.col` form
2010/// that resolves cleanly against the merged record's keys.
2011///
2012/// Returns `None` when any leaf has RLS enabled and no policy admits
2013/// the caller — the join short-circuits to an empty result.
2014fn inject_rls_into_join(
2015    runtime: &RedDBRuntime,
2016    frame: &dyn super::statement_frame::ReadFrame,
2017    mut join: crate::storage::query::ast::JoinQuery,
2018) -> Option<crate::storage::query::ast::JoinQuery> {
2019    use crate::storage::query::ast::Filter;
2020
2021    let mut policy_filters: Vec<Filter> = Vec::new();
2022    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
2023        return None;
2024    }
2025    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
2026        return None;
2027    }
2028
2029    if policy_filters.is_empty() {
2030        return Some(join);
2031    }
2032
2033    let combined = policy_filters
2034        .into_iter()
2035        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
2036        .expect("policy_filters non-empty");
2037
2038    join.filter = Some(match join.filter.take() {
2039        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
2040        None => combined,
2041    });
2042
2043    Some(join)
2044}
2045
2046/// For each `Table` leaf reachable through nested joins, append the
2047/// RLS-policy filter (combined with OR across that side's matching
2048/// policies) into `out`. Returns `false` when a side has RLS enabled
2049/// but no policy admits the caller — the join must short-circuit.
2050fn collect_join_side_policy(
2051    runtime: &RedDBRuntime,
2052    frame: &dyn super::statement_frame::ReadFrame,
2053    expr: &crate::storage::query::ast::QueryExpr,
2054    out: &mut Vec<crate::storage::query::ast::Filter>,
2055) -> bool {
2056    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
2057    match expr {
2058        QueryExpr::Table(t) => {
2059            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
2060                return true;
2061            }
2062            let role = frame.identity().map(|(_, role)| role);
2063            let role_str = role.map(|r| r.as_str().to_string());
2064            let policies =
2065                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
2066            if policies.is_empty() {
2067                return false;
2068            }
2069            let combined = policies
2070                .into_iter()
2071                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
2072                .expect("policies non-empty");
2073            out.push(combined);
2074            true
2075        }
2076        QueryExpr::Join(inner) => {
2077            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
2078                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
2079        }
2080        _ => true,
2081    }
2082}
2083
2084/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
2085///
2086/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
2087/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
2088/// materialises all rows. Projections are best-effort — when the query
2089/// lists explicit columns we keep only those; a `SELECT *` keeps every
2090/// wrapper-emitted field verbatim.
2091///
2092/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
2093/// the runtime will pass the compiled filter down instead of post-filtering.
2094fn apply_foreign_table_filters(
2095    records: Vec<crate::storage::query::unified::UnifiedRecord>,
2096    query: &crate::storage::query::ast::TableQuery,
2097) -> crate::storage::query::unified::UnifiedResult {
2098    use crate::storage::query::sql_lowering::{
2099        effective_table_filter, effective_table_projections,
2100    };
2101    use crate::storage::query::unified::UnifiedResult;
2102
2103    let filter = effective_table_filter(query);
2104    let projections = effective_table_projections(query);
2105
2106    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
2107    // match native-collection queries (same operators, same NULL handling).
2108    let mut filtered: Vec<_> = records
2109        .into_iter()
2110        .filter(|record| match &filter {
2111            Some(f) => {
2112                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
2113            }
2114            None => true,
2115        })
2116        .collect();
2117
2118    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
2119    if let Some(offset) = query.offset {
2120        let offset = offset as usize;
2121        if offset >= filtered.len() {
2122            filtered.clear();
2123        } else {
2124            filtered.drain(0..offset);
2125        }
2126    }
2127    if let Some(limit) = query.limit {
2128        filtered.truncate(limit as usize);
2129    }
2130
2131    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
2132    // the wrapper's column set; an explicit list trims to those names.
2133    let columns: Vec<String> = if projections.is_empty() {
2134        filtered
2135            .first()
2136            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
2137            .unwrap_or_default()
2138    } else {
2139        projections
2140            .iter()
2141            .map(super::join_filter::projection_name)
2142            .collect()
2143    };
2144
2145    let mut result = UnifiedResult::empty();
2146    result.columns = columns;
2147    result.records = filtered;
2148    result
2149}
2150
2151/// Collect every concrete table reference inside a `QueryExpr`.
2152///
2153/// Used by view bookkeeping (dependency tracking for materialised
2154/// invalidation) and any other rewriter that needs to know the base
2155/// tables a query pulls from. Does not descend into projections/filters;
2156/// only the `FROM` side.
2157pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
2158    let mut scopes: HashSet<String> = HashSet::new();
2159    collect_query_expr_result_cache_scopes(&mut scopes, expr);
2160    scopes.into_iter().collect()
2161}
2162
2163fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
2164    let mut scopes = HashSet::new();
2165    collect_query_expr_result_cache_scopes(&mut scopes, expr);
2166    scopes
2167}
2168
// Config key for the result-cache backend selection.
// NOTE(review): presumably its value is parsed into
// `RuntimeResultCacheBackend` — confirm against the reader.
const RESULT_CACHE_BACKEND_KEY: &str = "runtime.result_cache.backend";
// Backend name used when the key above is not set (presumably; confirm
// against the reader).
const RESULT_CACHE_DEFAULT_BACKEND: &str = "legacy";
// Namespace string for result-cache entries — presumably the blob-cache
// namespace they are stored under; confirm against the blob-cache caller.
const RESULT_CACHE_BLOB_NAMESPACE: &str = "runtime.result_cache";
// Issue #802: TTL / capacity are now read from config at call time; these
// constants are the defaults the config falls back to (and match the
// `runtime.result_cache.*` matrix entries).
const RESULT_CACHE_TTL_SECS: u64 = 30;
const RESULT_CACHE_MAX_ENTRIES: usize = 1000;
// Config keys for the enable switch, the TTL in seconds and the capacity
// in entries.
const RESULT_CACHE_ENABLED_KEY: &str = "runtime.result_cache.enabled";
const RESULT_CACHE_TTL_KEY: &str = "runtime.result_cache.ttl_seconds";
const RESULT_CACHE_CAPACITY_KEY: &str = "runtime.result_cache.capacity_entries";
// 8-byte marker `RDRC0001`.
// NOTE(review): presumably the leading magic/version tag of a serialized
// cache payload — confirm against the encoder/decoder.
const RESULT_CACHE_PAYLOAD_MAGIC: &[u8; 8] = b"RDRC0001";
2181
/// Which storage backend serves the runtime result cache.
///
/// NOTE(review): the variant semantics below are inferred from the names
/// and the neighbouring `RESULT_CACHE_*` constants — confirm against the
/// code that selects and uses the backend.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RuntimeResultCacheBackend {
    /// Presumably the original cache; `"legacy"` is the default backend
    /// name.
    Legacy,
    /// Presumably backed by the blob cache.
    BlobCache,
    /// Presumably runs both backends side by side for comparison — TODO
    /// confirm.
    Shadow,
}
2188
2189/// Evict oldest entries until `map` fits in `max_entries`. Returns the
2190/// number of entries evicted so callers can bump the eviction metric
2191/// (issue #802).
2192fn trim_result_cache(
2193    map: &mut HashMap<String, RuntimeResultCacheEntry>,
2194    order: &mut std::collections::VecDeque<String>,
2195    max_entries: usize,
2196) -> u64 {
2197    let mut evicted = 0u64;
2198    while map.len() > max_entries {
2199        if let Some(oldest) = order.pop_front() {
2200            if map.remove(&oldest).is_some() {
2201                evicted += 1;
2202            }
2203        } else {
2204            break;
2205        }
2206    }
2207    evicted
2208}
2209
2210fn result_cache_fingerprint(result: &RuntimeQueryResult) -> String {
2211    format!(
2212        "{:?}|{}|{}|{}|{}|{:?}",
2213        result.result,
2214        result.query,
2215        result.statement,
2216        result.engine,
2217        result.affected_rows,
2218        result.statement_type
2219    )
2220}
2221
2222fn mode_to_byte(mode: crate::storage::query::modes::QueryMode) -> u8 {
2223    match mode {
2224        crate::storage::query::modes::QueryMode::Sql => 0,
2225        crate::storage::query::modes::QueryMode::Gremlin => 1,
2226        crate::storage::query::modes::QueryMode::Cypher => 2,
2227        crate::storage::query::modes::QueryMode::Sparql => 3,
2228        crate::storage::query::modes::QueryMode::Path => 4,
2229        crate::storage::query::modes::QueryMode::Natural => 5,
2230        crate::storage::query::modes::QueryMode::Unknown => 255,
2231    }
2232}
2233
2234fn mode_from_byte(byte: u8) -> Option<crate::storage::query::modes::QueryMode> {
2235    match byte {
2236        0 => Some(crate::storage::query::modes::QueryMode::Sql),
2237        1 => Some(crate::storage::query::modes::QueryMode::Gremlin),
2238        2 => Some(crate::storage::query::modes::QueryMode::Cypher),
2239        3 => Some(crate::storage::query::modes::QueryMode::Sparql),
2240        4 => Some(crate::storage::query::modes::QueryMode::Path),
2241        5 => Some(crate::storage::query::modes::QueryMode::Natural),
2242        255 => Some(crate::storage::query::modes::QueryMode::Unknown),
2243        _ => None,
2244    }
2245}
2246
/// Interns `value` against the fixed set of statement / engine labels
/// that may appear in a serialized result-cache payload.
///
/// Returns the matching `'static` literal on an exact (case-sensitive)
/// match and `None` for anything else.
fn result_cache_static_str(value: &str) -> Option<&'static str> {
    const KNOWN_LABELS: &[&str] = &[
        "select",
        "materialized-graph",
        "runtime-red-schema",
        "runtime-fdw",
        "runtime-table-rls",
        "runtime-table",
        "runtime-join-rls",
        "runtime-join",
        "runtime-vector",
        "runtime-hybrid",
        "runtime-secret",
        "runtime-config",
        "runtime-tenant",
        "runtime-explain",
        "runtime-tree",
        "runtime-kv",
        "runtime-queue",
    ];

    KNOWN_LABELS.iter().copied().find(|label| *label == value)
}
2269
/// Appends `value` to `out` as a little-endian `u32`.
///
/// Returns `None` (leaving `out` untouched) when `value` does not fit
/// in 32 bits.
fn write_u32(out: &mut Vec<u8>, value: usize) -> Option<()> {
    match u32::try_from(value) {
        Ok(narrow) => {
            out.extend_from_slice(&narrow.to_le_bytes());
            Some(())
        }
        Err(_) => None,
    }
}
2275
2276fn write_string(out: &mut Vec<u8>, value: &str) -> Option<()> {
2277    write_u32(out, value.len())?;
2278    out.extend_from_slice(value.as_bytes());
2279    Some(())
2280}
2281
2282fn write_bytes(out: &mut Vec<u8>, value: &[u8]) -> Option<()> {
2283    write_u32(out, value.len())?;
2284    out.extend_from_slice(value);
2285    Some(())
2286}
2287
/// Pops one byte off the front of `input`, advancing the cursor.
///
/// Returns `None` and leaves the cursor unchanged when `input` is empty.
fn read_u8(input: &mut &[u8]) -> Option<u8> {
    let bytes = *input;
    match bytes {
        [first, tail @ ..] => {
            *input = tail;
            Some(*first)
        }
        [] => None,
    }
}
2293
/// Reads a little-endian `u32` from the front of `input`, widened to
/// `usize`, and advances the cursor by four bytes.
///
/// Returns `None` and leaves the cursor unchanged when fewer than four
/// bytes remain.
fn read_u32(input: &mut &[u8]) -> Option<usize> {
    let bytes = *input;
    let head: [u8; 4] = bytes.get(..4)?.try_into().ok()?;
    *input = &bytes[4..];
    Some(u32::from_le_bytes(head) as usize)
}
2302
/// Reads a little-endian `u64` from the front of `input` and advances
/// the cursor by eight bytes.
///
/// Returns `None` and leaves the cursor unchanged when fewer than eight
/// bytes remain.
fn read_u64(input: &mut &[u8]) -> Option<u64> {
    let bytes = *input;
    let head: [u8; 8] = bytes.get(..8)?.try_into().ok()?;
    *input = &bytes[8..];
    Some(u64::from_le_bytes(head))
}
2311
2312fn read_string(input: &mut &[u8]) -> Option<String> {
2313    let len = read_u32(input)?;
2314    if input.len() < len {
2315        return None;
2316    }
2317    let value = String::from_utf8(input[..len].to_vec()).ok()?;
2318    *input = &input[len..];
2319    Some(value)
2320}
2321
2322fn read_bytes<'a>(input: &mut &'a [u8]) -> Option<&'a [u8]> {
2323    let len = read_u32(input)?;
2324    if input.len() < len {
2325        return None;
2326    }
2327    let value = &input[..len];
2328    *input = &input[len..];
2329    Some(value)
2330}
2331
2332fn encode_result_cache_payload(entry: &RuntimeResultCacheEntry) -> Option<Vec<u8>> {
2333    let result = &entry.result;
2334    if result.result.pre_serialized_json.is_some()
2335        || result_cache_static_str(result.statement).is_none()
2336        || result_cache_static_str(result.engine).is_none()
2337        || result_cache_static_str(result.statement_type).is_none()
2338        || result.result.records.iter().any(|record| {
2339            !record.nodes.is_empty()
2340                || !record.edges.is_empty()
2341                || !record.paths.is_empty()
2342                || !record.vector_results.is_empty()
2343        })
2344    {
2345        return None;
2346    }
2347
2348    let mut out = Vec::new();
2349    out.extend_from_slice(RESULT_CACHE_PAYLOAD_MAGIC);
2350    write_string(&mut out, &result.query)?;
2351    out.push(mode_to_byte(result.mode));
2352    write_string(&mut out, result.statement)?;
2353    write_string(&mut out, result.engine)?;
2354    out.extend_from_slice(&result.affected_rows.to_le_bytes());
2355    write_string(&mut out, result.statement_type)?;
2356
2357    write_u32(&mut out, result.result.columns.len())?;
2358    for column in &result.result.columns {
2359        write_string(&mut out, column)?;
2360    }
2361    out.extend_from_slice(&result.result.stats.nodes_scanned.to_le_bytes());
2362    out.extend_from_slice(&result.result.stats.edges_scanned.to_le_bytes());
2363    out.extend_from_slice(&result.result.stats.rows_scanned.to_le_bytes());
2364    out.extend_from_slice(&result.result.stats.exec_time_us.to_le_bytes());
2365
2366    write_u32(&mut out, result.result.records.len())?;
2367    for record in &result.result.records {
2368        let fields = record.iter_fields().collect::<Vec<_>>();
2369        write_u32(&mut out, fields.len())?;
2370        for (name, value) in fields {
2371            write_string(&mut out, name)?;
2372            let mut encoded = Vec::new();
2373            crate::storage::schema::value_codec::encode(value, &mut encoded);
2374            write_bytes(&mut out, &encoded)?;
2375        }
2376    }
2377
2378    write_u32(&mut out, entry.scopes.len())?;
2379    for scope in &entry.scopes {
2380        write_string(&mut out, scope)?;
2381    }
2382    Some(out)
2383}
2384
2385fn decode_result_cache_payload(mut input: &[u8]) -> Option<(RuntimeQueryResult, HashSet<String>)> {
2386    if input.len() < RESULT_CACHE_PAYLOAD_MAGIC.len()
2387        || &input[..RESULT_CACHE_PAYLOAD_MAGIC.len()] != RESULT_CACHE_PAYLOAD_MAGIC
2388    {
2389        return None;
2390    }
2391    input = &input[RESULT_CACHE_PAYLOAD_MAGIC.len()..];
2392
2393    let query = read_string(&mut input)?;
2394    let mode = mode_from_byte(read_u8(&mut input)?)?;
2395    let statement = result_cache_static_str(&read_string(&mut input)?)?;
2396    let engine = result_cache_static_str(&read_string(&mut input)?)?;
2397    let affected_rows = read_u64(&mut input)?;
2398    let statement_type = result_cache_static_str(&read_string(&mut input)?)?;
2399
2400    let mut columns = Vec::new();
2401    for _ in 0..read_u32(&mut input)? {
2402        columns.push(read_string(&mut input)?);
2403    }
2404    let stats = crate::storage::query::unified::QueryStats {
2405        nodes_scanned: read_u64(&mut input)?,
2406        edges_scanned: read_u64(&mut input)?,
2407        rows_scanned: read_u64(&mut input)?,
2408        exec_time_us: read_u64(&mut input)?,
2409    };
2410
2411    let mut records = Vec::new();
2412    for _ in 0..read_u32(&mut input)? {
2413        let mut record = crate::storage::query::unified::UnifiedRecord::new();
2414        for _ in 0..read_u32(&mut input)? {
2415            let name = read_string(&mut input)?;
2416            let bytes = read_bytes(&mut input)?;
2417            let (value, used) = crate::storage::schema::value_codec::decode(bytes).ok()?;
2418            if used != bytes.len() {
2419                return None;
2420            }
2421            record.set_owned(name, value);
2422        }
2423        records.push(record);
2424    }
2425
2426    let mut scopes = HashSet::new();
2427    for _ in 0..read_u32(&mut input)? {
2428        scopes.insert(read_string(&mut input)?);
2429    }
2430    if !input.is_empty() {
2431        return None;
2432    }
2433
2434    Some((
2435        RuntimeQueryResult {
2436            query,
2437            mode,
2438            statement,
2439            engine,
2440            result: crate::storage::query::unified::UnifiedResult {
2441                columns,
2442                records,
2443                stats,
2444                pre_serialized_json: None,
2445            },
2446            affected_rows,
2447            statement_type,
2448            bookmark: None,
2449        },
2450        scopes,
2451    ))
2452}
2453
// NOTE: this paragraph documents `query_has_volatile_builtin` (defined
// further down in this file), not `strip_explain_prefix`. It is kept as
// a plain comment so it no longer leaks into the wrong rustdoc entry.
//
// Heuristic: does the raw SQL reference a built-in whose output
// varies by connection, clock, or randomness? Such queries must
// skip the 30s result cache — see the call site for rationale.
//
// ASCII case-insensitive substring match. False positives (the
// token appears in a quoted string) only skip caching, which is
// the conservative direction.
/// Returns the statement wrapped by a leading `EXPLAIN`, if any.
///
/// The first whitespace-delimited token of `sql` (after leading
/// whitespace) must be `EXPLAIN`, compared ASCII case-insensitively. The
/// return value is everything after it with leading whitespace trimmed;
/// trailing whitespace is preserved.
///
/// Returns `None` when:
/// - the first token is not `EXPLAIN`, or nothing follows it;
/// - the next token is `ALTER` — `EXPLAIN ALTER FOR CREATE TABLE ...` is
///   a separate schema-diff command handled inside the normal SQL parser;
/// - the next token is `ASK` — `EXPLAIN ASK` is an executable read path:
///   it runs retrieval and provider selection, then short-circuits before
///   the LLM call, so it is also left to the normal parser.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    /// Splits `text` at its first whitespace character: `(token, rest)`.
    fn split_first_token(text: &str) -> (&str, &str) {
        match text.find(char::is_whitespace) {
            Some(boundary) => text.split_at(boundary),
            None => (text, ""),
        }
    }

    let (keyword, tail) = split_first_token(sql.trim_start());
    if !keyword.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    let inner = tail.trim_start();
    if inner.is_empty() {
        return None;
    }

    let (next_token, _) = split_first_token(inner);
    let deferred_to_parser = ["ALTER", "ASK"]
        .iter()
        .any(|reserved| next_token.eq_ignore_ascii_case(reserved));
    if deferred_to_parser {
        None
    } else {
        Some(inner)
    }
}
2492
2493/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
2494/// CTE-aware parse in `execute_query` without paying for a full
2495/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
2496/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
2497pub(super) fn has_with_prefix(sql: &str) -> bool {
2498    let trimmed = sql.trim_start();
2499    let head_end = trimmed
2500        .find(|c: char| c.is_whitespace() || c == '(')
2501        .unwrap_or(trimmed.len());
2502    trimmed[..head_end].eq_ignore_ascii_case("WITH")
2503}
2504
2505/// If the query is a plain SELECT whose top-level `TableQuery`
2506/// carries an `AS OF` clause, return a typed spec that the runtime
2507/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
2508/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
2509/// back to the connection's regular MVCC snapshot. A cheap textual
2510/// prefilter skips the parse entirely when the source doesn't
2511/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
2512fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
2513    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
2514}
2515
2516/// Same as `peek_top_level_as_of` but also returns the table name
2517/// targeted by the AS OF clause (when the FROM clause names a
2518/// concrete table). `None` for the table slot means scalar SELECT
2519/// or a subquery source — callers treat those as "no enforcement".
2520pub(super) fn peek_top_level_as_of_with_table(
2521    sql: &str,
2522) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
2523    if !sql
2524        .as_bytes()
2525        .windows(5)
2526        .any(|w| w.eq_ignore_ascii_case(b"as of"))
2527    {
2528        return None;
2529    }
2530    let parsed = crate::storage::query::parser::parse(sql).ok()?;
2531    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
2532        return None;
2533    };
2534    let clause = table.as_of?;
2535    let table_name = if table.table.is_empty() || table.table == "any" {
2536        None
2537    } else {
2538        Some(table.table.clone())
2539    };
2540    let spec = match clause {
2541        crate::storage::query::ast::AsOfClause::Commit(h) => {
2542            crate::application::vcs::AsOfSpec::Commit(h)
2543        }
2544        crate::storage::query::ast::AsOfClause::Branch(b) => {
2545            crate::application::vcs::AsOfSpec::Branch(b)
2546        }
2547        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
2548        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
2549            crate::application::vcs::AsOfSpec::TimestampMs(ts)
2550        }
2551        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
2552            crate::application::vcs::AsOfSpec::Snapshot(x)
2553        }
2554    };
2555    Some((spec, table_name))
2556}
2557
2558pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
2559    // Lowercase the bytes up to the first null/newline into a small
2560    // stack buffer for cheap contains() checks. Most SQL fits in the
2561    // buffer; longer queries fall back to owned lowercase.
2562    const VOLATILE_TOKENS: &[&str] = &[
2563        "pg_advisory_lock",
2564        "pg_try_advisory_lock",
2565        "pg_advisory_unlock",
2566        "random()",
2567        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
2568        // omitted for now — they ARE volatile but today's tests rely
2569        // on caching them. Revisit once a tighter volatility story
2570        // lands.
2571    ];
2572    let lowered = sql.to_ascii_lowercase();
2573    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
2574}
2575
2576pub(super) fn query_is_ask_statement(sql: &str) -> bool {
2577    let trimmed = sql.trim_start();
2578    let head_end = trimmed
2579        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
2580        .unwrap_or(trimmed.len());
2581    trimmed[..head_end].eq_ignore_ascii_case("ASK")
2582}
2583
2584/// Pick the `(global_mode, collection_mode)` pair for an expression,
2585/// or `None` for variants that opt out of intent-locking entirely
2586/// (admin statements like `SHOW CONFIG`, transaction control, tenant
2587/// toggles).
2588///
2589/// Phase-1 contract:
2590/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
2591/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
2592/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
2593pub(super) fn intent_lock_modes_for(
2594    expr: &QueryExpr,
2595) -> Option<(
2596    crate::storage::transaction::lock::LockMode,
2597    crate::storage::transaction::lock::LockMode,
2598)> {
2599    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
2600
2601    match expr {
2602        // Reads — IS / IS.
2603        QueryExpr::Table(_)
2604        | QueryExpr::Join(_)
2605        | QueryExpr::Vector(_)
2606        | QueryExpr::Hybrid(_)
2607        | QueryExpr::Graph(_)
2608        | QueryExpr::Path(_)
2609        | QueryExpr::Ask(_)
2610        | QueryExpr::SearchCommand(_)
2611        | QueryExpr::GraphCommand(_)
2612        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
2613
2614        // Writes — IX / IX. Non-tabular mutations (vector insert,
2615        // graph node insert, queue push, timeseries point insert)
2616        // don't carry their own dispatch arm here; they ride through
2617        // the Insert variant or a command variant covered by the
2618        // read-side arm above. P1.T4 expands only the TableQuery-ish
2619        // writes; non-tabular kinds inherit when their DML variants
2620        // land in later phases.
2621        QueryExpr::Insert(_)
2622        | QueryExpr::Update(_)
2623        | QueryExpr::Delete(_)
2624        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
2625            Some((IntentExclusive, IntentExclusive))
2626        }
2627        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
2628
2629        // DDL — IX / X. A DDL against collection `c` blocks all
2630        // other writers + readers on `c` but leaves other collections
2631        // running (because Global stays IX, not X).
2632        QueryExpr::CreateTable(_)
2633        | QueryExpr::CreateCollection(_)
2634        | QueryExpr::CreateVector(_)
2635        | QueryExpr::DropTable(_)
2636        | QueryExpr::DropGraph(_)
2637        | QueryExpr::DropVector(_)
2638        | QueryExpr::DropDocument(_)
2639        | QueryExpr::DropKv(_)
2640        | QueryExpr::DropCollection(_)
2641        | QueryExpr::Truncate(_)
2642        | QueryExpr::AlterTable(_)
2643        | QueryExpr::CreateIndex(_)
2644        | QueryExpr::DropIndex(_)
2645        | QueryExpr::CreateTimeSeries(_)
2646        | QueryExpr::CreateMetric(_)
2647        | QueryExpr::AlterMetric(_)
2648        | QueryExpr::CreateSlo(_)
2649        | QueryExpr::DropTimeSeries(_)
2650        | QueryExpr::CreateQueue(_)
2651        | QueryExpr::AlterQueue(_)
2652        | QueryExpr::DropQueue(_)
2653        | QueryExpr::CreateTree(_)
2654        | QueryExpr::DropTree(_)
2655        | QueryExpr::CreatePolicy(_)
2656        | QueryExpr::DropPolicy(_)
2657        | QueryExpr::CreateView(_)
2658        | QueryExpr::DropView(_)
2659        | QueryExpr::RefreshMaterializedView(_)
2660        | QueryExpr::CreateSchema(_)
2661        | QueryExpr::DropSchema(_)
2662        | QueryExpr::CreateSequence(_)
2663        | QueryExpr::DropSequence(_)
2664        | QueryExpr::CreateServer(_)
2665        | QueryExpr::DropServer(_)
2666        | QueryExpr::CreateForeignTable(_)
2667        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
2668
2669        // Admin / control — skip intent locks. `SET TENANT`,
2670        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
2671        // `VACUUM`, etc. don't touch collection data the same way
2672        // and the existing transaction layer already serialises the
2673        // pieces that matter.
2674        _ => None,
2675    }
2676}
2677
2678/// Best-effort collection inventory for an expression. Used to pick
2679/// `Collection(...)` resources for the intent-lock guard. Overshoots
2680/// are fine (take an extra IS, benign); undershoots leak writes past
2681/// DDL X locks, so err on the side of listing more names.
2682pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
2683    let mut out = Vec::new();
2684    walk_collections(expr, &mut out);
2685    out.sort();
2686    out.dedup();
2687    out
2688}
2689
2690fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
2691    match expr {
2692        QueryExpr::Table(t) => out.push(t.table.clone()),
2693        QueryExpr::Join(j) => {
2694            walk_collections(&j.left, out);
2695            walk_collections(&j.right, out);
2696        }
2697        QueryExpr::Insert(i) => out.push(i.table.clone()),
2698        QueryExpr::Update(u) => out.push(u.table.clone()),
2699        QueryExpr::Delete(d) => out.push(d.table.clone()),
2700        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
2701
2702        // DDL — include the target collection so DDL takes
2703        // `(Collection, X)` and blocks concurrent readers / writers
2704        // on the same collection. Other collections stay live
2705        // because Global is still IX.
2706        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
2707        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
2708        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
2709        QueryExpr::DropTable(q) => out.push(q.name.clone()),
2710        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
2711        QueryExpr::DropVector(q) => out.push(q.name.clone()),
2712        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
2713        QueryExpr::DropKv(q) => out.push(q.name.clone()),
2714        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
2715        QueryExpr::Truncate(q) => out.push(q.name.clone()),
2716        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
2717        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
2718        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
2719        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
2720        QueryExpr::CreateMetric(q) => out.push(q.path.clone()),
2721        QueryExpr::AlterMetric(q) => out.push(q.path.clone()),
2722        QueryExpr::CreateSlo(q) => out.push(q.path.clone()),
2723        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
2724        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
2725        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
2726        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
2727        QueryExpr::QueueCommand(QueueCommand::Move {
2728            source,
2729            destination,
2730            ..
2731        }) => {
2732            out.push(source.clone());
2733            out.push(destination.clone());
2734        }
2735        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
2736        QueryExpr::CreateView(q) => out.push(q.name.clone()),
2737        QueryExpr::DropView(q) => out.push(q.name.clone()),
2738        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
2739
2740        // Vector / Hybrid / Graph / Path / commands reference
2741        // collections through fields whose shape varies; without a
2742        // uniform accessor we fall back to the global lock only —
2743        // benign because every runtime path still holds the global
2744        // mode.
2745        _ => {}
2746    }
2747}
2748
2749impl RedDBRuntime {
2750    pub fn in_memory() -> RedDBResult<Self> {
2751        Self::with_options(RedDBOptions::in_memory())
2752    }
2753
2754    /// Handle to the intent-lock manager for tests + introspection.
2755    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
2756    /// rather than touching the manager directly.
2757    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
2758        self.inner.lock_manager.clone()
2759    }
2760
2761    /// Process-local governance registry for managed policy/config guardrails.
2762    pub fn config_registry(&self) -> std::sync::Arc<crate::auth::registry::ConfigRegistry> {
2763        self.inner.config_registry.clone()
2764    }
2765
2766    pub fn query_audit(&self) -> std::sync::Arc<crate::runtime::query_audit::QueryAuditStream> {
2767        self.inner.query_audit.clone()
2768    }
2769
2770    pub fn control_events_require_persistence(&self) -> bool {
2771        self.inner.control_event_config.require_persistence()
2772    }
2773
    /// Returns the runtime's control-event configuration by value, as
    /// stored in the runtime's inner state.
    pub fn control_event_config(&self) -> crate::runtime::control_events::ControlEventConfig {
        self.inner.control_event_config
    }
2777
2778    pub fn control_event_ledger(
2779        &self,
2780    ) -> Arc<dyn crate::runtime::control_events::ControlEventLedger> {
2781        self.inner.control_event_ledger.read().clone()
2782    }
2783
2784    #[doc(hidden)]
2785    pub fn replace_control_event_ledger_for_tests(
2786        &self,
2787        ledger: Arc<dyn crate::runtime::control_events::ControlEventLedger>,
2788    ) {
2789        *self.inner.control_event_ledger.write() = ledger;
2790    }
2791
2792    #[inline(never)]
2793    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
2794        Self::with_pool(options, ConnectionPoolConfig::default())
2795    }
2796
2797    pub fn with_pool(
2798        options: RedDBOptions,
2799        pool_config: ConnectionPoolConfig,
2800    ) -> RedDBResult<Self> {
2801        // PLAN.md Phase 9.1 — capture wall-clock before storage
2802        // open so the cold-start phase markers can be backfilled
2803        // once Lifecycle is constructed below. Storage open
2804        // encapsulates auto-restore + WAL replay; we treat the
2805        // whole window as one combined "restore" + "wal_replay"
2806        // phase split at the same boundary because the storage
2807        // layer doesn't yet emit a finer signal.
2808        let boot_open_start_ms = std::time::SystemTime::now()
2809            .duration_since(std::time::UNIX_EPOCH)
2810            .map(|d| d.as_millis() as u64)
2811            .unwrap_or(0);
2812        let db = Arc::new(
2813            RedDB::open_with_options(&options)
2814                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2815        );
2816        let result_blob_cache = crate::storage::cache::BlobCache::open_with_l2(
2817            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2818                options
2819                    .resolved_path("data.rdb")
2820                    .with_extension("result-cache.l2"),
2821            ),
2822        )
2823        .map_err(|err| {
2824            RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}"))
2825        })?;
2826        let storage_ready_ms = std::time::SystemTime::now()
2827            .duration_since(std::time::UNIX_EPOCH)
2828            .map(|d| d.as_millis() as u64)
2829            .unwrap_or(0);
2830
2831        let runtime = Self {
2832            inner: Arc::new(RuntimeInner {
2833                db: db.clone(),
2834                layout: PhysicalLayout::from_options(&options),
2835                indices: IndexCatalog::register_default_vector_graph(
2836                    options.has_capability(crate::api::Capability::Table),
2837                    options.has_capability(crate::api::Capability::Graph),
2838                ),
2839                pool_config,
2840                pool: Mutex::new(PoolState::default()),
2841                started_at_unix_ms: SystemTime::now()
2842                    .duration_since(UNIX_EPOCH)
2843                    .unwrap_or_default()
2844                    .as_millis(),
2845                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2846                index_store: super::index_store::IndexStore::new(),
2847                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2848                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2849                query_cache: parking_lot::RwLock::new(
2850                    crate::storage::query::planner::cache::PlanCache::new(1000),
2851                ),
2852                result_cache: parking_lot::RwLock::new((
2853                    HashMap::new(),
2854                    std::collections::VecDeque::new(),
2855                )),
2856                result_blob_cache,
2857                result_blob_entries: parking_lot::RwLock::new((
2858                    HashMap::new(),
2859                    std::collections::VecDeque::new(),
2860                )),
2861                ask_answer_cache_entries: parking_lot::RwLock::new((
2862                    HashSet::new(),
2863                    std::collections::VecDeque::new(),
2864                )),
2865                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2866                result_cache_hits: std::sync::atomic::AtomicU64::new(0),
2867                result_cache_misses: std::sync::atomic::AtomicU64::new(0),
2868                result_cache_evictions: std::sync::atomic::AtomicU64::new(0),
2869                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2870                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2871                rmw_locks: RmwLockTable::new(),
2872                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2873                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2874                config_registry: Arc::new(crate::auth::registry::ConfigRegistry::new()),
2875                ec_worker: crate::ec::worker::EcWorker::new(),
2876                auth_store: parking_lot::RwLock::new(None),
2877                oauth_validator: parking_lot::RwLock::new(None),
2878                views: parking_lot::RwLock::new(HashMap::new()),
2879                materialized_views: parking_lot::RwLock::new(
2880                    crate::storage::cache::result::MaterializedViewCache::new(),
2881                ),
2882                retention_sweeper: parking_lot::RwLock::new(
2883                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
2884                ),
2885                snapshot_manager: Arc::new(
2886                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2887                ),
2888                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2889                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2890                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2891                lock_manager: Arc::new({
2892                    // Sourced from the matrix: Tier B key
2893                    // `concurrency.locking.deadlock_timeout_ms`
2894                    // (default 5000). Env var wins at boot so
2895                    // operators can tune without touching red_config.
2896                    let env = crate::runtime::config_overlay::collect_env_overrides();
2897                    let timeout_ms = env
2898                        .get("concurrency.locking.deadlock_timeout_ms")
2899                        .and_then(|raw| raw.parse::<u64>().ok())
2900                        .unwrap_or_else(|| {
2901                            match crate::runtime::config_matrix::default_for(
2902                                "concurrency.locking.deadlock_timeout_ms",
2903                            ) {
2904                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2905                                _ => 5000,
2906                            }
2907                        });
2908                    let cfg = crate::storage::transaction::lock::LockConfig {
2909                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2910                        ..Default::default()
2911                    };
2912                    crate::storage::transaction::lock::LockManager::new(cfg)
2913                }),
2914                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2915                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2916                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2917                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2918                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2919                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2920                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2921                queue_wait_registry: std::sync::Arc::new(
2922                    crate::runtime::queue_wait_registry::QueueWaitRegistry::new(),
2923                ),
2924                pending_queue_wakes: parking_lot::RwLock::new(HashMap::new()),
2925                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2926                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2927                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2928                    &options,
2929                )),
2930                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2931                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2932                audit_log: {
2933                    // Default audit-log path for the in-memory case
2934                    // sits in the system temp dir; persistent runs
2935                    // place it next to data.rdb.
2936                    //
2937                    // gh-471 iter 2: route through the resolved
2938                    // `LogDestination`. Performance/Max tiers emit a
2939                    // `File(...)` under `<dbname>.rdb.red/logs/`;
2940                    // lower tiers / ephemeral runs report `Stderr`
2941                    // and we keep the legacy file-next-to-data sink.
2942                    let data_path = options
2943                        .data_path
2944                        .clone()
2945                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2946                    let (audit_dest, _) = crate::api::tier_wiring::current_log_destinations();
2947                    Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
2948                        &audit_dest,
2949                        &data_path,
2950                    ))
2951                },
2952                control_event_ledger: parking_lot::RwLock::new(Arc::new(
2953                    crate::runtime::control_events::RuntimeLedger::new(db.store()),
2954                )),
2955                control_event_config: options.control_events,
2956                query_audit: Arc::new(crate::runtime::query_audit::QueryAuditStream::new(
2957                    db.store(),
2958                    options.query_audit.clone(),
2959                )),
2960                lease_lifecycle: std::sync::OnceLock::new(),
2961                replica_apply_metrics: std::sync::Arc::new(
2962                    crate::replication::logical::ReplicaApplyMetrics::default(),
2963                ),
2964                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2965                schema_vocabulary: parking_lot::RwLock::new(
2966                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2967                ),
2968                slow_query_logger: {
2969                    // Issue #205 — slow-query sink lives in the same
2970                    // directory the audit log uses, so backup/restore
2971                    // ships them together. Threshold + sample-pct
2972                    // default conservatively (1 s, 100% sampling) so
2973                    // emitted lines are rare and complete. Operators
2974                    // tune via env / config matrix in a follow-up.
2975                    //
2976                    // gh-471 iter 2: same routing as the audit log —
2977                    // `LogDestination::File(...)` for Performance/Max
2978                    // lands under `<dbname>.rdb.red/logs/slow.log`;
2979                    // lower tiers fall back to `red-slow.log` in the
2980                    // data directory.
2981                    let fallback_dir = options
2982                        .data_path
2983                        .as_ref()
2984                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2985                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
2986                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2987                        .ok()
2988                        .and_then(|s| s.parse::<u64>().ok())
2989                        .unwrap_or(1000);
2990                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2991                        .ok()
2992                        .and_then(|s| s.parse::<u8>().ok())
2993                        .unwrap_or(100);
2994                    let (_, slow_dest) = crate::api::tier_wiring::current_log_destinations();
2995                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
2996                        &slow_dest,
2997                        &fallback_dir,
2998                        threshold_ms,
2999                        sample_pct,
3000                    )
3001                },
3002                kv_stats: crate::runtime::KvStatsCounters::default(),
3003                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
3004                metrics_tenant_activity_stats:
3005                    crate::runtime::MetricsTenantActivityCounters::default(),
3006                queue_telemetry: Arc::new(
3007                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
3008                ),
3009                queue_presence: Arc::new(
3010                    crate::storage::queue::presence::ConsumerPresenceRegistry::new(),
3011                ),
3012                vector_introspection: Arc::new(
3013                    crate::storage::vector::introspection::VectorIntrospectionRegistry::new(),
3014                ),
3015                kv_tag_index: crate::runtime::KvTagIndex::default(),
3016                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
3017                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
3018                integrity_tombstones: parking_lot::Mutex::new(Vec::new()),
3019                integrity_tombstones_state: std::sync::atomic::AtomicU8::new(0),
3020            }),
3021        };
3022
3023        // Issue #205 — install the process-wide OperatorEvent sink so
3024        // emit sites buried in storage / replication / signal handlers
3025        // can record without threading an `&AuditLogger` through every
3026        // call stack. First registration wins; subsequent in-memory
3027        // runtimes (test harnesses) fall through to tracing+eprintln.
3028        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
3029            &runtime.inner.audit_log,
3030        ));
3031
3032        // PLAN.md Phase 9.1 — backfill cold-start phase markers
3033        // from the wall-clock captured before storage open. The
3034        // entire `RedDB::open_with_options` call covers both
3035        // auto-restore (when configured) and WAL replay. We
3036        // record both phases against the same boundary today;
3037        // a follow-up will split them once the storage layer
3038        // surfaces a finer-grained event.
3039        runtime
3040            .inner
3041            .lifecycle
3042            .set_restore_started_at_ms(boot_open_start_ms);
3043        runtime
3044            .inner
3045            .lifecycle
3046            .set_restore_ready_at_ms(storage_ready_ms);
3047        runtime
3048            .inner
3049            .lifecycle
3050            .set_wal_replay_started_at_ms(boot_open_start_ms);
3051        runtime
3052            .inner
3053            .lifecycle
3054            .set_wal_replay_ready_at_ms(storage_ready_ms);
3055
3056        let restored_cdc_lsn = runtime
3057            .inner
3058            .db
3059            .replication
3060            .as_ref()
3061            .map(|repl| {
3062                repl.logical_wal_spool
3063                    .as_ref()
3064                    .map(|spool| spool.current_lsn())
3065                    .unwrap_or(0)
3066            })
3067            .unwrap_or(0)
3068            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
3069        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
3070        runtime.rehydrate_snapshot_xid_floor();
3071        runtime.bootstrap_system_keyed_collections()?;
3072        runtime.rehydrate_declared_column_schemas();
3073        runtime.load_probabilistic_state()?;
3074
3075        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
3076        // tables declared via `TENANT BY (col)` survive restart. Each
3077        // entry re-registers the auto-policy and flips RLS on again.
3078        runtime.rehydrate_tenant_tables();
3079        // Issue #593 slice 9a — replay persisted materialized-view
3080        // descriptors so `CREATE MATERIALIZED VIEW v AS …` survives a
3081        // restart. Runs after the system-keyed collections bootstrap
3082        // and before the API opens.
3083        runtime.rehydrate_materialized_view_descriptors();
3084        if let Some(repl) = &runtime.inner.db.replication {
3085            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
3086        }
3087
3088        // Save system info to red_config on boot
3089        {
3090            let sys = SystemInfo::collect();
3091            runtime.inner.db.store().set_config_tree(
3092                "red.system",
3093                &crate::serde_json::json!({
3094                    "pid": sys.pid,
3095                    "cpu_cores": sys.cpu_cores,
3096                    "total_memory_bytes": sys.total_memory_bytes,
3097                    "available_memory_bytes": sys.available_memory_bytes,
3098                    "os": sys.os,
3099                    "arch": sys.arch,
3100                    "hostname": sys.hostname,
3101                    "started_at": SystemTime::now()
3102                        .duration_since(UNIX_EPOCH)
3103                        .unwrap_or_default()
3104                        .as_millis() as u64
3105                }),
3106            );
3107
3108            // Seed defaults on first boot (only if red_config is empty or missing defaults)
3109            let store = runtime.inner.db.store();
3110            if store
3111                .get_collection("red_config")
3112                .map(|m| m.query_all(|_| true).len())
3113                .unwrap_or(0)
3114                <= 10
3115            {
3116                store.set_config_tree("red.ai", &crate::json!({
3117                    "default": crate::json!({
3118                        "provider": "openai",
3119                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
3120                    }),
3121                    "max_embedding_inputs": 256,
3122                    "max_prompt_batch": 256,
3123                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
3124                }));
3125                store.set_config_tree(
3126                    "red.server",
3127                    &crate::json!({
3128                        "max_scan_limit": 1000,
3129                        "max_body_size": 1048576,
3130                        "read_timeout_ms": 5000,
3131                        "write_timeout_ms": 5000
3132                    }),
3133                );
3134                store.set_config_tree(
3135                    "red.storage",
3136                    &crate::json!({
3137                        "page_size": 4096,
3138                        "page_cache_capacity": 100000,
3139                        "auto_checkpoint_pages": 1000,
3140                        "snapshot_retention": 16,
3141                        "verify_checksums": true,
3142                        "segment": crate::json!({
3143                            "max_entities": 100000,
3144                            "max_bytes": 268435456_u64,
3145                            "compression_level": 6
3146                        }),
3147                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
3148                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
3149                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
3150                    }),
3151                );
3152                store.set_config_tree(
3153                    "red.search",
3154                    &crate::json!({
3155                        "rag": crate::json!({
3156                            "max_chunks_per_source": 10,
3157                            "max_total_chunks": 25,
3158                            "similarity_threshold": 0.8,
3159                            "graph_depth": 2,
3160                            "min_relevance": 0.3
3161                        }),
3162                        "fusion": crate::json!({
3163                            "vector_weight": 0.5,
3164                            "graph_weight": 0.3,
3165                            "table_weight": 0.2,
3166                            "dedup_threshold": 0.85
3167                        })
3168                    }),
3169                );
3170                store.set_config_tree(
3171                    "red.auth",
3172                    &crate::json!({
3173                        "enabled": false,
3174                        "session_ttl_secs": 3600,
3175                        "require_auth": false
3176                    }),
3177                );
3178                store.set_config_tree(
3179                    "red.query",
3180                    &crate::json!({
3181                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
3182                        "max_recursion_depth": 1000
3183                    }),
3184                );
3185                store.set_config_tree(
3186                    "red.indexes",
3187                    &crate::json!({
3188                        "auto_select": true,
3189                        "bloom_filter": crate::json!({
3190                            "enabled": true,
3191                            "false_positive_rate": 0.01,
3192                            "prune_on_scan": true
3193                        }),
3194                        "hash": crate::json!({ "enabled": true }),
3195                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
3196                        "spatial": crate::json!({ "enabled": true })
3197                    }),
3198                );
3199                store.set_config_tree(
3200                    "red.memtable",
3201                    &crate::json!({
3202                        "enabled": true,
3203                        "max_bytes": 67108864_u64,
3204                        "flush_threshold": 0.75
3205                    }),
3206                );
3207                store.set_config_tree(
3208                    "red.probabilistic",
3209                    &crate::json!({
3210                        "hll_registers": 16384,
3211                        "sketch_default_width": 1000,
3212                        "sketch_default_depth": 5,
3213                        "filter_default_capacity": 100000
3214                    }),
3215                );
3216                store.set_config_tree(
3217                    "red.timeseries",
3218                    &crate::json!({
3219                        "default_chunk_size": 1024,
3220                        "compression": crate::json!({
3221                            "timestamps": "delta_of_delta",
3222                            "values": "gorilla_xor"
3223                        }),
3224                        "default_retention_days": 0
3225                    }),
3226                );
3227                store.set_config_tree(
3228                    "red.queue",
3229                    &crate::json!({
3230                        "default_max_size": 0,
3231                        "default_max_attempts": 3,
3232                        "visibility_timeout_ms": 30000,
3233                        "consumer_idle_timeout_ms": 60000
3234                    }),
3235                );
3236                store.set_config_tree(
3237                    "red.backup",
3238                    &crate::json!({
3239                        "enabled": false,
3240                        "interval_secs": 3600,
3241                        "retention_count": 24,
3242                        "upload": false,
3243                        "backend": "local"
3244                    }),
3245                );
3246                store.set_config_tree(
3247                    "red.wal",
3248                    &crate::json!({
3249                        "archive": crate::json!({
3250                            "enabled": false,
3251                            "retention_hours": 168,
3252                            "prefix": "wal/"
3253                        })
3254                    }),
3255                );
3256                store.set_config_tree(
3257                    "red.cdc",
3258                    &crate::json!({
3259                        "enabled": true,
3260                        "buffer_size": 100000
3261                    }),
3262                );
3263                store.set_config_tree(
3264                    "red.config.secret",
3265                    &crate::json!({
3266                        "auto_encrypt": true,
3267                        "auto_decrypt": true
3268                    }),
3269                );
3270            }
3271
3272            // Perf-parity config matrix: heal the Tier A (critical)
3273            // keys unconditionally on every boot. Idempotent — only
3274            // writes the default when the key is missing. Keeps
3275            // `SHOW CONFIG` showing every guarantee the operator has
3276            // (durability.mode, concurrency.locking.enabled, …) even
3277            // on long-running datadirs that predate the matrix.
3278            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
3279
3280            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
3281            // `storage.btree.lehman_yao` value from the matrix (env
3282            // > file > red_config > default) and publish it to the
3283            // storage layer's atomic so the B-tree read / split
3284            // paths can branch without re-reading the config on
3285            // every hot-path call.
3286            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
3287            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
3288            if lehman_yao {
3289                tracing::info!(
3290                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
3291                );
3292            }
3293
3294            // Config file overlay — mounted `/etc/reddb/config.json`
3295            // (override path via REDDB_CONFIG_FILE). Writes keys with
3296            // write-if-absent semantics so a later user `SET CONFIG`
3297            // always wins. Missing file = silent no-op.
3298            let overlay_path = crate::runtime::config_overlay::config_file_path();
3299            let _ =
3300                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
3301        }
3302
3303        // VCS ("Git for Data") — create the `red_*` metadata
3304        // collections on first boot. Idempotent: `get_or_create_collection`
3305        // is a no-op if the collection already exists.
3306        {
3307            let store = runtime.inner.db.store();
3308            for name in crate::application::vcs_collections::ALL {
3309                let _ = store.get_or_create_collection(*name);
3310            }
3311            // Seed VCS config namespace with sensible defaults on first
3312            // boot, matching the pattern used by red.ai / red.storage.
3313            store.set_config_tree(
3314                crate::application::vcs_collections::CONFIG_NAMESPACE,
3315                &crate::json!({
3316                    "default_branch": "main",
3317                    "author": crate::json!({
3318                        "name": "reddb",
3319                        "email": "reddb@localhost"
3320                    }),
3321                    "protected_branches": crate::json!(["main"]),
3322                    "closure": crate::json!({
3323                        "enabled": true,
3324                        "lazy": true
3325                    }),
3326                    "merge": crate::json!({
3327                        "default_strategy": "auto",
3328                        "fast_forward": true
3329                    })
3330                }),
3331            );
3332        }
3333
3334        // Migrations — create the `red_migrations` / `red_migration_deps`
3335        // system collections on first boot. Idempotent.
3336        {
3337            let store = runtime.inner.db.store();
3338            for name in crate::application::migration_collections::ALL {
3339                let _ = store.get_or_create_collection(*name);
3340            }
3341        }
3342
3343        // Topology graph (#803) — ensure the built-in `red.topology.cluster`
3344        // graph collection (declared WITH ANALYTICS) and its metadata sidecar
3345        // exist. Idempotent and survives restarts via the WAL-backed contract.
3346        let _ = crate::application::topology_collections::ensure(&runtime);
3347
3348        // Start background maintenance thread (context index refresh +
3349        // session purge). Held by a WEAK reference to `RuntimeInner`
3350        // so dropping the last `RedDBRuntime` handle actually releases
3351        // the underlying Arc<Pager> (and its file lock). Polling at
3352        // 200ms means shutdown latency is bounded; the real 60-second
3353        // work cadence is tracked independently via a `last_work`
3354        // timestamp.
3355        //
3356        // The previous version captured `rt = runtime.clone()` by
3357        // strong reference and ran an unterminated `loop`, which held
3358        // Arc<RuntimeInner> forever — reopening a persistent database
3359        // in the same process failed with "Database is locked" because
3360        // the pager could never drop. See the regression test
3361        // `finding_1_select_after_bulk_insert_persistent_reopen`.
3362        {
3363            let weak = Arc::downgrade(&runtime.inner);
3364            std::thread::Builder::new()
3365                .name("reddb-maintenance".into())
3366                .spawn(move || {
3367                    let tick = std::time::Duration::from_millis(200);
3368                    let work_interval = std::time::Duration::from_secs(60);
3369                    let mut last_work = std::time::Instant::now();
3370                    loop {
3371                        std::thread::sleep(tick);
3372                        let Some(inner) = weak.upgrade() else {
3373                            // All strong references dropped — the
3374                            // runtime is gone, exit cleanly.
3375                            break;
3376                        };
3377                        if last_work.elapsed() >= work_interval {
3378                            let _stats = inner.db.store().context_index().stats();
3379                            last_work = std::time::Instant::now();
3380                        }
3381                    }
3382                })
3383                .ok();
3384        }
3385
3386        // Start backup scheduler if enabled via red_config
3387        {
3388            let store = runtime.inner.db.store();
3389            let mut backup_enabled = false;
3390            let mut backup_interval = 3600u64;
3391
3392            if let Some(manager) = store.get_collection("red_config") {
3393                manager.for_each_entity(|entity| {
3394                    if let Some(row) = entity.data.as_row() {
3395                        let key = row.get_field("key").and_then(|v| match v {
3396                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3397                            _ => None,
3398                        });
3399                        let val = row.get_field("value");
3400                        if key == Some("red.config.backup.enabled") {
3401                            backup_enabled = match val {
3402                                Some(crate::storage::schema::Value::Boolean(true)) => true,
3403                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
3404                                _ => false,
3405                            };
3406                        } else if key == Some("red.config.backup.interval_secs") {
3407                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
3408                                backup_interval = *n as u64;
3409                            }
3410                        }
3411                    }
3412                    true
3413                });
3414            }
3415
3416            if backup_enabled {
3417                runtime.inner.backup_scheduler.set_interval(backup_interval);
3418                let rt = runtime.clone();
3419                runtime
3420                    .inner
3421                    .backup_scheduler
3422                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
3423            }
3424        }
3425
3426        // Load EC registry from red_config and start worker
3427        {
3428            runtime
3429                .inner
3430                .ec_registry
3431                .load_from_config_store(runtime.inner.db.store().as_ref());
3432            if !runtime.inner.ec_registry.async_configs().is_empty() {
3433                runtime.inner.ec_worker.start(
3434                    Arc::clone(&runtime.inner.ec_registry),
3435                    Arc::clone(&runtime.inner.db.store()),
3436                );
3437            }
3438        }
3439
3440        if let crate::replication::ReplicationRole::Replica { primary_addr } =
3441            runtime.inner.db.options().replication.role.clone()
3442        {
3443            let rt = runtime.clone();
3444            std::thread::Builder::new()
3445                .name("reddb-replica".into())
3446                .spawn(move || rt.run_replica_loop(primary_addr))
3447                .ok();
3448        }
3449
3450        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
3451        // boot stage above has completed (WAL replay, restore-from-
3452        // remote, replica-loop spawn). Health probes flip from 503 to
3453        // 200 here; shutdown begins from this state.
3454        runtime.inner.lifecycle.mark_ready();
3455
3456        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
3457        // Low-priority background ticker that drains the cache's
3458        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
3459        // so the thread exits cleanly when the runtime drops (≤50ms
3460        // latency between drop and exit). Materialized views without
3461        // a `REFRESH EVERY` clause stay on the manual-refresh path
3462        // and are skipped by `claim_due_at`, so the loop is a no-op
3463        // when no scheduled views exist.
3464        {
3465            let weak_inner = Arc::downgrade(&runtime.inner);
3466            std::thread::Builder::new()
3467                .name("reddb-mv-scheduler".into())
3468                .spawn(move || loop {
3469                    std::thread::sleep(std::time::Duration::from_millis(50));
3470                    let Some(inner) = weak_inner.upgrade() else {
3471                        break;
3472                    };
3473                    let rt = RedDBRuntime { inner };
3474                    rt.refresh_due_materialized_views();
3475                })
3476                .ok();
3477        }
3478
3479        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
3480        // Low-priority ticker that physically reclaims rows whose
3481        // timestamp has fallen beyond the retention window. Holds a
3482        // `Weak<RuntimeInner>` so the thread exits within one tick of
3483        // the runtime drop (graceful shutdown leaves storage consistent
3484        // because each tick goes through the standard DELETE path —
3485        // there is no half-finished mutation state to clean up). The
3486        // tick interval is intentionally longer than the MV scheduler
3487        // (500ms) because retention is order-of-seconds at minimum.
3488        if !runtime.write_gate().is_read_only() {
3489            let weak_inner = Arc::downgrade(&runtime.inner);
3490            std::thread::Builder::new()
3491                .name("reddb-retention-sweeper".into())
3492                .spawn(move || loop {
3493                    std::thread::sleep(std::time::Duration::from_millis(500));
3494                    let Some(inner) = weak_inner.upgrade() else {
3495                        break;
3496                    };
3497                    let rt = RedDBRuntime { inner };
3498                    rt.sweep_retention_tick(
3499                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
3500                    );
3501                })
3502                .ok();
3503        }
3504
3505        Ok(runtime)
3506    }
3507
3508    fn rehydrate_snapshot_xid_floor(&self) {
3509        let store = self.inner.db.store();
3510        for collection in store.list_collections() {
3511            let Some(manager) = store.get_collection(&collection) else {
3512                continue;
3513            };
3514            for entity in manager.query_all(|_| true) {
3515                self.inner
3516                    .snapshot_manager
3517                    .observe_committed_xid(entity.xmin);
3518                self.inner
3519                    .snapshot_manager
3520                    .observe_committed_xid(entity.xmax);
3521            }
3522        }
3523    }
3524
3525    /// Provision an empty Table-shaped collection that backs a
3526    /// `CREATE MATERIALIZED VIEW v` (issue #594 slice 9b of #575).
3527    /// `SELECT FROM v` reads this collection directly; the rewriter is
3528    /// configured to skip materialized views so the body is no longer
3529    /// substituted. REFRESH still writes to the cache slot — wiring it
3530    /// into this backing collection is the job of slice 9c.
3531    ///
3532    /// Idempotent: re-running for the same name leaves the existing
3533    /// collection in place (mirrors `CREATE TABLE IF NOT EXISTS`
3534    /// semantics). This keeps `CREATE OR REPLACE MATERIALIZED VIEW v`
3535    /// cheap — the body change does not invalidate already-buffered
3536    /// rows. Until 9c lands the backing is always empty anyway.
3537    pub(crate) fn ensure_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3538        let store = self.inner.db.store();
3539        let mut changed = false;
3540        if store.get_collection(name).is_none() {
3541            store.get_or_create_collection(name);
3542            changed = true;
3543        }
3544        if self.inner.db.collection_contract(name).is_none() {
3545            self.inner
3546                .db
3547                .save_collection_contract(system_keyed_collection_contract(
3548                    name,
3549                    crate::catalog::CollectionModel::Table,
3550                ))
3551                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3552            changed = true;
3553        }
3554        if changed {
3555            self.inner
3556                .db
3557                .persist_metadata()
3558                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3559        }
3560        Ok(())
3561    }
3562
3563    /// Inverse of [`ensure_materialized_view_backing`] — drops the
3564    /// backing collection on `DROP MATERIALIZED VIEW v`. No-op when
3565    /// the collection was never created (e.g. a `DROP MATERIALIZED
3566    /// VIEW IF EXISTS v` against an unknown name).
3567    pub(crate) fn drop_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3568        let store = self.inner.db.store();
3569        if store.get_collection(name).is_none() {
3570            return Ok(());
3571        }
3572        store
3573            .drop_collection(name)
3574            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3575        // The contract may have been dropped already (DROP TABLE path)
3576        // — ignore "not found" errors by checking presence first.
3577        if self.inner.db.collection_contract(name).is_some() {
3578            self.inner
3579                .db
3580                .remove_collection_contract(name)
3581                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3582        }
3583        self.invalidate_result_cache();
3584        self.inner
3585            .db
3586            .persist_metadata()
3587            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3588        Ok(())
3589    }
3590
3591    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
3592        let mut changed = false;
3593        for (name, model) in [
3594            ("red.config", crate::catalog::CollectionModel::Config),
3595            ("red.vault", crate::catalog::CollectionModel::Vault),
3596            // Issue #593 — materialized-view catalog. One row per
3597            // `CREATE MATERIALIZED VIEW`; rehydrated at boot before
3598            // the API opens.
3599            (
3600                crate::runtime::continuous_materialized_view::CATALOG_COLLECTION,
3601                crate::catalog::CollectionModel::Config,
3602            ),
3603        ] {
3604            if self.inner.db.store().get_collection(name).is_none() {
3605                self.inner.db.store().get_or_create_collection(name);
3606                changed = true;
3607            }
3608            if self.inner.db.collection_contract(name).is_none() {
3609                self.inner
3610                    .db
3611                    .save_collection_contract(system_keyed_collection_contract(name, model))
3612                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
3613                changed = true;
3614            }
3615        }
3616        if changed {
3617            self.inner
3618                .db
3619                .persist_metadata()
3620                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3621        }
3622        Ok(())
3623    }
3624
3625    pub fn db(&self) -> Arc<RedDB> {
3626        Arc::clone(&self.inner.db)
3627    }
3628
3629    /// Direct access to the runtime's secondary-index store.
3630    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
3631    /// wire bulk) that need to push new rows through the per-index
3632    /// maintenance hook after `store.bulk_insert` returns.
3633    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
3634        &self.inner.index_store
3635    }
3636
3637    /// Apply a DDL event to the schema-vocabulary reverse index
3638    /// (issue #120). Called by DDL execution paths after the catalog
3639    /// mutation has succeeded so the index never holds entries for
3640    /// half-applied DDL.
3641    pub(crate) fn schema_vocabulary_apply(
3642        &self,
3643        event: crate::runtime::schema_vocabulary::DdlEvent,
3644    ) {
3645        self.inner.schema_vocabulary.write().on_ddl(event);
3646    }
3647
3648    /// Lookup `token` in the schema-vocabulary reverse index. Returns
3649    /// an owned `Vec<VocabHit>` because the underlying read lock
3650    /// cannot be borrowed across the call boundary; the slice from
3651    /// `SchemaVocabulary::lookup` is cloned per hit.
3652    pub fn schema_vocabulary_lookup(
3653        &self,
3654        token: &str,
3655    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
3656        self.inner.schema_vocabulary.read().lookup(token).to_vec()
3657    }
3658
3659    /// Inject an AuthStore into the runtime. Called by server boot
3660    /// after the vault has been bootstrapped, so that `Value::Secret`
3661    /// auto-encrypt/decrypt can reach the vault AES key.
3662    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
3663        *self.inner.auth_store.write() = Some(store);
3664    }
3665
3666    /// Snapshot the current AuthStore (if any). Used by the wire listener
3667    /// to validate bearer tokens issued via HTTP `/auth/login`.
3668    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
3669        self.inner.auth_store.read().clone()
3670    }
3671
3672    /// Read a vault KV secret from the configured AuthStore, if present.
3673    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
3674        self.inner
3675            .auth_store
3676            .read()
3677            .as_ref()
3678            .and_then(|store| store.vault_kv_get(key))
3679    }
3680
3681    /// Write a vault KV secret and fail if the encrypted vault write is
3682    /// unavailable or cannot be made durable.
3683    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
3684        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
3685            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
3686        })?;
3687        store
3688            .vault_kv_try_set(key, value)
3689            .map_err(|err| RedDBError::Query(err.to_string()))
3690    }
3691
3692    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
3693    /// wire transports try OAuth JWT validation before falling back to
3694    /// the local AuthStore lookup. Pass `None` to disable.
3695    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
3696        *self.inner.oauth_validator.write() = validator;
3697    }
3698
3699    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
3700    /// Hot path: called per HTTP request when an Authorization header
3701    /// is present, so we hand back a cheap Arc clone.
3702    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
3703        self.inner.oauth_validator.read().clone()
3704    }
3705
3706    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
3707    /// store is wired and a key has been generated. Used by the
3708    /// `Value::Secret` encrypt/decrypt pipeline.
3709    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
3710        let guard = self.inner.auth_store.read();
3711        guard.as_ref().and_then(|s| s.vault_secret_key())
3712    }
3713
3714    /// Resolve a boolean flag from `red_config`. Defaults to `default`
3715    /// when the key is missing or not coercible. If the same key has
3716    /// been written multiple times (SET CONFIG appends new rows), the
3717    /// most recent entity wins. Env-var overrides
3718    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
3719    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
3720        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3721            if let Some(crate::storage::schema::Value::Boolean(b)) =
3722                crate::runtime::config_overlay::coerce_env_value(key, raw)
3723            {
3724                return b;
3725            }
3726        }
3727        let store = self.inner.db.store();
3728        let Some(manager) = store.get_collection("red_config") else {
3729            return default;
3730        };
3731        let mut result = default;
3732        let mut latest_id: u64 = 0;
3733        manager.for_each_entity(|entity| {
3734            if let Some(row) = entity.data.as_row() {
3735                let entry_key = row.get_field("key").and_then(|v| match v {
3736                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3737                    _ => None,
3738                });
3739                if entry_key == Some(key) {
3740                    let id = entity.id.raw();
3741                    if id >= latest_id {
3742                        latest_id = id;
3743                        result = match row.get_field("value") {
3744                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
3745                            Some(crate::storage::schema::Value::Text(s)) => {
3746                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
3747                            }
3748                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
3749                            _ => default,
3750                        };
3751                    }
3752                }
3753            }
3754            true
3755        });
3756        result
3757    }
3758
3759    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
3760        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3761            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
3762                crate::runtime::config_overlay::coerce_env_value(key, raw)
3763            {
3764                return n;
3765            }
3766        }
3767        let store = self.inner.db.store();
3768        let Some(manager) = store.get_collection("red_config") else {
3769            return default;
3770        };
3771        let mut result = default;
3772        let mut latest_id: u64 = 0;
3773        manager.for_each_entity(|entity| {
3774            if let Some(row) = entity.data.as_row() {
3775                let entry_key = row.get_field("key").and_then(|v| match v {
3776                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3777                    _ => None,
3778                });
3779                if entry_key == Some(key) {
3780                    let id = entity.id.raw();
3781                    if id >= latest_id {
3782                        latest_id = id;
3783                        result = match row.get_field("value") {
3784                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
3785                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
3786                            Some(crate::storage::schema::Value::Text(s)) => {
3787                                s.parse::<u64>().unwrap_or(default)
3788                            }
3789                            _ => default,
3790                        };
3791                    }
3792                }
3793            }
3794            true
3795        });
3796        result
3797    }
3798
3799    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
3800        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3801            if let Ok(n) = raw.parse::<f64>() {
3802                return n;
3803            }
3804        }
3805        let store = self.inner.db.store();
3806        let Some(manager) = store.get_collection("red_config") else {
3807            return default;
3808        };
3809        let mut result = default;
3810        let mut latest_id: u64 = 0;
3811        manager.for_each_entity(|entity| {
3812            if let Some(row) = entity.data.as_row() {
3813                let entry_key = row.get_field("key").and_then(|v| match v {
3814                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3815                    _ => None,
3816                });
3817                if entry_key == Some(key) {
3818                    let id = entity.id.raw();
3819                    if id >= latest_id {
3820                        latest_id = id;
3821                        result = match row.get_field("value") {
3822                            Some(crate::storage::schema::Value::Float(n)) => *n,
3823                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
3824                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
3825                            Some(crate::storage::schema::Value::Text(s)) => {
3826                                s.parse::<f64>().unwrap_or(default)
3827                            }
3828                            _ => default,
3829                        };
3830                    }
3831                }
3832            }
3833            true
3834        });
3835        result
3836    }
3837
3838    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
3839        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3840            return raw.clone();
3841        }
3842        let store = self.inner.db.store();
3843        let Some(manager) = store.get_collection("red_config") else {
3844            return default.to_string();
3845        };
3846        let mut result = default.to_string();
3847        let mut latest_id: u64 = 0;
3848        manager.for_each_entity(|entity| {
3849            if let Some(row) = entity.data.as_row() {
3850                let entry_key = row.get_field("key").and_then(|v| match v {
3851                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3852                    _ => None,
3853                });
3854                if entry_key == Some(key) {
3855                    let id = entity.id.raw();
3856                    if id >= latest_id {
3857                        latest_id = id;
3858                        if let Some(crate::storage::schema::Value::Text(value)) =
3859                            row.get_field("value")
3860                        {
3861                            result = value.to_string();
3862                        }
3863                    }
3864                }
3865            }
3866            true
3867        });
3868        result
3869    }
3870
3871    fn latest_metadata_for(
3872        &self,
3873        collection: &str,
3874        entity_id: u64,
3875    ) -> Option<crate::serde_json::Value> {
3876        self.inner
3877            .db
3878            .store()
3879            .get_metadata(collection, EntityId::new(entity_id))
3880            .map(|metadata| metadata_to_json(&metadata))
3881    }
3882
3883    fn persist_replica_lsn(&self, lsn: u64) {
3884        self.inner.db.store().set_config_tree(
3885            "red.replication",
3886            &crate::json!({
3887                "last_applied_lsn": lsn
3888            }),
3889        );
3890    }
3891
3892    /// Resolve this replica's stable identity (issue #812). The primary keys
3893    /// per-replica progress off this id, so it MUST be stable across reboots
3894    /// — a changing id would make the primary treat every restart as a brand
3895    /// new replica. Honours an operator-configured `red.replication.replica_id`
3896    /// first; otherwise generates one once and persists it so the next boot
3897    /// reuses the same value.
3898    fn resolve_replica_id(&self) -> String {
3899        let configured = self.config_string("red.replication.replica_id", "");
3900        if !configured.is_empty() {
3901            return configured;
3902        }
3903        let generated = crate::crypto::uuid::Uuid::new_v4().to_string();
3904        self.inner.db.store().set_config_tree(
3905            "red.replication",
3906            &crate::json!({
3907                "replica_id": generated.clone()
3908            }),
3909        );
3910        generated
3911    }
3912
3913    fn persist_replication_health(
3914        &self,
3915        state: &str,
3916        last_error: &str,
3917        primary_lsn: Option<u64>,
3918        oldest_available_lsn: Option<u64>,
3919    ) {
3920        self.inner.db.store().set_config_tree(
3921            "red.replication",
3922            &crate::json!({
3923                "state": state,
3924                "last_error": last_error,
3925                "last_seen_primary_lsn": primary_lsn.unwrap_or(0),
3926                "last_seen_oldest_lsn": oldest_available_lsn.unwrap_or(0),
3927                "updated_at_unix_ms": SystemTime::now()
3928                    .duration_since(UNIX_EPOCH)
3929                    .unwrap_or_default()
3930                    .as_millis() as u64
3931            }),
3932        );
3933    }
3934
3935    /// Whether `SECRET('...')` literals should be encrypted with the
3936    /// vault AES key on INSERT. Default `true`.
3937    pub(crate) fn secret_auto_encrypt(&self) -> bool {
3938        self.config_bool("red.config.secret.auto_encrypt", true)
3939    }
3940
3941    /// Whether `Value::Secret` columns should be decrypted back to
3942    /// plaintext on SELECT when the vault is unsealed. Default `true`.
3943    /// Turning this off keeps secrets masked as `***` even while the
3944    /// vault is open — useful for audit trails or read-only exports.
3945    pub(crate) fn secret_auto_decrypt(&self) -> bool {
3946        self.config_bool("red.config.secret.auto_decrypt", true)
3947    }
3948
3949    /// Walk every record in `result` and swap `Value::Secret(bytes)`
3950    /// for the decrypted plaintext when the runtime has the vault
3951    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
3952    /// key is missing, the vault is sealed, or auto_decrypt is off,
3953    /// secrets are left as `Value::Secret` which every formatter
3954    /// (Display, JSON) already masks as `***`.
3955    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
3956        if !self.secret_auto_decrypt() {
3957            return;
3958        }
3959        let Some(key) = self.secret_aes_key() else {
3960            return;
3961        };
3962        for record in result.result.records.iter_mut() {
3963            for value in record.values_mut() {
3964                if let Value::Secret(ref bytes) = value {
3965                    if let Some(plain) =
3966                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
3967                    {
3968                        if let Ok(text) = String::from_utf8(plain) {
3969                            *value = Value::text(text);
3970                        }
3971                    }
3972                }
3973            }
3974        }
3975    }
3976
3977    /// Emit a CDC change event and replicate to WAL buffer.
3978    /// Create a `MutationEngine` bound to this runtime.
3979    ///
3980    /// The engine is cheap to construct (no allocation) and should be
3981    /// dropped after `apply` returns. Use this from application-layer
3982    /// `create_row` / `create_rows_batch` instead of calling
3983    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
3984    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
3985        crate::runtime::mutation::MutationEngine::new(self)
3986    }
3987
3988    /// Public-mutation gate snapshot (PLAN.md W1).
3989    ///
3990    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
3991    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
3992    /// maintenance, serverless lifecycle) call `check_write` before
3993    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
3994    /// instance running as a replica or with `options.read_only =
3995    /// true`. The replica internal logical-WAL apply path reaches into
3996    /// the store directly and never calls this method, so legitimate
3997    /// replica catch-up still works.
3998    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
3999        self.inner.write_gate.check(kind)
4000    }
4001
4002    /// Read-only handle to the gate, useful for transports that want
4003    /// to surface the policy in health/status output without taking on
4004    /// a dependency on the concrete enum.
4005    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
4006        &self.inner.write_gate
4007    }
4008
4009    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
4010    /// admin/shutdown, and signal handlers consult this single
4011    /// state machine.
4012    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
4013        &self.inner.lifecycle
4014    }
4015
4016    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
4017    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
4018        &self.inner.resource_limits
4019    }
4020
4021    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
4022    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
4023        &self.inner.audit_log
4024    }
4025
4026    /// Shared `Arc` to the audit logger — used by collaborators (the
4027    /// lease lifecycle, future request-context plumbing) that need to
4028    /// keep the logger alive past the runtime's stack frame.
4029    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
4030        Arc::clone(&self.inner.audit_log)
4031    }
4032
4033    pub(crate) fn emit_control_event(
4034        &self,
4035        kind: crate::runtime::control_events::EventKind,
4036        outcome: crate::runtime::control_events::Outcome,
4037        action: &'static str,
4038        resource: Option<String>,
4039        reason: Option<String>,
4040        extra_fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
4041    ) -> RedDBResult<()> {
4042        use crate::runtime::control_events::{
4043            ActorRef, ControlEvent, ControlEventCtx, ControlEventLedger, Sensitivity,
4044        };
4045
4046        let tenant = current_tenant();
4047        let principal = current_auth_identity();
4048        let actor_user = principal
4049            .as_ref()
4050            .map(|(principal, _)| UserId::from_parts(tenant.as_deref(), principal));
4051        let actor = actor_user
4052            .as_ref()
4053            .map(ActorRef::User)
4054            .unwrap_or(ActorRef::Anonymous);
4055        let ctx = ControlEventCtx {
4056            actor,
4057            scope: tenant
4058                .as_ref()
4059                .map(|scope| std::borrow::Cow::Borrowed(scope.as_str())),
4060            request_id: Some(std::borrow::Cow::Owned(format!(
4061                "conn-{}",
4062                current_connection_id()
4063            ))),
4064            trace_id: None,
4065        };
4066        let mut fields = std::collections::HashMap::new();
4067        fields.insert(
4068            "connection_id".to_string(),
4069            Sensitivity::raw(current_connection_id().to_string()),
4070        );
4071        if let Some((_, role)) = principal {
4072            fields.insert("actor_role".to_string(), Sensitivity::raw(role.as_str()));
4073        }
4074        for (key, value) in extra_fields {
4075            fields.insert(key, value);
4076        }
4077        let event = ControlEvent {
4078            kind,
4079            outcome,
4080            action: std::borrow::Cow::Borrowed(action),
4081            resource,
4082            reason,
4083            matched_policy_id: None,
4084            fields,
4085        };
4086        let ledger = self.inner.control_event_ledger.read();
4087        match ledger.emit(&ctx, event) {
4088            Ok(_) => Ok(()),
4089            Err(err) if self.inner.control_event_config.require_persistence() => {
4090                Err(RedDBError::Internal(err.to_string()))
4091            }
4092            Err(_) => Ok(()),
4093        }
4094    }
4095
4096    fn policy_mutation_control_ctx<'a>(
4097        &self,
4098        actor: &'a crate::auth::UserId,
4099        tenant: Option<&'a str>,
4100    ) -> crate::runtime::control_events::ControlEventCtx<'a> {
4101        crate::runtime::control_events::ControlEventCtx {
4102            actor: crate::runtime::control_events::ActorRef::User(actor),
4103            scope: tenant.map(std::borrow::Cow::Borrowed),
4104            request_id: Some(std::borrow::Cow::Owned(format!(
4105                "conn-{}",
4106                current_connection_id()
4107            ))),
4108            trace_id: None,
4109        }
4110    }
4111
4112    fn emit_query_audit(
4113        &self,
4114        query: &str,
4115        plan: &QueryAuditPlan,
4116        duration_ms: u64,
4117        result: &RuntimeQueryResult,
4118    ) {
4119        if !self.inner.query_audit.has_rules() {
4120            return;
4121        }
4122        let actor = current_auth_identity().map(|(principal, _)| principal);
4123        let tenant = current_tenant();
4124        let row_count = if result.statement_type == "select" {
4125            result.result.records.len() as u64
4126        } else {
4127            result.affected_rows
4128        };
4129        self.inner
4130            .query_audit
4131            .emit(crate::runtime::query_audit::QueryAuditEvent {
4132                actor,
4133                tenant,
4134                statement_kind: plan.statement_kind,
4135                touched_collections: plan.collections.clone(),
4136                duration_ms,
4137                row_count,
4138                request_id: Some(crate::crypto::uuid::Uuid::new_v7().to_string()),
4139                query_hash: Some(blake3::hash(query.as_bytes()).to_hex().to_string()),
4140            });
4141    }
4142
4143    /// Slice 10 of issue #527 — shared queue telemetry counters
4144    /// (delivered/acked/nacked). Cloned by `queue_delivery.rs` on
4145    /// each transition.
4146    pub(crate) fn queue_telemetry(
4147        &self,
4148    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
4149        &self.inner.queue_telemetry
4150    }
4151
4152    /// Snapshots of the queue telemetry counters in label-deterministic
4153    /// order for `/metrics` rendering and the integration test.
4154    pub fn queue_telemetry_snapshot(
4155        &self,
4156    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
4157        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
4158            delivered: self.inner.queue_telemetry.delivered_snapshot(),
4159            acked: self.inner.queue_telemetry.acked_snapshot(),
4160            nacked: self.inner.queue_telemetry.nacked_snapshot(),
4161            wait_started: self.inner.queue_telemetry.wait_started_snapshot(),
4162            wait_woken: self.inner.queue_telemetry.wait_woken_snapshot(),
4163            wait_timed_out: self.inner.queue_telemetry.wait_timed_out_snapshot(),
4164            wait_cancelled: self.inner.queue_telemetry.wait_cancelled_snapshot(),
4165            wait_duration: self.inner.queue_telemetry.wait_duration_snapshot(),
4166        }
4167    }
4168
4169    /// Issue #742 — consumer presence registry. Heartbeats land here
4170    /// from `QUEUE READ` (and, in a follow-up slice, an explicit
4171    /// `QUEUE HEARTBEAT` command); Red UI and `red.queue_consumers`
4172    /// read snapshots through `queue_consumer_presence_snapshot`.
4173    pub(crate) fn queue_presence(
4174        &self,
4175    ) -> &std::sync::Arc<crate::storage::queue::presence::ConsumerPresenceRegistry> {
4176        &self.inner.queue_presence
4177    }
4178
4179    /// Issue #742 — point-in-time presence snapshot, classifying each
4180    /// `(queue, group, consumer)` as active/stale/expired against the
4181    /// supplied TTL. Wall-clock is read once here so the lifecycle
4182    /// flags inside the snapshot are internally consistent.
4183    pub fn queue_consumer_presence_snapshot(
4184        &self,
4185        ttl_ms: u64,
4186    ) -> Vec<crate::storage::queue::presence::ConsumerPresence> {
4187        let now_ns = std::time::SystemTime::now()
4188            .duration_since(std::time::UNIX_EPOCH)
4189            .map(|d| d.as_nanos() as u64)
4190            .unwrap_or(0);
4191        self.inner.queue_presence.snapshot(now_ns, ttl_ms)
4192    }
4193
4194    /// Issue #742 — active-consumer count per `(queue, group)` for the
4195    /// queue-metadata surface. Stale/expired entries are excluded by
4196    /// definition; they are still visible in the per-row snapshot.
4197    pub fn queue_active_consumer_counts(
4198        &self,
4199        ttl_ms: u64,
4200    ) -> std::collections::HashMap<(String, String), u32> {
4201        let now_ns = std::time::SystemTime::now()
4202            .duration_since(std::time::UNIX_EPOCH)
4203            .map(|d| d.as_nanos() as u64)
4204            .unwrap_or(0);
4205        self.inner
4206            .queue_presence
4207            .count_active_by_group(now_ns, ttl_ms)
4208    }
4209
4210    /// Issue #743 — vector + TurboQuant introspection registry. Engine
4211    /// publish points (collection create, artifact build start /
4212    /// finish, fallback toggle, drop) update this; Red UI and
4213    /// `red.*` vector virtual tables read snapshots through
4214    /// `vector_introspection_snapshot` / `vector_introspection_get`.
4215    pub(crate) fn vector_introspection_registry(
4216        &self,
4217    ) -> &std::sync::Arc<crate::storage::vector::introspection::VectorIntrospectionRegistry> {
4218        &self.inner.vector_introspection
4219    }
4220
4221    /// Issue #743 — full snapshot of every tracked vector collection's
4222    /// `(VectorMetadata, ArtifactMetadata)`. Deterministically ordered
4223    /// by collection name so Red UI tables and tests both see a
4224    /// stable shape.
4225    pub fn vector_introspection_snapshot(
4226        &self,
4227    ) -> Vec<crate::storage::vector::introspection::VectorIntrospection> {
4228        self.inner.vector_introspection.snapshot()
4229    }
4230
4231    /// Issue #743 — single-collection lookup, for the per-collection
4232    /// metadata endpoint Red UI hits when an operator opens one
4233    /// vector's toolbar.
4234    pub fn vector_introspection_get(
4235        &self,
4236        collection: &str,
4237    ) -> Option<crate::storage::vector::introspection::VectorIntrospection> {
4238        self.inner.vector_introspection.get(collection)
4239    }
4240
4241    /// Slice 10 of issue #527 — render-time scan of pending entries
4242    /// per (queue, group) for the `queue_pending_gauge` exposition.
4243    /// Walks `red_queue_meta` live so the gauge cannot drift from
4244    /// the source of truth.
4245    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
4246        let store = self.inner.db.store();
4247        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
4248            .into_iter()
4249            .collect()
4250    }
4251
4252    /// Shared `Arc` to the write gate. Same rationale as
4253    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
4254    /// thread) need a clone-cheap handle they can move into a
4255    /// background thread.
4256    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
4257        Arc::clone(&self.inner.write_gate)
4258    }
4259
4260    /// Serverless writer-lease state machine. `None` when the operator
4261    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
4262    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
4263        self.inner.lease_lifecycle.get()
4264    }
4265
4266    /// Install the lease lifecycle. Idempotent; subsequent calls
4267    /// return the previously stored value untouched.
4268    pub fn set_lease_lifecycle(
4269        &self,
4270        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
4271    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
4272        self.inner.lease_lifecycle.set(lifecycle)
4273    }
4274
4275    /// Reject the call when the requested batch size exceeds
4276    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
4277    /// shaped so the HTTP layer can map it to 413 Payload Too
4278    /// Large (PLAN.md Phase 4.1).
4279    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
4280        if self.inner.resource_limits.batch_size_exceeded(requested) {
4281            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
4282            return Err(RedDBError::QuotaExceeded(format!(
4283                "max_batch_size:{requested}:{max}"
4284            )));
4285        }
4286        Ok(())
4287    }
4288
4289    /// Reject the call when the local DB file exceeds
4290    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
4291    /// the cost is a single `stat()` syscall, negligible against the
4292    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
4293    /// for HTTP 507 Insufficient Storage.
4294    pub fn check_db_size(&self) -> RedDBResult<()> {
4295        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
4296            return Ok(());
4297        };
4298        if limit == 0 {
4299            return Ok(());
4300        }
4301        let Some(path) = self.inner.db.path() else {
4302            return Ok(());
4303        };
4304        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
4305        if current > limit {
4306            return Err(RedDBError::QuotaExceeded(format!(
4307                "max_db_size_bytes:{current}:{limit}"
4308            )));
4309        }
4310        Ok(())
4311    }
4312
4313    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
4314    ///
4315    /// Steps, in order, all idempotent across re-entrant calls:
4316    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
4317    ///      observe `Stopped` after first finishes).
4318    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
4319    ///      every acked write is durable on disk.
4320    ///   3. If `backup_on_shutdown == true` and a remote backend is
4321    ///      configured, run a synchronous `trigger_backup()` so the
4322    ///      remote head reflects the final state.
4323    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
4324    ///      return the cached report without re-running anything.
4325    ///
4326    /// On any error, the runtime is still marked `Stopped` so the
4327    /// process can exit; the caller logs the error context but does
4328    /// not retry the same shutdown — the operator can inspect the
4329    /// report fields to see which step failed.
4330    pub fn graceful_shutdown(
4331        &self,
4332        backup_on_shutdown: bool,
4333    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
4334        if !self.inner.lifecycle.begin_shutdown() {
4335            // Someone else already shut down (or is in flight). Return
4336            // the cached report so the HTTP caller and SIGTERM handler
4337            // get the same idempotent answer.
4338            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
4339        }
4340
4341        let started_ms = std::time::SystemTime::now()
4342            .duration_since(std::time::UNIX_EPOCH)
4343            .map(|d| d.as_millis() as u64)
4344            .unwrap_or(0);
4345        let mut report = crate::runtime::lifecycle::ShutdownReport {
4346            started_at_ms: started_ms,
4347            ..Default::default()
4348        };
4349
4350        // Flush WAL + run any pending checkpoint. Local fsync is
4351        // unconditional — even a lease-lost replica needs its WAL on
4352        // disk before exit so a future restore has the latest tail.
4353        // The remote upload is gated separately so a lost-lease writer
4354        // doesn't clobber the new holder's state on its way out.
4355        let flush_res = self.inner.db.flush_local_only();
4356        report.flushed_wal = flush_res.is_ok();
4357        report.final_checkpoint = flush_res.is_ok();
4358        if let Err(err) = &flush_res {
4359            tracing::error!(
4360                target: "reddb::lifecycle",
4361                error = %err,
4362                "graceful_shutdown: local flush failed"
4363            );
4364        } else if let Err(lease_err) =
4365            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
4366        {
4367            tracing::warn!(
4368                target: "reddb::serverless::lease",
4369                error = %lease_err,
4370                "graceful_shutdown: remote upload skipped — lease not held"
4371            );
4372        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
4373            tracing::error!(
4374                target: "reddb::lifecycle",
4375                error = %err,
4376                "graceful_shutdown: remote upload failed"
4377            );
4378        }
4379
4380        // Optional final backup. Skipped silently when no remote
4381        // backend is configured — `trigger_backup()` returns Err
4382        // anyway in that case, but logging it as a shutdown failure
4383        // would be misleading on a standalone (no-backend) runtime.
4384        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
4385            // The trigger_backup gate now reads `WriteKind::Backup`,
4386            // which a replica/read_only instance refuses. That's
4387            // intentional — replicas don't drive backups; only the
4388            // primary does. We still want shutdown to flush its WAL
4389            // even if the backup branch is gated off.
4390            match self.trigger_backup() {
4391                Ok(result) => {
4392                    report.backup_uploaded = result.uploaded;
4393                }
4394                Err(err) => {
4395                    tracing::warn!(
4396                        target: "reddb::lifecycle",
4397                        error = %err,
4398                        "graceful_shutdown: final backup skipped"
4399                    );
4400                }
4401            }
4402        }
4403
4404        let completed_ms = std::time::SystemTime::now()
4405            .duration_since(std::time::UNIX_EPOCH)
4406            .map(|d| d.as_millis() as u64)
4407            .unwrap_or(started_ms);
4408        report.completed_at_ms = completed_ms;
4409        report.duration_ms = completed_ms.saturating_sub(started_ms);
4410
4411        self.inner.lifecycle.finish_shutdown(report.clone());
4412        Ok(report)
4413    }
4414
4415    /// Emit a CDC record without invalidating the result cache.
4416    ///
4417    /// Used by `MutationEngine::append_batch` which calls
4418    /// `invalidate_result_cache` once for the whole batch before this
4419    /// loop, avoiding N write-lock acquisitions.
4420    pub(crate) fn cdc_emit_no_cache_invalidate(
4421        &self,
4422        operation: crate::replication::cdc::ChangeOperation,
4423        collection: &str,
4424        entity_id: u64,
4425        entity_kind: &str,
4426    ) -> u64 {
4427        let lsn = self
4428            .inner
4429            .cdc
4430            .emit(operation, collection, entity_id, entity_kind);
4431
4432        // Append to logical WAL replication buffer (if primary mode)
4433        if let Some(ref primary) = self.inner.db.replication {
4434            let store = self.inner.db.store();
4435            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
4436                None
4437            } else {
4438                store.get(collection, EntityId::new(entity_id))
4439            };
4440            let record = ChangeRecord {
4441                term: self.current_replication_term(),
4442                lsn,
4443                timestamp: SystemTime::now()
4444                    .duration_since(UNIX_EPOCH)
4445                    .unwrap_or_default()
4446                    .as_millis() as u64,
4447                operation,
4448                collection: collection.to_string(),
4449                entity_id,
4450                entity_kind: entity_kind.to_string(),
4451                entity_bytes: entity
4452                    .as_ref()
4453                    .map(|e| UnifiedStore::serialize_entity(e, store.format_version())),
4454                metadata: self.latest_metadata_for(collection, entity_id),
4455                refresh_records: None,
4456            };
4457            let encoded = record.encode();
4458            primary.append_logical_record(record.lsn, encoded);
4459        }
4460        lsn
4461    }
4462
4463    pub(crate) fn cdc_emit_insert_batch_no_cache_invalidate(
4464        &self,
4465        collection: &str,
4466        ids: &[EntityId],
4467        entity_kind: &str,
4468    ) -> Vec<u64> {
4469        if ids.is_empty() {
4470            return Vec::new();
4471        }
4472
4473        // Without logical replication, CDC only needs the in-memory event
4474        // ring. Reserve all LSNs and push the batch under one mutex instead
4475        // of taking the ring lock once per inserted row.
4476        if self.inner.db.replication.is_none() {
4477            return self.inner.cdc.emit_batch_same_collection(
4478                crate::replication::cdc::ChangeOperation::Insert,
4479                collection,
4480                entity_kind,
4481                ids.iter().map(|id| id.raw()),
4482            );
4483        }
4484
4485        // Replication needs one logical-WAL record per entity with the
4486        // serialized entity bytes, so keep the existing per-row path.
4487        ids.iter()
4488            .map(|id| {
4489                self.cdc_emit_no_cache_invalidate(
4490                    crate::replication::cdc::ChangeOperation::Insert,
4491                    collection,
4492                    id.raw(),
4493                    entity_kind,
4494                )
4495            })
4496            .collect()
4497    }
4498
4499    pub fn cdc_emit(
4500        &self,
4501        operation: crate::replication::cdc::ChangeOperation,
4502        collection: &str,
4503        entity_id: u64,
4504        entity_kind: &str,
4505    ) -> u64 {
4506        let lsn = self
4507            .inner
4508            .cdc
4509            .emit(operation, collection, entity_id, entity_kind);
4510        // Perf: prior to this we called `invalidate_result_cache()`
4511        // which wipes EVERY cached query, across every table, under
4512        // a write lock — turning each INSERT into a serialisation
4513        // point for all readers. Swap to the per-table variant so
4514        // unrelated query caches survive.
4515        self.invalidate_result_cache_for_table(collection);
4516
4517        // Append to logical WAL replication buffer (if primary mode)
4518        if let Some(ref primary) = self.inner.db.replication {
4519            let store = self.inner.db.store();
4520            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
4521                None
4522            } else {
4523                store.get(collection, EntityId::new(entity_id))
4524            };
4525            let record = ChangeRecord {
4526                term: self.current_replication_term(),
4527                lsn,
4528                timestamp: SystemTime::now()
4529                    .duration_since(UNIX_EPOCH)
4530                    .unwrap_or_default()
4531                    .as_millis() as u64,
4532                operation,
4533                collection: collection.to_string(),
4534                entity_id,
4535                entity_kind: entity_kind.to_string(),
4536                entity_bytes: entity
4537                    .as_ref()
4538                    .map(|entity| UnifiedStore::serialize_entity(entity, store.format_version())),
4539                metadata: self.latest_metadata_for(collection, entity_id),
4540                refresh_records: None,
4541            };
4542            let encoded = record.encode();
4543            primary.append_logical_record(record.lsn, encoded);
4544        }
4545        lsn
4546    }
4547
4548    pub(crate) fn cdc_emit_kv(
4549        &self,
4550        operation: crate::replication::cdc::ChangeOperation,
4551        collection: &str,
4552        key: &str,
4553        entity_id: u64,
4554        before: Option<crate::json::Value>,
4555        after: Option<crate::json::Value>,
4556    ) -> u64 {
4557        let lsn = self
4558            .inner
4559            .cdc
4560            .emit_kv(operation, collection, key, entity_id, before, after);
4561        self.inner.kv_stats.incr_watch_events_emitted();
4562        self.invalidate_result_cache_for_table(collection);
4563        lsn
4564    }
4565
4566    pub(crate) fn record_kv_watch_event(
4567        &self,
4568        operation: crate::replication::cdc::ChangeOperation,
4569        collection: &str,
4570        key: &str,
4571        entity_id: u64,
4572        before: Option<crate::json::Value>,
4573        after: Option<crate::json::Value>,
4574    ) {
4575        if self.current_xid().is_some() {
4576            let conn_id = current_connection_id();
4577            let event = crate::replication::cdc::KvWatchEvent {
4578                collection: collection.to_string(),
4579                key: key.to_string(),
4580                op: operation,
4581                before,
4582                after,
4583                lsn: 0,
4584                committed_at: 0,
4585                dropped_event_count: 0,
4586            };
4587            self.inner
4588                .pending_kv_watch_events
4589                .write()
4590                .entry(conn_id)
4591                .or_default()
4592                .push(event);
4593            return;
4594        }
4595
4596        self.cdc_emit_kv(operation, collection, key, entity_id, before, after);
4597    }
4598
4599    pub(crate) fn cdc_emit_prebuilt(
4600        &self,
4601        operation: crate::replication::cdc::ChangeOperation,
4602        collection: &str,
4603        entity: &UnifiedEntity,
4604        entity_kind: &str,
4605        metadata: Option<&crate::storage::Metadata>,
4606        invalidate_cache: bool,
4607    ) -> u64 {
4608        self.cdc_emit_prebuilt_with_columns(
4609            operation,
4610            collection,
4611            entity,
4612            entity_kind,
4613            metadata,
4614            invalidate_cache,
4615            None,
4616        )
4617    }
4618
4619    /// `cdc_emit_prebuilt` plus the list of column names whose values
4620    /// changed on this update. Callers that have already computed a
4621    /// `RowDamageVector` pass it here so downstream CDC consumers can
4622    /// filter events by touched column without re-diffing.
4623    /// `changed_columns` is only meaningful for `Update` operations —
4624    /// insert and delete events ignore it.
4625    pub(crate) fn cdc_emit_prebuilt_with_columns(
4626        &self,
4627        operation: crate::replication::cdc::ChangeOperation,
4628        collection: &str,
4629        entity: &UnifiedEntity,
4630        entity_kind: &str,
4631        metadata: Option<&crate::storage::Metadata>,
4632        invalidate_cache: bool,
4633        changed_columns: Option<Vec<String>>,
4634    ) -> u64 {
4635        if invalidate_cache {
4636            self.invalidate_result_cache();
4637        }
4638
4639        let public_id = entity.logical_id().raw();
4640        let lsn = self.inner.cdc.emit_with_columns(
4641            operation,
4642            collection,
4643            public_id,
4644            entity_kind,
4645            changed_columns,
4646        );
4647
4648        if let Some(ref primary) = self.inner.db.replication {
4649            let store = self.inner.db.store();
4650            let record = ChangeRecord {
4651                term: self.current_replication_term(),
4652                lsn,
4653                timestamp: SystemTime::now()
4654                    .duration_since(UNIX_EPOCH)
4655                    .unwrap_or_default()
4656                    .as_millis() as u64,
4657                operation,
4658                collection: collection.to_string(),
4659                entity_id: entity.id.raw(),
4660                entity_kind: entity_kind.to_string(),
4661                entity_bytes: Some(UnifiedStore::serialize_entity(
4662                    entity,
4663                    store.format_version(),
4664                )),
4665                metadata: metadata
4666                    .map(metadata_to_json)
4667                    .or_else(|| self.latest_metadata_for(collection, entity.id.raw())),
4668                refresh_records: None,
4669            };
4670            let encoded = record.encode();
4671            primary.append_logical_record(record.lsn, encoded);
4672        }
4673
4674        lsn
4675    }
4676
4677    pub(crate) fn current_replication_term(&self) -> u64 {
4678        self.inner.db.options().replication.term
4679    }
4680
4681    pub(crate) fn cdc_emit_prebuilt_batch<'a, I>(
4682        &self,
4683        operation: crate::replication::cdc::ChangeOperation,
4684        entity_kind: &str,
4685        items: I,
4686        invalidate_cache: bool,
4687    ) where
4688        I: IntoIterator<
4689            Item = (
4690                &'a str,
4691                &'a UnifiedEntity,
4692                Option<&'a crate::storage::Metadata>,
4693            ),
4694        >,
4695    {
4696        let items: Vec<(&str, &UnifiedEntity, Option<&crate::storage::Metadata>)> =
4697            items.into_iter().collect();
4698        if items.is_empty() {
4699            return;
4700        }
4701
4702        if invalidate_cache {
4703            self.invalidate_result_cache();
4704        }
4705
4706        for (collection, entity, metadata) in items {
4707            self.cdc_emit_prebuilt(operation, collection, entity, entity_kind, metadata, false);
4708        }
4709    }
4710
4711    fn run_replica_loop(&self, primary_addr: String) {
4712        let endpoint = if primary_addr.starts_with("http") {
4713            primary_addr
4714        } else {
4715            format!("http://{primary_addr}")
4716        };
4717        let poll_ms = self.inner.db.options().replication.poll_interval_ms;
4718        let max_count = self.inner.db.options().replication.max_batch_size;
4719        let mut since_lsn = self.config_u64("red.replication.last_applied_lsn", 0);
4720        // Issue #812 — stable identity sent on every WAL pull so the primary
4721        // can self-register this replica and attribute pulls to it.
4722        let replica_id = self.resolve_replica_id();
4723
4724        let runtime = match tokio::runtime::Builder::new_current_thread()
4725            .enable_all()
4726            .build()
4727        {
4728            Ok(runtime) => runtime,
4729            Err(_) => return,
4730        };
4731
4732        runtime.block_on(async move {
4733            use crate::grpc::proto::red_db_client::RedDbClient;
4734            use crate::grpc::proto::JsonPayloadRequest;
4735
4736            let mut client = loop {
4737                match RedDbClient::connect(endpoint.clone()).await {
4738                    Ok(client) => {
4739                        self.persist_replication_health("connecting", "", None, None);
4740                        break client;
4741                    }
4742                    Err(_) => {
4743                        self.persist_replication_health(
4744                            "connecting",
4745                            "waiting for primary connection",
4746                            None,
4747                            None,
4748                        );
4749                        std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)))
4750                    }
4751                }
4752            };
4753
4754            // PLAN.md Phase 11.5 — stateful applier guards LSN
4755            // monotonicity across pulls. Seed with the persisted
4756            // `last_applied_lsn` so reboots don't lose the chain
4757            // pointer.
4758            let applier = crate::replication::logical::LogicalChangeApplier::with_metrics(
4759                since_lsn,
4760                self.inner.replica_apply_metrics.clone(),
4761            );
4762
4763            loop {
4764                let payload = crate::json!({
4765                    "since_lsn": since_lsn,
4766                    "max_count": max_count,
4767                    "replica_id": replica_id,
4768                    "await_data": true,
4769                    "await_timeout_ms": 30_000
4770                });
4771                let request = tonic::Request::new(JsonPayloadRequest {
4772                    payload_json: crate::json::to_string(&payload)
4773                        .unwrap_or_else(|_| "{}".to_string()),
4774                });
4775
4776                if let Ok(response) = client.pull_wal_records(request).await {
4777                    if let Ok(value) =
4778                        crate::json::from_str::<crate::json::Value>(&response.into_inner().payload)
4779                    {
4780                        let current_lsn =
4781                            value.get("current_lsn").and_then(crate::json::Value::as_u64);
4782                        let oldest_available_lsn = value
4783                            .get("oldest_available_lsn")
4784                            .and_then(crate::json::Value::as_u64);
4785                        if value
4786                            .get("needs_rebootstrap")
4787                            .and_then(crate::json::Value::as_bool)
4788                            .unwrap_or(false)
4789                        {
4790                            let reason = value
4791                                .get("invalidation_reason")
4792                                .and_then(crate::json::Value::as_str)
4793                                .unwrap_or("unknown");
4794                            self.persist_replication_health(
4795                                "rebootstrap_required",
4796                                &format!("replication slot invalidated ({reason}); re-bootstrap required"),
4797                                current_lsn,
4798                                oldest_available_lsn,
4799                            );
4800                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
4801                            continue;
4802                        }
4803                        if since_lsn > 0
4804                            && oldest_available_lsn
4805                                .map(|oldest| oldest > since_lsn.saturating_add(1))
4806                                .unwrap_or(false)
4807                        {
4808                            self.persist_replication_health(
4809                                "rebootstrap_required",
4810                                "replica is behind the oldest logical WAL available on primary; re-bootstrap required",
4811                                current_lsn,
4812                                oldest_available_lsn,
4813                            );
4814                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
4815                            continue;
4816                        }
4817                        if let Some(records) =
4818                            value.get("records").and_then(crate::json::Value::as_array)
4819                        {
4820                            let mut batch_applied_lsn = None;
4821                            let mut ack_failed = false;
4822                            for record in records {
4823                                let Some(data_hex) =
4824                                    record.get("data").and_then(crate::json::Value::as_str)
4825                                else {
4826                                    continue;
4827                                };
4828                                let Ok(data) = hex::decode(data_hex) else {
4829                                    self.inner.replica_apply_metrics.record(
4830                                        crate::replication::logical::ApplyErrorKind::Decode,
4831                                    );
4832                                    self.persist_replication_health(
4833                                        "apply_error",
4834                                        "failed to decode WAL record hex payload",
4835                                        current_lsn,
4836                                        oldest_available_lsn,
4837                                    );
4838                                    continue;
4839                                };
4840                                let Ok(change) = ChangeRecord::decode(&data) else {
4841                                    self.inner.replica_apply_metrics.record(
4842                                        crate::replication::logical::ApplyErrorKind::Decode,
4843                                    );
4844                                    self.persist_replication_health(
4845                                        "apply_error",
4846                                        "failed to decode logical WAL record",
4847                                        current_lsn,
4848                                        oldest_available_lsn,
4849                                    );
4850                                    continue;
4851                                };
4852                                match applier.apply(
4853                                    self.inner.db.as_ref(),
4854                                    &change,
4855                                    ApplyMode::Replica,
4856                                ) {
4857                                    Ok(crate::replication::logical::ApplyOutcome::Applied) => {
4858                                        self.invalidate_result_cache_for_table(&change.collection);
4859                                        since_lsn = since_lsn.max(change.lsn);
4860                                        self.persist_replica_lsn(since_lsn);
4861                                        batch_applied_lsn = Some(since_lsn);
4862                                    }
4863                                    Ok(_) => {
4864                                        // Idempotent / Skipped: no advance, no error.
4865                                    }
4866                                    Err(err) => {
4867                                        self.inner.replica_apply_metrics.record(err.kind());
4868                                        // Issue #205 — emit operator-grade event
4869                                        // for the two replication-fatal kinds. `Gap`
4870                                        // / `Apply` / `Decode` already persist via
4871                                        // `persist_replication_health`; the
4872                                        // OperatorEvent variants only cover the
4873                                        // two "stream is broken" / "follower
4874                                        // diverged" conditions an operator must act
4875                                        // on out-of-band.
4876                                        match &err {
4877                                            crate::replication::logical::LogicalApplyError::Divergence { lsn, expected: _, got: _, .. } => {
4878                                                crate::telemetry::operator_event::OperatorEvent::Divergence {
4879                                                    peer: "primary".to_string(),
4880                                                    leader_lsn: *lsn,
4881                                                    follower_lsn: since_lsn,
4882                                                }
4883                                                .emit_global();
4884                                            }
4885                                            crate::replication::logical::LogicalApplyError::Gap { last, next } => {
4886                                                crate::telemetry::operator_event::OperatorEvent::ReplicationBroken {
4887                                                    peer: "primary".to_string(),
4888                                                    reason: format!("stalled gap last={last} next={next}"),
4889                                                }
4890                                                .emit_global();
4891                                            }
4892                                            _ => {}
4893                                        }
4894                                        let kind = match &err {
4895                                            crate::replication::logical::LogicalApplyError::Gap { .. } => "stalled_gap",
4896                                            crate::replication::logical::LogicalApplyError::Divergence { .. } => "divergence",
4897                                            _ => "apply_error",
4898                                        };
4899                                        self.persist_replication_health(
4900                                            kind,
4901                                            &format!("replica apply rejected: {err}"),
4902                                            current_lsn,
4903                                            oldest_available_lsn,
4904                                        );
4905                                        // Stop applying this batch. The
4906                                        // outer loop will retry on next
4907                                        // pull, which on a real Gap will
4908                                        // not magically heal — operator
4909                                        // must rebootstrap. For
4910                                        // Divergence, we explicitly do
4911                                        // not advance; this keeps the
4912                                        // replica visibly unhealthy
4913                                        // instead of silently swallowing
4914                                        // corruption.
4915                                        break;
4916                                    }
4917                                }
4918                            }
4919                            if let Some(applied_lsn) = batch_applied_lsn {
4920                                let apply_errors = self.replica_apply_error_counts();
4921                                let apply_errors_total =
4922                                    apply_errors.iter().map(|(_, count)| *count).sum::<u64>();
4923                                let divergence_total = apply_errors
4924                                    .iter()
4925                                    .find(|(kind, _)| {
4926                                        matches!(
4927                                            kind,
4928                                            crate::replication::logical::ApplyErrorKind::Divergence
4929                                        )
4930                                    })
4931                                    .map(|(_, count)| *count)
4932                                    .unwrap_or(0);
4933                                let ack_payload = crate::json!({
4934                                    "replica_id": replica_id.clone(),
4935                                    "applied_lsn": applied_lsn,
4936                                    "durable_lsn": applied_lsn,
4937                                    "apply_errors_total": apply_errors_total,
4938                                    "divergence_total": divergence_total
4939                                });
4940                                let ack_request = tonic::Request::new(JsonPayloadRequest {
4941                                    payload_json: crate::json::to_string(&ack_payload)
4942                                        .unwrap_or_else(|_| "{}".to_string()),
4943                                });
4944                                if client.ack_replica_lsn(ack_request).await.is_err() {
4945                                    ack_failed = true;
4946                                    self.persist_replication_health(
4947                                        "ack_error",
4948                                        "primary ack_replica_lsn request failed",
4949                                        current_lsn,
4950                                        oldest_available_lsn,
4951                                    );
4952                                }
4953                            }
4954                            if ack_failed {
4955                                std::thread::sleep(std::time::Duration::from_millis(poll_ms));
4956                                continue;
4957                            }
4958                        }
4959                        self.persist_replication_health(
4960                            "healthy",
4961                            "",
4962                            current_lsn,
4963                            oldest_available_lsn,
4964                        );
4965                    } else {
4966                        self.persist_replication_health(
4967                            "apply_error",
4968                            "failed to parse pull_wal_records response",
4969                            None,
4970                            None,
4971                        );
4972                    }
4973                } else {
4974                    self.persist_replication_health(
4975                        "connecting",
4976                        "primary pull_wal_records request failed",
4977                        None,
4978                        None,
4979                    );
4980                    std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
4981                }
4982            }
4983        });
4984    }
4985
4986    /// Poll CDC events since a given LSN.
4987    pub fn cdc_poll(
4988        &self,
4989        since_lsn: u64,
4990        max_count: usize,
4991    ) -> Vec<crate::replication::cdc::ChangeEvent> {
4992        self.inner.cdc.poll(since_lsn, max_count)
4993    }
4994
4995    /// PLAN.md Phase 11.4 — current CDC LSN. Public mutation
4996    /// surfaces (HTTP query, gRPC entity ops) call this immediately
4997    /// after a successful write to feed `enforce_commit_policy`.
4998    pub fn cdc_current_lsn(&self) -> u64 {
4999        self.inner.cdc.current_lsn()
5000    }
5001
5002    pub fn kv_watch_events_since(
5003        &self,
5004        collection: &str,
5005        key: &str,
5006        since_lsn: u64,
5007        max_count: usize,
5008    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
5009        self.inner
5010            .cdc
5011            .poll(since_lsn, max_count)
5012            .into_iter()
5013            .filter_map(|event| event.kv)
5014            .filter(|event| event.collection == collection && event.key == key)
5015            .collect()
5016    }
5017
5018    pub fn kv_watch_events_since_prefix(
5019        &self,
5020        collection: &str,
5021        prefix: &str,
5022        since_lsn: u64,
5023        max_count: usize,
5024    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
5025        self.inner
5026            .cdc
5027            .poll(since_lsn, max_count)
5028            .into_iter()
5029            .filter_map(|event| event.kv)
5030            .filter(|event| event.collection == collection && event.key.starts_with(prefix))
5031            .collect()
5032    }
5033
5034    pub(crate) fn kv_watch_subscribe<'a>(
5035        &'a self,
5036        collection: impl Into<String>,
5037        key: impl Into<String>,
5038        from_lsn: Option<u64>,
5039    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
5040        crate::runtime::kv_watch::KvWatchStream::subscribe(
5041            &self.inner.cdc,
5042            &self.inner.kv_stats,
5043            collection,
5044            key,
5045            from_lsn,
5046            self.kv_watch_idle_timeout_ms(),
5047        )
5048    }
5049
5050    pub(crate) fn kv_watch_subscribe_prefix<'a>(
5051        &'a self,
5052        collection: impl Into<String>,
5053        prefix: impl Into<String>,
5054        from_lsn: Option<u64>,
5055    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
5056        crate::runtime::kv_watch::KvWatchStream::subscribe_prefix(
5057            &self.inner.cdc,
5058            &self.inner.kv_stats,
5059            collection,
5060            prefix,
5061            from_lsn,
5062            self.kv_watch_idle_timeout_ms(),
5063        )
5064    }
5065
5066    pub(crate) fn kv_watch_idle_timeout_ms(&self) -> u64 {
5067        self.config_u64("red.config.kv.watch.idle_timeout_ms", 60_000)
5068    }
5069
5070    /// Get backup scheduler status.
5071    pub fn backup_status(&self) -> crate::replication::scheduler::BackupStatus {
5072        self.inner.backup_scheduler.status()
5073    }
5074
5075    /// Borrow the runtime's result Blob Cache.
5076    ///
5077    /// Wired for the `/admin/blob_cache/sweep` and
5078    /// `/admin/blob_cache/flush_namespace` HTTP handlers (issue #148
5079    /// follow-up): both delegate to
5080    /// `crate::storage::cache::sweeper::BlobCacheSweeper`, which takes a
5081    /// `&BlobCache`. Also used by `trigger_backup` when
5082    /// `red.config.backup.include_blob_cache=true` to locate the L2
5083    /// directory for archival.
5084    pub fn result_blob_cache(&self) -> &crate::storage::cache::BlobCache {
5085        &self.inner.result_blob_cache
5086    }
5087
5088    /// PLAN.md Phase 11.4 — owned snapshot of every registered
5089    /// replica's state on this primary. Returns empty vec on
5090    /// non-primary instances or when no replicas are registered yet.
5091    pub fn primary_replica_snapshots(&self) -> Vec<crate::replication::primary::ReplicaState> {
5092        self.inner
5093            .db
5094            .replication
5095            .as_ref()
5096            .map(|repl| repl.replica_snapshots())
5097            .unwrap_or_default()
5098    }
5099
5100    /// Issue #826 — re-evaluate write-admission flow control from the
5101    /// live primary replica registry and return the resulting throttle
5102    /// state. Computes the max lag across in-quorum replicas (async
5103    /// read-replicas excluded) against the primary's current LSN and
5104    /// engages/releases the `WriteGate` throttle accordingly.
5105    ///
5106    /// No-op (returns `false`) on non-primary instances or when flow
5107    /// control is disabled (soft target `0`). Cheap enough to call on
5108    /// the replica-ack path and from `/metrics` scrapes so the throttle
5109    /// tracks lag without a dedicated background loop.
5110    pub fn refresh_replication_flow_control(&self) -> bool {
5111        let flow = self.inner.write_gate.flow_control();
5112        if !flow.is_enabled() {
5113            return false;
5114        }
5115        let Some(repl) = self.inner.db.replication.as_ref() else {
5116            return false;
5117        };
5118        let primary_lsn = repl.current_logical_lsn();
5119        let replicas = repl.replica_snapshots();
5120        flow.observe(&replicas, primary_lsn)
5121    }
5122
5123    /// PLAN.md Phase 11.4 — active commit policy. Reads
5124    /// `RED_PRIMARY_COMMIT_POLICY` once at runtime construction;
5125    /// future env reloads will need a reload endpoint. Default is
5126    /// `Local` — current behavior, no replica blocking.
5127    pub fn commit_policy(&self) -> crate::replication::CommitPolicy {
5128        crate::replication::CommitPolicy::from_env()
5129    }
5130
5131    /// PLAN.md Phase 11.5 — accessor for replica-side apply error
5132    /// counters (gap / divergence / apply / decode / apply_miss). Returned
5133    /// snapshot is consistent across the counters; the labels match
5134    /// `reddb_replica_apply_errors_total{kind}`. Issue #814 adds the
5135    /// `apply_miss` kind for deletes against a missing target.
5136    pub fn replica_apply_error_counts(
5137        &self,
5138    ) -> [(crate::replication::logical::ApplyErrorKind, u64); 5] {
5139        self.inner.replica_apply_metrics.snapshot()
5140    }
5141
5142    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
5143    /// returned; `is_configured()` lets callers short-circuit.
5144    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
5145        &self.inner.quota_bucket
5146    }
5147
5148    /// PLAN.md Phase 11.4 — observability snapshot of every
5149    /// replica's durable LSN as known to the commit waiter. Empty
5150    /// vec on non-primary instances or when no replica has acked.
5151    pub fn commit_waiter_snapshot(&self) -> Vec<(String, u64)> {
5152        self.inner
5153            .db
5154            .replication
5155            .as_ref()
5156            .map(|repl| repl.commit_waiter.snapshot())
5157            .unwrap_or_default()
5158    }
5159
5160    /// PLAN.md Phase 11.4 — `(reached, timed_out, not_required, last_micros)`
5161    /// counters for /metrics. Always-zero on non-primary instances.
5162    pub fn commit_waiter_metrics_snapshot(&self) -> (u64, u64, u64, u64) {
5163        self.inner
5164            .db
5165            .replication
5166            .as_ref()
5167            .map(|repl| repl.commit_waiter.metrics_snapshot())
5168            .unwrap_or((0, 0, 0, 0))
5169    }
5170
5171    /// Named commit watermark: highest LSN durable on the active
5172    /// synchronous commit quorum. Returns 0 when the active policy does
5173    /// not require replica durability.
5174    pub fn commit_watermark(&self) -> u64 {
5175        match self.commit_policy() {
5176            crate::replication::CommitPolicy::AckN(n) if n > 0 => self
5177                .inner
5178                .db
5179                .replication
5180                .as_ref()
5181                .map(|repl| repl.commit_waiter.commit_watermark(n))
5182                .unwrap_or(0),
5183            crate::replication::CommitPolicy::Quorum => self
5184                .inner
5185                .db
5186                .quorum
5187                .as_ref()
5188                .map(|q| q.commit_watermark())
5189                .unwrap_or(0),
5190            _ => 0,
5191        }
5192    }
5193
5194    /// PLAN.md Phase 11.4 — block until at least `count` replicas
5195    /// have durably applied through `target_lsn`, or `timeout`
5196    /// elapses. Returns the `AwaitOutcome` so the caller can decide
5197    /// whether to surface a timeout error to the client or continue
5198    /// (the policy mapping lives in the commit dispatcher).
5199    ///
5200    /// Used by the `ack_n` commit policy once the operator flips
5201    /// `RED_PRIMARY_COMMIT_POLICY` away from `local`.
5202    pub fn await_replica_acks(
5203        &self,
5204        target_lsn: u64,
5205        count: u32,
5206        timeout: std::time::Duration,
5207    ) -> crate::replication::AwaitOutcome {
5208        match &self.inner.db.replication {
5209            Some(repl) => repl.commit_waiter.await_acks(target_lsn, count, timeout),
5210            None => {
5211                // No replication configured: policy must be `Local`.
5212                // Treat as immediate `NotRequired` so callers don't
5213                // block on a degenerate setup.
5214                crate::replication::AwaitOutcome::NotRequired
5215            }
5216        }
5217    }
5218
5219    /// PLAN.md Phase 11.4 — enforce the configured commit policy
5220    /// against `post_lsn` (the LSN of the just-completed write).
5221    /// Returns `Ok(AwaitOutcome)` on every successful enforcement
5222    /// (including `Reached` and `TimedOut` when fail-on-timeout is
5223    /// off). Returns `Err(ReadOnly)` only when a synchronous policy
5224    /// misses its threshold and `RED_COMMIT_FAIL_ON_TIMEOUT=true` is
5225    /// set.
5226    ///
5227    /// The HTTP / gRPC / wire surfaces map the error to 504 / wire
5228    /// backoff. Default behaviour (env unset) logs warn and returns
5229    /// success — matches PLAN.md "default v1 stays local" semantics
5230    /// while still letting the operator opt into hard-blocking.
5231    pub fn enforce_commit_policy(
5232        &self,
5233        post_lsn: u64,
5234    ) -> RedDBResult<crate::replication::AwaitOutcome> {
5235        let policy = self.commit_policy();
5236        if matches!(policy, crate::replication::CommitPolicy::Quorum) {
5237            return match self.inner.db.wait_for_replication_quorum(post_lsn) {
5238                Ok(()) => Ok(crate::replication::AwaitOutcome::Reached(0)),
5239                Err(err) => {
5240                    tracing::warn!(
5241                        target: "reddb::commit",
5242                        post_lsn,
5243                        error = %err,
5244                        "quorum: timed out waiting for commit watermark"
5245                    );
5246                    let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
5247                        .ok()
5248                        .map(|v| {
5249                            let t = v.trim();
5250                            t.eq_ignore_ascii_case("true")
5251                                || t == "1"
5252                                || t.eq_ignore_ascii_case("yes")
5253                        })
5254                        .unwrap_or(false);
5255                    if fail {
5256                        return Err(RedDBError::ReadOnly(format!(
5257                            "commit policy timed out at lsn {post_lsn}: {err} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
5258                        )));
5259                    }
5260                    Ok(crate::replication::AwaitOutcome::TimedOut {
5261                        observed: 0,
5262                        required: 1,
5263                    })
5264                }
5265            };
5266        }
5267
5268        let n = match policy {
5269            crate::replication::CommitPolicy::AckN(n) if n > 0 => n,
5270            _ => return Ok(crate::replication::AwaitOutcome::NotRequired),
5271        };
5272        let timeout_ms = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
5273            .ok()
5274            .and_then(|v| v.parse::<u64>().ok())
5275            .unwrap_or(5_000);
5276        let outcome =
5277            self.await_replica_acks(post_lsn, n, std::time::Duration::from_millis(timeout_ms));
5278        {
5279            use crate::runtime::control_events::{EventKind, Outcome, Sensitivity};
5280            let (event_outcome, fields) = match &outcome {
5281                crate::replication::AwaitOutcome::Reached(count) => (
5282                    Outcome::Allowed,
5283                    vec![
5284                        (
5285                            "post_lsn".to_string(),
5286                            Sensitivity::raw(post_lsn.to_string()),
5287                        ),
5288                        ("required".to_string(), Sensitivity::raw(n.to_string())),
5289                        ("observed".to_string(), Sensitivity::raw(count.to_string())),
5290                        (
5291                            "timeout_ms".to_string(),
5292                            Sensitivity::raw(timeout_ms.to_string()),
5293                        ),
5294                    ],
5295                ),
5296                crate::replication::AwaitOutcome::TimedOut { observed, required } => (
5297                    Outcome::Error,
5298                    vec![
5299                        (
5300                            "post_lsn".to_string(),
5301                            Sensitivity::raw(post_lsn.to_string()),
5302                        ),
5303                        (
5304                            "required".to_string(),
5305                            Sensitivity::raw(required.to_string()),
5306                        ),
5307                        (
5308                            "observed".to_string(),
5309                            Sensitivity::raw(observed.to_string()),
5310                        ),
5311                        (
5312                            "timeout_ms".to_string(),
5313                            Sensitivity::raw(timeout_ms.to_string()),
5314                        ),
5315                    ],
5316                ),
5317                crate::replication::AwaitOutcome::NotRequired => (Outcome::Allowed, Vec::new()),
5318            };
5319            if !fields.is_empty() {
5320                self.emit_control_event(
5321                    EventKind::ReplicationSafety,
5322                    event_outcome,
5323                    "replication_commit_policy",
5324                    Some(format!("replication:lsn:{post_lsn}")),
5325                    None,
5326                    fields,
5327                )?;
5328            }
5329        }
5330        if let crate::replication::AwaitOutcome::TimedOut { observed, required } = &outcome {
5331            tracing::warn!(
5332                target: "reddb::commit",
5333                post_lsn,
5334                observed = *observed,
5335                required = *required,
5336                timeout_ms,
5337                "ack_n: timed out waiting for replicas"
5338            );
5339            let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
5340                .ok()
5341                .map(|v| {
5342                    let t = v.trim();
5343                    t.eq_ignore_ascii_case("true") || t == "1" || t.eq_ignore_ascii_case("yes")
5344                })
5345                .unwrap_or(false);
5346            if fail {
5347                return Err(RedDBError::ReadOnly(format!(
5348                    "commit policy timed out at lsn {post_lsn}: observed={observed} required={required} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
5349                )));
5350            }
5351        }
5352        Ok(outcome)
5353    }
5354
5355    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
5356    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
5357    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
5358    /// when the operator set the env but it doesn't parse, and
5359    /// `("disabled", None)` when no key is configured. The pager
5360    /// hookup is deferred — this accessor surfaces the operator's
5361    /// intent for /admin/status without yet using the key in writes.
5362    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
5363        match crate::crypto::page_encryption::key_from_env() {
5364            Ok(Some(_)) => ("enabled", None),
5365            Ok(None) => ("disabled", None),
5366            Err(err) => ("error", Some(err)),
5367        }
5368    }
5369
5370    /// PLAN.md Phase 11.5 — current replica apply health label
5371    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
5372    /// `stalled_gap`). Read from the persisted `red.replication.state`
5373    /// config key updated by the replica loop. Returns `None` on
5374    /// non-replica instances or when no apply has run yet.
5375    pub fn replica_apply_health(&self) -> Option<String> {
5376        let state = self.config_string("red.replication.state", "");
5377        if state.is_empty() {
5378            None
5379        } else {
5380            Some(state)
5381        }
5382    }
5383
5384    /// Current local LSN paired with the LSN of the most recently
5385    /// archived WAL segment. The difference is the replication /
5386    /// archive lag operators alert on (PLAN.md Phase 5.1). Returns
5387    /// `(0, 0)` when neither replication nor archiving is configured.
5388    pub fn wal_archive_progress(&self) -> (u64, u64) {
5389        let current_lsn = self
5390            .inner
5391            .db
5392            .replication
5393            .as_ref()
5394            .map(|repl| {
5395                repl.logical_wal_spool
5396                    .as_ref()
5397                    .map(|spool| spool.current_lsn())
5398                    .unwrap_or_else(|| repl.wal_buffer.current_lsn())
5399            })
5400            .unwrap_or_else(|| self.inner.cdc.current_lsn());
5401        let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
5402        (current_lsn, last_archived_lsn)
5403    }
5404
5405    /// Trigger an immediate backup.
5406    pub fn trigger_backup(&self) -> RedDBResult<crate::replication::scheduler::BackupResult> {
5407        let result = (|| {
5408            self.check_write(crate::runtime::write_gate::WriteKind::Backup)?;
5409            // Defense in depth — check_write above already rejects when
5410            // the lease is NotHeld, but log + audit the lease angle here
5411            // explicitly so dashboards distinguish "lease lost" from a
5412            // generic read-only refusal.
5413            self.assert_remote_write_allowed("admin/backup")?;
5414            let started = std::time::Instant::now();
5415            let snapshot = self.create_snapshot()?;
5416            let mut uploaded = false;
5417
5418            if let (Some(backend), Some(path)) =
5419                (&self.inner.db.remote_backend, self.inner.db.path())
5420            {
5421                let default_snapshot_prefix = self.inner.db.options().default_snapshot_prefix();
5422                let default_wal_prefix = self.inner.db.options().default_wal_archive_prefix();
5423                let default_head_key = self.inner.db.options().default_backup_head_key();
5424                let snapshot_prefix = self.config_string(
5425                    "red.config.backup.snapshot_prefix",
5426                    &default_snapshot_prefix,
5427                );
5428                let wal_prefix =
5429                    self.config_string("red.config.wal.archive.prefix", &default_wal_prefix);
5430                let head_key = self.config_string("red.config.backup.head_key", &default_head_key);
5431                let timeline_id = self.config_string("red.config.timeline.id", "main");
5432                let snapshot_key = crate::storage::wal::archive_snapshot(
5433                    backend.as_ref(),
5434                    path,
5435                    snapshot.snapshot_id,
5436                    &snapshot_prefix,
5437                )
5438                .map_err(|err| RedDBError::Internal(err.to_string()))?;
5439                let current_lsn = self
5440                    .inner
5441                    .db
5442                    .replication
5443                    .as_ref()
5444                    .map(|repl| {
5445                        repl.logical_wal_spool
5446                            .as_ref()
5447                            .map(|spool| spool.current_lsn())
5448                            .unwrap_or_else(|| repl.wal_buffer.current_lsn())
5449                    })
5450                    .unwrap_or_else(|| self.inner.cdc.current_lsn());
5451                let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
5452                // Hash the local snapshot bytes so the manifest can carry
5453                // the digest for restore-side verification (PLAN.md
5454                // Phase 4). Failure to hash is non-fatal — we still
5455                // publish the manifest, just without a checksum, so a
5456                // future fix can backfill rather than losing the backup.
5457                let snapshot_sha256 =
5458                    crate::storage::wal::SnapshotManifest::compute_snapshot_sha256(path)
5459                        .map_err(|err| {
5460                            tracing::warn!(
5461                                target: "reddb::backup",
5462                                error = %err,
5463                                snapshot_id = snapshot.snapshot_id,
5464                                "snapshot hash failed; manifest will lack checksum"
5465                            );
5466                        })
5467                        .ok();
5468                let manifest = crate::storage::wal::SnapshotManifest {
5469                    timeline_id: timeline_id.clone(),
5470                    snapshot_key: snapshot_key.clone(),
5471                    snapshot_id: snapshot.snapshot_id,
5472                    snapshot_time: snapshot.created_at_unix_ms as u64,
5473                    base_lsn: current_lsn,
5474                    schema_version: crate::api::REDDB_FORMAT_VERSION,
5475                    format_version: crate::api::REDDB_FORMAT_VERSION,
5476                    snapshot_sha256,
5477                };
5478                crate::storage::wal::publish_snapshot_manifest(backend.as_ref(), &manifest)
5479                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
5480
5481                // PLAN.md Phase 11.3 — read the head of the WAL hash chain
5482                // so the new segment can link back. `None` means we're
5483                // starting a fresh timeline (after a clean restore or on
5484                // first archive ever); the segment's `prev_hash` will be
5485                // `None` and restore-side validation accepts that only for
5486                // the first segment in `plan.wal_segments`.
5487                let prev_segment_hash =
5488                    self.config_string("red.config.timeline.last_segment_hash", "");
5489                let prev_hash_arg = if prev_segment_hash.is_empty() {
5490                    None
5491                } else {
5492                    Some(prev_segment_hash)
5493                };
5494
5495                let archived_lsn = if let Some(primary) = &self.inner.db.replication {
5496                    let oldest = primary
5497                        .logical_wal_spool
5498                        .as_ref()
5499                        .and_then(|spool| spool.oldest_lsn().ok().flatten())
5500                        .or_else(|| primary.wal_buffer.oldest_lsn())
5501                        .unwrap_or(last_archived_lsn);
5502                    if last_archived_lsn > 0 && last_archived_lsn < oldest.saturating_sub(1) {
5503                        return Err(RedDBError::Internal(format!(
5504                        "logical WAL gap detected: last_archived_lsn={last_archived_lsn}, oldest_available_lsn={oldest}"
5505                    )));
5506                    }
5507                    let records = if let Some(spool) = &primary.logical_wal_spool {
5508                        spool
5509                            .read_since(last_archived_lsn, usize::MAX)
5510                            .map_err(|err| RedDBError::Internal(err.to_string()))?
5511                    } else {
5512                        primary.wal_buffer.read_since(last_archived_lsn, usize::MAX)
5513                    };
5514                    if let Some(meta) = crate::storage::wal::archive_change_records(
5515                        backend.as_ref(),
5516                        &wal_prefix,
5517                        &records,
5518                        prev_hash_arg,
5519                    )
5520                    .map_err(|err| RedDBError::Internal(err.to_string()))?
5521                    {
5522                        let _ = primary.prune_retained_wal_through(meta.lsn_end);
5523                        // Advance the chain head so the next archive call
5524                        // links to this segment's hash. If the segment has
5525                        // no sha256 (legacy / hashing failed) we leave the
5526                        // head as-is — the next segment then carries the
5527                        // prior chain head, preserving continuity.
5528                        if let Some(sha) = &meta.sha256 {
5529                            self.inner.db.store().set_config_tree(
5530                                "red.config.timeline",
5531                                &crate::json!({ "last_segment_hash": sha }),
5532                            );
5533                        }
5534                        meta.lsn_end
5535                    } else {
5536                        last_archived_lsn
5537                    }
5538                } else {
5539                    last_archived_lsn
5540                };
5541
5542                let head = crate::storage::wal::BackupHead {
5543                    timeline_id,
5544                    snapshot_key,
5545                    snapshot_id: snapshot.snapshot_id,
5546                    snapshot_time: snapshot.created_at_unix_ms as u64,
5547                    current_lsn,
5548                    last_archived_lsn: archived_lsn,
5549                    wal_prefix,
5550                };
5551                crate::storage::wal::publish_backup_head(backend.as_ref(), &head_key, &head)
5552                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
5553                self.inner.db.store().set_config_tree(
5554                    "red.config.timeline",
5555                    &crate::json!({
5556                        "last_archived_lsn": archived_lsn,
5557                        "id": head.timeline_id
5558                    }),
5559                );
5560
5561                // PLAN.md Phase 2.4 — refresh the unified `MANIFEST.json`
5562                // at the prefix root so external tooling sees a single
5563                // catalog of every snapshot + WAL segment with their
5564                // checksums. Best-effort: a manifest publish failure
5565                // doesn't fail the backup (the per-artifact sidecars
5566                // already give restore-side integrity), but it does log
5567                // so dashboards can flag stale catalogs.
5568                if let Err(err) = crate::storage::wal::publish_unified_manifest_for_prefix(
5569                    backend.as_ref(),
5570                    &snapshot_prefix,
5571                ) {
5572                    tracing::warn!(
5573                        target: "reddb::backup",
5574                        error = %err,
5575                        snapshot_prefix = %snapshot_prefix,
5576                        "unified MANIFEST.json refresh failed; per-artifact sidecars unaffected"
5577                    );
5578                }
5579
5580                // PLAN.md Phase 11.4 — when the operator picked a
5581                // commit policy that demands replica durability, block
5582                // until the configured count of replicas has acked the
5583                // archived LSN (or the timeout fires). For backup the
5584                // policy decides the *DR posture* — `local` returns
5585                // immediately, `ack_n` ensures at least N replicas saw
5586                // the new tail before we report success to the
5587                // operator. A `TimedOut` is logged but does NOT fail
5588                // the backup: the local WAL + remote upload are durable
5589                // regardless; the missing acks are reported via
5590                // /metrics and /admin/status so the operator can decide.
5591                match self.commit_policy() {
5592                    crate::replication::CommitPolicy::AckN(n) if n > 0 => {
5593                        let timeout = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
5594                            .ok()
5595                            .and_then(|v| v.parse::<u64>().ok())
5596                            .unwrap_or(5_000);
5597                        let outcome = self.await_replica_acks(
5598                            archived_lsn,
5599                            n,
5600                            std::time::Duration::from_millis(timeout),
5601                        );
5602                        match outcome {
5603                            crate::replication::AwaitOutcome::Reached(count) => {
5604                                tracing::debug!(
5605                                    target: "reddb::backup",
5606                                    archived_lsn,
5607                                    n,
5608                                    count,
5609                                    "ack_n: replicas synced before backup return"
5610                                );
5611                            }
5612                            crate::replication::AwaitOutcome::TimedOut { observed, required } => {
5613                                tracing::warn!(
5614                                    target: "reddb::backup",
5615                                    archived_lsn,
5616                                    observed,
5617                                    required,
5618                                    timeout_ms = timeout,
5619                                    "ack_n: timed out waiting for replicas; backup uploaded but DR posture degraded"
5620                                );
5621                            }
5622                            crate::replication::AwaitOutcome::NotRequired => {}
5623                        }
5624                    }
5625                    _ => {} // Local / RemoteWal / Quorum: no blocking yet
5626                }
5627
5628                // Issue #148 follow-up — opt-in archive of the L2 Blob Cache
5629                // directory tree. Default off so a standard backup stays
5630                // small; flip via `red.config.backup.include_blob_cache=true`
5631                // when warm-cache restore is required (per
5632                // docs/operations/blob-cache-backup-restore.md §1).
5633                //
5634                // The L2 tree is *derived* state (ADR 0006) — its absence
5635                // never causes data loss; it only affects post-restore
5636                // p99 latency until the cache re-warms. We therefore log
5637                // (not fail) on per-file upload errors so a partial L2
5638                // upload never aborts a healthy snapshot+WAL backup.
5639                if self.config_bool("red.config.backup.include_blob_cache", false) {
5640                    let blob_cache_prefix = self.config_string(
5641                        "red.config.backup.blob_cache_prefix",
5642                        &format!("{snapshot_prefix}blob_cache/"),
5643                    );
5644                    if let Some(l2_path) = self.inner.result_blob_cache.l2_path() {
5645                        match crate::storage::cache::archive_blob_cache_l2(
5646                            backend.as_ref(),
5647                            l2_path,
5648                            &blob_cache_prefix,
5649                        ) {
5650                            Ok(count) => {
5651                                tracing::info!(
5652                                    target: "reddb::backup",
5653                                    files_uploaded = count,
5654                                    blob_cache_prefix = %blob_cache_prefix,
5655                                    "include_blob_cache: archived L2 directory"
5656                                );
5657                            }
5658                            Err(err) => {
5659                                tracing::warn!(
5660                                    target: "reddb::backup",
5661                                    error = %err,
5662                                    blob_cache_prefix = %blob_cache_prefix,
5663                                    "include_blob_cache: L2 archive failed; backup proceeding (cache is derived state)"
5664                                );
5665                            }
5666                        }
5667                    } else {
5668                        tracing::debug!(
5669                            target: "reddb::backup",
5670                            "include_blob_cache=true but no L2 path configured; nothing to archive"
5671                        );
5672                    }
5673                }
5674
5675                uploaded = true;
5676            }
5677
5678            Ok(crate::replication::scheduler::BackupResult {
5679                snapshot_id: snapshot.snapshot_id,
5680                uploaded,
5681                duration_ms: started.elapsed().as_millis() as u64,
5682                timestamp: snapshot.created_at_unix_ms as u64,
5683            })
5684        })();
5685
5686        use crate::runtime::control_events::{EventKind, Outcome, Sensitivity};
5687        let (current_lsn, last_archived_lsn) = self.wal_archive_progress();
5688        let mut fields = vec![
5689            (
5690                "current_lsn".to_string(),
5691                Sensitivity::raw(current_lsn.to_string()),
5692            ),
5693            (
5694                "last_archived_lsn".to_string(),
5695                Sensitivity::raw(last_archived_lsn.to_string()),
5696            ),
5697        ];
5698        if let Ok(backup) = &result {
5699            fields.push((
5700                "snapshot_id".to_string(),
5701                Sensitivity::raw(backup.snapshot_id.to_string()),
5702            ));
5703            fields.push((
5704                "uploaded".to_string(),
5705                Sensitivity::raw(backup.uploaded.to_string()),
5706            ));
5707            fields.push((
5708                "duration_ms".to_string(),
5709                Sensitivity::raw(backup.duration_ms.to_string()),
5710            ));
5711            fields.push((
5712                "snapshot_time".to_string(),
5713                Sensitivity::raw(backup.timestamp.to_string()),
5714            ));
5715        }
5716        let outcome = match &result {
5717            Ok(_) => Outcome::Allowed,
5718            Err(err) => control_event_outcome_for_error(err),
5719        };
5720        let reason = result.as_ref().err().map(|err| err.to_string());
5721        self.emit_control_event(
5722            EventKind::BackupRun,
5723            outcome,
5724            "backup_trigger",
5725            Some("backup:trigger".to_string()),
5726            reason,
5727            fields,
5728        )?;
5729        result
5730    }
5731
5732    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
5733        let mut pool = self
5734            .inner
5735            .pool
5736            .lock()
5737            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
5738        if pool.active >= self.inner.pool_config.max_connections {
5739            return Err(RedDBError::Internal(
5740                "connection pool exhausted".to_string(),
5741            ));
5742        }
5743
5744        let id = if let Some(id) = pool.idle.pop() {
5745            id
5746        } else {
5747            let id = pool.next_id;
5748            pool.next_id += 1;
5749            id
5750        };
5751        pool.active += 1;
5752        pool.total_checkouts += 1;
5753        drop(pool);
5754
5755        Ok(RuntimeConnection {
5756            id,
5757            inner: Arc::clone(&self.inner),
5758        })
5759    }
5760
5761    pub fn checkpoint(&self) -> RedDBResult<()> {
5762        // Local fsync always allowed — losing the lease shouldn't
5763        // prevent us from durably persisting what's already in memory.
5764        // The remote upload is the side-effect that risks clobbering a
5765        // peer's state, so it's behind the lease gate.
5766        self.inner.db.flush_local_only().map_err(|err| {
5767            // Issue #205 — local flush failure is a CheckpointFailed
5768            // operator-grade event. The local-flush path also covers
5769            // the WAL fsync we depend on, so a failure here doubles as
5770            // the WalFsyncFailed signal for the runtime entry point.
5771            let msg = err.to_string();
5772            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
5773                lsn: 0,
5774                error: msg.clone(),
5775            }
5776            .emit_global();
5777            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
5778                path: "<flush_local_only>".to_string(),
5779                error: msg.clone(),
5780            }
5781            .emit_global();
5782            RedDBError::Engine(msg)
5783        })?;
5784        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
5785            tracing::warn!(
5786                target: "reddb::serverless::lease",
5787                error = %err,
5788                "checkpoint: skipping remote upload — lease not held"
5789            );
5790            return Ok(());
5791        }
5792        self.inner
5793            .db
5794            .upload_to_remote_backend()
5795            .map_err(|err| RedDBError::Engine(err.to_string()))
5796    }
5797
5798    /// Guard remote-mutating operations on the writer lease.
5799    /// Returns `Ok(())` when no remote backend is configured (the
5800    /// lease is irrelevant) or the lease state is `NotRequired` /
5801    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
5802    /// `NotHeld`, with an audit-friendly action label so the caller
5803    /// can record the rejection.
5804    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
5805        if self.inner.db.remote_backend.is_none() {
5806            return Ok(());
5807        }
5808        match self.inner.write_gate.lease_state() {
5809            crate::runtime::write_gate::LeaseGateState::NotHeld => {
5810                self.inner.audit_log.record(
5811                    action,
5812                    "system",
5813                    "remote_backend",
5814                    "err: writer lease not held",
5815                    crate::json::Value::Null,
5816                );
5817                Err(RedDBError::ReadOnly(format!(
5818                    "writer lease not held — {action} blocked (serverless fence)"
5819                )))
5820            }
5821            _ => Ok(()),
5822        }
5823    }
5824
5825    pub fn run_maintenance(&self) -> RedDBResult<()> {
5826        self.inner
5827            .db
5828            .run_maintenance()
5829            .map_err(|err| RedDBError::Internal(err.to_string()))
5830    }
5831
5832    pub fn scan_collection(
5833        &self,
5834        collection: &str,
5835        cursor: Option<ScanCursor>,
5836        limit: usize,
5837    ) -> RedDBResult<ScanPage> {
5838        let store = self.inner.db.store();
5839        let manager = store
5840            .get_collection(collection)
5841            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
5842
5843        let mut entities = manager.query_all(|_| true);
5844        entities.sort_by_key(|entity| entity.id.raw());
5845
5846        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
5847        let total = entities.len();
5848        let end = total.min(offset.saturating_add(limit.max(1)));
5849        let items = if offset >= total {
5850            Vec::new()
5851        } else {
5852            entities[offset..end].to_vec()
5853        };
5854        let next = (end < total).then_some(ScanCursor { offset: end });
5855
5856        Ok(ScanPage {
5857            collection: collection.to_string(),
5858            items,
5859            next,
5860            total,
5861        })
5862    }
5863
5864    pub fn catalog(&self) -> CatalogModelSnapshot {
5865        self.inner.db.catalog_model_snapshot()
5866    }
5867
5868    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
5869        self.inner.db.catalog_consistency_report()
5870    }
5871
5872    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
5873        crate::catalog::attention_summary(&self.catalog())
5874    }
5875
5876    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
5877        crate::catalog::collection_attention(&self.catalog())
5878    }
5879
5880    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
5881        crate::catalog::index_attention(&self.catalog())
5882    }
5883
5884    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
5885        crate::catalog::graph_projection_attention(&self.catalog())
5886    }
5887
5888    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
5889        crate::catalog::analytics_job_attention(&self.catalog())
5890    }
5891
5892    pub fn stats(&self) -> RuntimeStats {
5893        let pool = runtime_pool_lock(self);
5894        RuntimeStats {
5895            active_connections: pool.active,
5896            idle_connections: pool.idle.len(),
5897            total_checkouts: pool.total_checkouts,
5898            paged_mode: self.inner.db.is_paged(),
5899            started_at_unix_ms: self.inner.started_at_unix_ms,
5900            store: self.inner.db.stats(),
5901            system: SystemInfo::collect(),
5902            result_blob_cache: self.inner.result_blob_cache.stats(),
5903            kv: self.inner.kv_stats.snapshot(),
5904            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
5905        }
5906    }
5907
5908    pub(crate) fn record_metrics_ingest(
5909        &self,
5910        accepted_samples: u64,
5911        accepted_series: u64,
5912        rejected_samples: u64,
5913        rejected_series: u64,
5914    ) {
5915        self.inner.metrics_ingest_stats.record(
5916            accepted_samples,
5917            accepted_series,
5918            rejected_samples,
5919            rejected_series,
5920        );
5921    }
5922
5923    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
5924        self.inner
5925            .metrics_ingest_stats
5926            .record_cardinality_budget_rejections(rejected_series);
5927    }
5928
5929    pub(crate) fn record_metrics_tenant_activity(
5930        &self,
5931        tenant: &str,
5932        namespace: &str,
5933        operation: &str,
5934    ) {
5935        self.inner
5936            .metrics_tenant_activity_stats
5937            .record(tenant, namespace, operation);
5938    }
5939
5940    pub(crate) fn metrics_tenant_activity_snapshot(
5941        &self,
5942    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
5943        self.inner.metrics_tenant_activity_stats.snapshot()
5944    }
5945
5946    /// Execute a query under a typed scope override without embedding
5947    /// the tenant / user / role values into the SQL string. Use this
5948    /// from transport middleware (HTTP / gRPC / worker loops) where the
5949    /// scope is resolved from auth claims and the SQL is a parameterised
5950    /// template — avoids the string-concat injection risk of building
5951    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
5952    /// prepared statements that didn't know about tenancy.
5953    ///
5954    /// Precedence matches the `WITHIN` clause: the passed `scope`
5955    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
5956    /// The override is pushed on the thread-local scope stack for the
5957    /// duration of the call and popped on return — pool-shared
5958    /// connections cannot leak it across requests.
5959    pub fn execute_query_with_scope(
5960        &self,
5961        query: &str,
5962        scope: crate::runtime::within_clause::ScopeOverride,
5963    ) -> RedDBResult<RuntimeQueryResult> {
5964        if scope.is_empty() {
5965            return self.execute_query(query);
5966        }
5967        let _scope_guard = ScopeOverrideGuard::install(scope);
5968        self.execute_query(query)
5969    }
5970
5971    /// Issue #205 — single lifecycle exit for slow-query logging.
5972    ///
5973    /// `execute_query_inner` does the real work; this wrapper times it
5974    /// and, if elapsed exceeds the configured threshold, hands the
5975    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
5976    /// SlowQueryLogger. The threshold + sample_pct were captured at
5977    /// SlowQueryLogger construction (runtime startup), so the per-call
5978    /// cost on below-threshold paths is one relaxed atomic load.
5979    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
5980        let started = std::time::Instant::now();
5981        let mut result = self.execute_query_inner(query);
5982        // Issue #765 / S6 — filter integrity-tombstoned rows out of SELECT
5983        // results before they reach any consumer. Fast no-op (one relaxed
5984        // atomic load) unless an input-stream digest mismatch has tombstoned
5985        // a RID range on this store.
5986        if let Ok(ref mut query_result) = result {
5987            if query_result.statement_type == "select" {
5988                self.filter_integrity_tombstoned(&mut query_result.result);
5989            }
5990        }
5991        let elapsed_ms = started.elapsed().as_millis() as u64;
5992
5993        // Build EffectiveScope from the same thread-locals frame-build
5994        // consults — keeps the slow-log row consistent with the audit /
5995        // RLS view of "this statement". `ai_scope()` is the canonical
5996        // builder.
5997        let scope = self.ai_scope();
5998        let kind = match result
5999            .as_ref()
6000            .map(|r| r.statement_type)
6001            .unwrap_or("select")
6002        {
6003            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
6004            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
6005            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
6006            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
6007            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
6008        };
6009        // SQL redaction: pass the raw query through. The slow-query
6010        // logger writes structured JSON so embedded literals stay
6011        // escape-safe at the JSON boundary (proven by
6012        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
6013        // PII redaction (e.g. literal masking) is a follow-up.
6014        self.inner
6015            .slow_query_logger
6016            .record(kind, elapsed_ms, query.to_string(), &scope);
6017
6018        if let Ok(ref mut query_result) = result {
6019            if matches!(query_result.statement_type, "insert" | "update" | "delete") {
6020                let bookmark = crate::replication::CausalBookmark::new(
6021                    self.current_replication_term(),
6022                    self.cdc_current_lsn(),
6023                );
6024                query_result.bookmark = Some(bookmark.encode());
6025            }
6026        }
6027
6028        result
6029    }
6030
6031    pub fn causal_session(&self) -> crate::runtime::CausalSession {
6032        crate::runtime::CausalSession {
6033            runtime: self.clone(),
6034            bookmark: None,
6035            wait_timeout: std::time::Duration::from_secs(5),
6036        }
6037    }
6038
6039    pub fn wait_for_bookmark(
6040        &self,
6041        bookmark: &crate::replication::CausalBookmark,
6042        timeout: std::time::Duration,
6043    ) -> RedDBResult<()> {
6044        let deadline = std::time::Instant::now() + timeout;
6045        loop {
6046            let applied_lsn = self.local_contiguous_applied_lsn();
6047            if applied_lsn >= bookmark.commit_lsn() {
6048                return Ok(());
6049            }
6050            let now = std::time::Instant::now();
6051            if now >= deadline {
6052                return Err(RedDBError::InvalidOperation(format!(
6053                    "timed out waiting for causal bookmark lsn {}; applied={}",
6054                    bookmark.commit_lsn(),
6055                    applied_lsn
6056                )));
6057            }
6058            let remaining = deadline.saturating_duration_since(now);
6059            std::thread::sleep(remaining.min(std::time::Duration::from_millis(5)));
6060        }
6061    }
6062
6063    fn local_contiguous_applied_lsn(&self) -> u64 {
6064        match self.inner.db.options().replication.role {
6065            crate::replication::ReplicationRole::Replica { .. } => {
6066                self.config_u64("red.replication.last_applied_lsn", 0)
6067            }
6068            _ => self.cdc_current_lsn(),
6069        }
6070    }
6071
6072    #[inline(never)]
6073    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
6074        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
6075        //
6076        // Moved above every boot-cost the normal path pays (WITHIN
6077        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
6078        // guard, tracing span, tx_contexts read) because the bench's
6079        // `select_point` scenario was observed at 28× vs PostgreSQL —
6080        // the dominant cost wasn't the entity fetch but the ceremony
6081        // before it. Only fires when there's no ambient transaction
6082        // context or WITHIN override, so the snapshot install we skip
6083        // truly is a no-op for this query.
6084        if !has_scope_override_active()
6085            && !query.trim_start().starts_with("WITHIN")
6086            && !query.trim_start().starts_with("within")
6087            && !self.inner.query_audit.has_rules()
6088            && !self
6089                .inner
6090                .tx_contexts
6091                .read()
6092                .contains_key(&current_connection_id())
6093        {
6094            if let Some(result) = self.try_fast_entity_lookup(query) {
6095                return result;
6096            }
6097        }
6098
6099        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
6100        // strip the prefix, push a stack-scoped override, recurse on
6101        // the inner statement, pop on return. Stack lives in a
6102        // thread-local but is balanced by the RAII guard, so a
6103        // pool-shared connection cannot leak the override across
6104        // requests and an early `?` return still pops cleanly.
6105        match crate::runtime::within_clause::try_strip_within_prefix(query) {
6106            Ok(Some((scope, inner))) => {
6107                let _scope_guard = ScopeOverrideGuard::install(scope);
6108                // Re-enter the inner path, NOT `execute_query`, so the
6109                // slow-query lifecycle hook records exactly one row per
6110                // top-level statement (the WITHIN-stripped form would
6111                // double-record).
6112                return self.execute_query_inner(inner);
6113            }
6114            Ok(None) => {}
6115            Err(msg) => return Err(RedDBError::Query(msg)),
6116        }
6117
6118        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
6119        // inner statement (WITHOUT executing it) and returns the
6120        // CanonicalLogicalNode tree as rows so the caller can see the
6121        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
6122        // is a distinct schema-diff command and continues down the
6123        // regular SQL path.
6124        if let Some(inner) = strip_explain_prefix(query) {
6125            return self.explain_as_rows(query, inner);
6126        }
6127
6128        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
6129        // override and return. Outside a transaction the statement is
6130        // an error (matches PG semantics: SET LOCAL only takes effect
6131        // within an active transaction).
6132        if let Some(value) = parse_set_local_tenant(query)? {
6133            let conn_id = current_connection_id();
6134            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
6135                return Err(RedDBError::Query(
6136                    "SET LOCAL TENANT requires an active transaction".to_string(),
6137                ));
6138            }
6139            self.inner
6140                .tx_local_tenants
6141                .write()
6142                .insert(conn_id, value.clone());
6143            return Ok(RuntimeQueryResult::ok_message(
6144                query.to_string(),
6145                &match &value {
6146                    Some(id) => format!("local tenant set: {id}"),
6147                    None => "local tenant cleared".to_string(),
6148                },
6149                "set_local_tenant",
6150            ));
6151        }
6152
6153        if super::red_schema::is_system_schema_write(query) {
6154            return Err(RedDBError::Query(
6155                super::red_schema::READ_ONLY_ERROR.to_string(),
6156            ));
6157        }
6158
6159        if let Some(create_source) = super::analytics_source_catalog::parse_create_statement(query)?
6160        {
6161            return self.execute_create_analytics_source(query, create_source);
6162        }
6163
6164        // Issue #790 — `READ METRIC <path>` is intentionally rejected at
6165        // v0. The descriptor itself is readable through
6166        // `red.analytics.metrics`; the *output* read returns a
6167        // structured error so callers can tell "execution engine not yet
6168        // built" apart from "metric does not exist".
6169        if let Some(path) = super::metric_descriptor_catalog::parse_read_metric_statement(query) {
6170            return Err(super::metric_descriptor_catalog::read_output_unsupported(
6171                &path,
6172            ));
6173        }
6174
6175        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
6176        let execution_query = rewritten_query.as_deref().unwrap_or(query);
6177
6178        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
6179        let _frame_guards = frame.install(self);
6180
6181        // Phase 6 logging: enter a span stamped with conn_id / tenant
6182        // / query_len. Every downstream tracing::info!/warn!/error!
6183        // inherits these fields — no need to thread them manually
6184        // through storage/scan layers. Entered AFTER the WITHIN /
6185        // SET LOCAL TENANT resolution above so the span reflects the
6186        // effective scope for this statement.
6187        let _log_span = crate::telemetry::span::query_span(query).entered();
6188
6189        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
6190        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
6191            return self.execute_query_expr(rewritten);
6192        }
6193
6194        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
6195        if !self.inner.query_audit.has_rules() {
6196            if let Some(result) = self.try_fast_entity_lookup(execution_query) {
6197                return result;
6198            }
6199        }
6200
6201        // ── Result cache: return cached result if still fresh (30s TTL) ──
6202        if !self.inner.query_audit.has_rules() {
6203            if let Some(result) = frame.read_result_cache(self) {
6204                return Ok(result);
6205            }
6206        }
6207
6208        let prepared = frame.prepare_statement(self, execution_query)?;
6209        let mode = prepared.mode;
6210        let expr = prepared.expr;
6211
6212        let statement = query_expr_name(&expr);
6213        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
6214        let control_event_specs = query_control_event_specs(&expr);
6215        let query_audit_plan = query_audit_plan(&expr);
6216
6217        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
6218            Ok(guard) => guard,
6219            Err(err) => {
6220                let outcome = control_event_outcome_for_error(&err);
6221                for spec in &control_event_specs {
6222                    self.emit_control_event(
6223                        spec.kind,
6224                        outcome,
6225                        spec.action,
6226                        spec.resource.clone(),
6227                        Some(err.to_string()),
6228                        spec.fields.clone(),
6229                    )?;
6230                }
6231                return Err(err);
6232            }
6233        };
6234        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
6235        let query_audit_started = std::time::Instant::now();
6236
6237        let query_result = match expr {
6238            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
6239                // Apply MVCC visibility + RLS gate while materialising the
6240                // graph: every node entity is screened against the source
6241                // collection's policy chain (basic and `Nodes`-targeted)
6242                // and dropped when the caller's tenant / role doesn't
6243                // admit it. Edges are pruned automatically because the
6244                // graph builder skips edges whose endpoints aren't in
6245                // `allowed_nodes`.
6246                let (graph, node_properties, edge_properties) =
6247                    self.materialize_graph_with_rls()?;
6248                let result =
6249                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
6250                        &graph,
6251                        &expr,
6252                        node_properties,
6253                        edge_properties,
6254                    )
6255                        .map_err(|err| RedDBError::Query(err.to_string()))?;
6256
6257                Ok(RuntimeQueryResult {
6258                    query: query.to_string(),
6259                    mode,
6260                    statement,
6261                    engine: "materialized-graph",
6262                    result,
6263                    affected_rows: 0,
6264                    statement_type: "select",
6265                    bookmark: None,
6266                })
6267            }
6268            QueryExpr::Table(table) => {
6269                let table = self.resolve_table_expr_subqueries(
6270                    table,
6271                    &frame as &dyn super::statement_frame::ReadFrame,
6272                )?;
6273                // Table-valued functions (e.g. components(g)) dispatch to a
6274                // read-only executor before any catalog/virtual-table routing
6275                // (issue #795).
6276                if let Some(TableSource::Function {
6277                    name,
6278                    args,
6279                    named_args,
6280                }) = table.source.clone()
6281                {
6282                    // The graph-collection form is cacheable (issue #802): the
6283                    // result-cache read at the top of this function keys on the
6284                    // query string, and `result_cache_scopes` carries the graph
6285                    // collection (see `collect_table_source_scopes`) so a write
6286                    // to it invalidates the entry. Deterministic algorithm
6287                    // output is worth caching at any row count, so the write
6288                    // bypasses the generic ≤5-row payload heuristic.
6289                    let tvf_result = RuntimeQueryResult {
6290                        query: query.to_string(),
6291                        mode,
6292                        statement,
6293                        engine: "runtime-graph-tvf",
6294                        result: self.execute_table_function(&name, &args, &named_args)?,
6295                        affected_rows: 0,
6296                        statement_type: "select",
6297                        bookmark: None,
6298                    };
6299                    frame.write_result_cache(self, &tvf_result, result_cache_scopes.clone());
6300                    return Ok(tvf_result);
6301                }
                // Inline-graph TVF (issue #799): the graph is supplied by two
                // subqueries instead of a collection reference. Like the
                // graph-collection form above, the result is cacheable — its
                // cache key is the query string (the result-cache read earlier
                // in `execute_query_inner` keys on it) and `result_cache_scopes`
                // already carries the `nodes`/`edges` source collections, so a
                // write to any of them invalidates the entry.
6309                if let Some(TableSource::InlineGraphFunction {
6310                    name,
6311                    nodes,
6312                    edges,
6313                    named_args,
6314                }) = table.source.clone()
6315                {
6316                    let inline_result = RuntimeQueryResult {
6317                        query: query.to_string(),
6318                        mode,
6319                        statement,
6320                        engine: "runtime-graph-tvf-inline",
6321                        result: self.execute_inline_graph_function(
6322                            &name,
6323                            &nodes,
6324                            &edges,
6325                            &named_args,
6326                        )?,
6327                        affected_rows: 0,
6328                        statement_type: "select",
6329                        bookmark: None,
6330                    };
6331                    frame.write_result_cache(self, &inline_result, result_cache_scopes);
6332                    return Ok(inline_result);
6333                }
6334                if super::red_schema::is_virtual_table(&table.table) {
6335                    return Ok(RuntimeQueryResult {
6336                        query: query.to_string(),
6337                        mode,
6338                        statement,
6339                        engine: "runtime-red-schema",
6340                        result: super::red_schema::red_query(
6341                            self,
6342                            &table.table,
6343                            &table,
6344                            &frame as &dyn super::statement_frame::ReadFrame,
6345                        )?,
6346                        affected_rows: 0,
6347                        statement_type: "select",
6348                        bookmark: None,
6349                    });
6350                }
6351
6352                // `<graph>.<output>` analytics virtual view (issue #800).
6353                // Recomputed on demand — intentionally not result-cached, so it
6354                // always reflects the current graph data.
6355                if let Some(view_result) = self.try_resolve_analytics_view(
6356                    &table,
6357                    &frame as &dyn super::statement_frame::ReadFrame,
6358                )? {
6359                    return Ok(RuntimeQueryResult {
6360                        query: query.to_string(),
6361                        mode,
6362                        statement,
6363                        engine: "runtime-graph-analytics-view",
6364                        result: view_result,
6365                        affected_rows: 0,
6366                        statement_type: "select",
6367                        bookmark: None,
6368                    });
6369                }
6370
6371                if let Some(result) = self.execute_probabilistic_select(&table)? {
6372                    return Ok(RuntimeQueryResult {
6373                        query: query.to_string(),
6374                        mode,
6375                        statement,
6376                        engine: "runtime-probabilistic",
6377                        result,
6378                        affected_rows: 0,
6379                        statement_type: "select",
6380                        bookmark: None,
6381                    });
6382                }
6383
6384                // Foreign-table intercept (Phase 3.2.2 PG parity).
6385                //
6386                // When the referenced table matches a `CREATE FOREIGN TABLE`
6387                // registration, short-circuit into the FDW scan. Phase 3.2
6388                // wrappers don't yet support pushdown, so filters/projections
6389                // apply post-scan via `apply_foreign_table_filters` — good
6390                // enough for correctness; perf work lands in 3.2.3.
6391                if self.inner.foreign_tables.is_foreign_table(&table.table) {
6392                    let records = self
6393                        .inner
6394                        .foreign_tables
6395                        .scan(&table.table)
6396                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
6397                    let result = apply_foreign_table_filters(records, &table);
6398                    return Ok(RuntimeQueryResult {
6399                        query: query.to_string(),
6400                        mode,
6401                        statement,
6402                        engine: "runtime-fdw",
6403                        result,
6404                        affected_rows: 0,
6405                        statement_type: "select",
6406                        bookmark: None,
6407                    });
6408                }
6409
6410                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
6411                //
6412                // When RLS is enabled on this table, fetch every policy
6413                // that applies to the current (role, SELECT) pair and
6414                // fold them into the query's WHERE clause: policies
6415                // OR-combine (any of them admitting the row is enough),
6416                // then AND into the caller's existing filter.
6417                //
6418                // Anonymous callers (no thread-local identity) pass
6419                // `role = None`; policies with a specific `TO role`
6420                // clause skip, but `TO PUBLIC` policies still apply.
6421                //
6422                // When `authorize_relational_table_select` returns `None` the table has
6423                // RLS enabled but no policy admits the caller's role —
6424                // short-circuit with an empty result set instead of
6425                // synthesising a contradiction filter.
6426                let Some(table_with_rls) = self.authorize_relational_table_select(
6427                    table,
6428                    &frame as &dyn super::statement_frame::ReadFrame,
6429                )?
6430                else {
6431                    let empty = crate::storage::query::unified::UnifiedResult::empty();
6432                    return Ok(RuntimeQueryResult {
6433                        query: query.to_string(),
6434                        mode,
6435                        statement,
6436                        engine: "runtime-table-rls",
6437                        result: empty,
6438                        affected_rows: 0,
6439                        statement_type: "select",
6440                        bookmark: None,
6441                    });
6442                };
6443                Ok(RuntimeQueryResult {
6444                    query: query.to_string(),
6445                    mode,
6446                    statement,
6447                    engine: "runtime-table",
6448                    // #885: lend the frame-owned row-buffer arena to the
6449                    // streaming path so chunk buffers are reused across
6450                    // this statement's chunk-fetches instead of allocated
6451                    // fresh per chunk. This is the table-query dispatch
6452                    // that runs under a `StatementExecutionFrame`; the
6453                    // frameless prepared/subquery paths keep `None`.
6454                    result: execute_runtime_table_query_in(
6455                        &self.inner.db,
6456                        &table_with_rls,
6457                        Some(&self.inner.index_store),
6458                        Some(frame.row_arena()),
6459                    )?,
6460                    affected_rows: 0,
6461                    statement_type: "select",
6462                    bookmark: None,
6463                })
6464            }
6465            QueryExpr::Join(join) => {
6466                // Fold per-table RLS filters into each `QueryExpr::Table`
6467                // leaf of the join tree before executing. Without this
6468                // the join executor scans both tables raw and ignores
6469                // policies — a `WITHIN TENANT 'x'` against a join of
6470                // two tenant-scoped tables would leak cross-tenant rows.
6471                // When any leaf has RLS enabled and zero matching policy,
6472                // short-circuit to an empty join result instead of
6473                // emitting a contradiction filter.
6474                let join_with_rls = match self.authorize_relational_join_select(
6475                    join,
6476                    &frame as &dyn super::statement_frame::ReadFrame,
6477                )? {
6478                    Some(j) => j,
6479                    None => {
6480                        return Ok(RuntimeQueryResult {
6481                            query: query.to_string(),
6482                            mode,
6483                            statement,
6484                            engine: "runtime-join-rls",
6485                            result: crate::storage::query::unified::UnifiedResult::empty(),
6486                            affected_rows: 0,
6487                            statement_type: "select",
6488                            bookmark: None,
6489                        });
6490                    }
6491                };
6492                Ok(RuntimeQueryResult {
6493                    query: query.to_string(),
6494                    mode,
6495                    statement,
6496                    engine: "runtime-join",
6497                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
6498                    affected_rows: 0,
6499                    statement_type: "select",
6500                    bookmark: None,
6501                })
6502            }
6503            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
6504                query: query.to_string(),
6505                mode,
6506                statement,
6507                engine: "runtime-vector",
6508                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
6509                affected_rows: 0,
6510                statement_type: "select",
6511                bookmark: None,
6512            }),
6513            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
6514                query: query.to_string(),
6515                mode,
6516                statement,
6517                engine: "runtime-hybrid",
6518                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
6519                affected_rows: 0,
6520                statement_type: "select",
6521                bookmark: None,
6522            }),
6523            // DML execution
6524            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
6525                Err(RedDBError::Query(
6526                    super::red_schema::READ_ONLY_ERROR.to_string(),
6527                ))
6528            }
6529            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
6530                Err(RedDBError::Query(
6531                    super::red_schema::READ_ONLY_ERROR.to_string(),
6532                ))
6533            }
6534            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
6535                Err(RedDBError::Query(
6536                    super::red_schema::READ_ONLY_ERROR.to_string(),
6537                ))
6538            }
6539            QueryExpr::Insert(ref insert) => self
6540                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
6541                    self.execute_insert(query, insert)
6542                }),
6543            QueryExpr::Update(ref update) => self
6544                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
6545                    self.execute_update(query, update)
6546                }),
6547            QueryExpr::Delete(ref delete) => self
6548                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
6549                    self.execute_delete(query, delete)
6550                }),
6551            // DDL execution
6552            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
6553            QueryExpr::CreateCollection(ref create) => {
6554                self.execute_create_collection(query, create)
6555            }
6556            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
6557            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
6558            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
6559            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
6560            QueryExpr::DropDocument(ref drop_document) => {
6561                self.execute_drop_document(query, drop_document)
6562            }
6563            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
6564            QueryExpr::DropCollection(ref drop_collection) => {
6565                self.execute_drop_collection(query, drop_collection)
6566            }
6567            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
6568            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
6569            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
6570            // Graph analytics commands
6571            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
6572            // Search commands
6573            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
6574            // ASK: RAG query with LLM synthesis
6575            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
6576            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
6577            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
6578            QueryExpr::ProbabilisticCommand(ref cmd) => {
6579                self.execute_probabilistic_command(query, cmd)
6580            }
6581            // Time-series DDL
6582            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
6583            QueryExpr::CreateMetric(ref metric) => self.execute_create_metric(query, metric),
6584            QueryExpr::AlterMetric(ref alter) => self.execute_alter_metric(query, alter),
6585            QueryExpr::CreateSlo(ref slo) => self.execute_create_slo(query, slo),
6586            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
6587            // Queue DDL and commands
6588            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
6589            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
6590            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
6591            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
6592            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
6593            QueryExpr::EventsBackfill(ref backfill) => {
6594                self.execute_events_backfill(query, backfill)
6595            }
6596            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
6597                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
6598            ))),
6599            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
6600            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
6601            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
6602            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
6603            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
6604            // SET CONFIG key = value
6605            QueryExpr::SetConfig { ref key, ref value } => {
6606                if key.starts_with("red.secret.") {
6607                    return Err(RedDBError::Query(
6608                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
6609                    ));
6610                }
6611                match self.check_managed_config_write_for_set_config(key) {
6612                    Err(err) => Err(err),
6613                    Ok(()) => {
6614                        let store = self.inner.db.store();
6615                        let json_val = match value {
6616                            Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
6617                            Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
6618                            Value::Float(n) => crate::serde_json::Value::Number(*n),
6619                            Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
6620                            _ => crate::serde_json::Value::String(value.to_string()),
6621                        };
6622                        store.set_config_tree(key, &json_val);
6623                        update_current_config_value(key, value.clone());
6624                        // Config changes can flip runtime behavior mid-session
6625                        // (auto_decrypt, auto_encrypt, etc.) — invalidate the
6626                        // result cache so subsequent reads re-execute against
6627                        // the new config.
6628                        self.invalidate_result_cache();
6629                        Ok(RuntimeQueryResult::ok_message(
6630                            query.to_string(),
6631                            &format!("config set: {key}"),
6632                            "set",
6633                        ))
6634                    }
6635                }
6636            }
6637            // SET SECRET key = value
6638            QueryExpr::SetSecret { ref key, ref value } => {
6639                if key.starts_with("red.config.") {
6640                    return Err(RedDBError::Query(
6641                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
6642                    ));
6643                }
6644                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
6645                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
6646                })?;
6647                if matches!(value, Value::Null) {
6648                    auth_store
6649                        .vault_kv_try_delete(key)
6650                        .map_err(|err| RedDBError::Query(err.to_string()))?;
6651                    update_current_secret_value(key, None);
6652                    self.invalidate_result_cache();
6653                    return Ok(RuntimeQueryResult::ok_message(
6654                        query.to_string(),
6655                        &format!("secret deleted: {key}"),
6656                        "delete_secret",
6657                    ));
6658                }
6659                let value = secret_sql_value_to_string(value)?;
6660                auth_store
6661                    .vault_kv_try_set(key.clone(), value.clone())
6662                    .map_err(|err| RedDBError::Query(err.to_string()))?;
6663                update_current_secret_value(key, Some(value));
6664                self.invalidate_result_cache();
6665                Ok(RuntimeQueryResult::ok_message(
6666                    query.to_string(),
6667                    &format!("secret set: {key}"),
6668                    "set_secret",
6669                ))
6670            }
6671            // DELETE SECRET key
6672            QueryExpr::DeleteSecret { ref key } => {
6673                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
6674                    RedDBError::Query(
6675                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
6676                    )
6677                })?;
6678                let deleted = auth_store
6679                    .vault_kv_try_delete(key)
6680                    .map_err(|err| RedDBError::Query(err.to_string()))?;
6681                if deleted {
6682                    update_current_secret_value(key, None);
6683                }
6684                self.invalidate_result_cache();
6685                Ok(RuntimeQueryResult::ok_message(
6686                    query.to_string(),
6687                    &format!("secret deleted: {key}"),
6688                    if deleted {
6689                        "delete_secret"
6690                    } else {
6691                        "delete_secret_not_found"
6692                    },
6693                ))
6694            }
6695            // SHOW SECRET[S] [prefix]
6696            QueryExpr::ShowSecrets { ref prefix } => {
6697                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
6698                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
6699                })?;
6700                if !auth_store.is_vault_backed() {
6701                    return Err(RedDBError::Query(
6702                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
6703                    ));
6704                }
6705                let mut keys = auth_store.vault_kv_keys();
6706                keys.sort();
6707                let mut result = UnifiedResult::with_columns(vec![
6708                    "key".into(),
6709                    "value".into(),
6710                    "status".into(),
6711                ]);
6712                for key in keys {
6713                    if let Some(ref pfx) = prefix {
6714                        if !key.starts_with(pfx) {
6715                            continue;
6716                        }
6717                    }
6718                    let mut record = UnifiedRecord::new();
6719                    record.set("key", Value::text(key));
6720                    record.set("value", Value::text("***"));
6721                    record.set("status", Value::text("active"));
6722                    result.push(record);
6723                }
6724                Ok(RuntimeQueryResult {
6725                    query: query.to_string(),
6726                    mode,
6727                    statement: "show_secrets",
6728                    engine: "runtime-secret",
6729                    result,
6730                    affected_rows: 0,
6731                    statement_type: "select",
6732                    bookmark: None,
6733                })
6734            }
6735            // SHOW CONFIG [prefix]
6736            QueryExpr::ShowConfig { ref prefix } => {
6737                let store = self.inner.db.store();
6738                let all_collections = store.list_collections();
6739                if !all_collections.contains(&"red_config".to_string()) {
6740                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
6741                    return Ok(RuntimeQueryResult {
6742                        query: query.to_string(),
6743                        mode,
6744                        statement: "show_config",
6745                        engine: "runtime-config",
6746                        result,
6747                        affected_rows: 0,
6748                        statement_type: "select",
6749                        bookmark: None,
6750                    });
6751                }
6752                let manager = store
6753                    .get_collection("red_config")
6754                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
6755                let entities = manager.query_all(|_| true);
6756                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
6757                for entity in entities {
6758                    if let EntityData::Row(ref row) = entity.data {
6759                        if let Some(ref named) = row.named {
6760                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
6761                            let val = named.get("value").cloned().unwrap_or(Value::Null);
6762                            let key_str = match &key_val {
6763                                Value::Text(s) => s.as_ref(),
6764                                _ => continue,
6765                            };
6766                            if let Some(ref pfx) = prefix {
6767                                if !key_str.starts_with(pfx.as_str()) {
6768                                    continue;
6769                                }
6770                            }
6771                            let entity_id = entity.id.raw();
6772                            match latest.get(key_str) {
6773                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
6774                                _ => {
6775                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
6776                                }
6777                            }
6778                        }
6779                    }
6780                }
6781                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
6782                for (_, key_val, val) in latest.into_values() {
6783                    let mut record = UnifiedRecord::new();
6784                    record.set("key", key_val);
6785                    record.set("value", val);
6786                    result.push(record);
6787                }
6788                Ok(RuntimeQueryResult {
6789                    query: query.to_string(),
6790                    mode,
6791                    statement: "show_config",
6792                    engine: "runtime-config",
6793                    result,
6794                    affected_rows: 0,
6795                    statement_type: "select",
6796                    bookmark: None,
6797                })
6798            }
6799            // Session-local multi-tenancy handle (Phase 2.5.3).
6800            //
6801            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
6802            // the thread-local; SHOW TENANT returns it. Paired with the
6803            // CURRENT_TENANT() scalar for use in RLS policies.
6804            QueryExpr::SetTenant(ref value) => {
6805                match value {
6806                    Some(id) => set_current_tenant(id.clone()),
6807                    None => clear_current_tenant(),
6808                }
6809                Ok(RuntimeQueryResult::ok_message(
6810                    query.to_string(),
6811                    &match value {
6812                        Some(id) => format!("tenant set: {id}"),
6813                        None => "tenant cleared".to_string(),
6814                    },
6815                    "set_tenant",
6816                ))
6817            }
6818            QueryExpr::ShowTenant => {
6819                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
6820                let mut record = UnifiedRecord::new();
6821                record.set(
6822                    "tenant",
6823                    current_tenant().map(Value::text).unwrap_or(Value::Null),
6824                );
6825                result.push(record);
6826                Ok(RuntimeQueryResult {
6827                    query: query.to_string(),
6828                    mode,
6829                    statement: "show_tenant",
6830                    engine: "runtime-tenant",
6831                    result,
6832                    affected_rows: 0,
6833                    statement_type: "select",
6834                    bookmark: None,
6835                })
6836            }
6837            // Transaction control (Phase 2.3 PG parity).
6838            //
6839            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
6840            // the current connection's id. COMMIT/ROLLBACK release it through
6841            // the `SnapshotManager` so future snapshots see the correct set of
6842            // active/aborted transactions.
6843            //
6844            // Tuple stamping (xmin/xmax) and read-path visibility filtering
6845            // land in Phase 2.3.2 — this dispatch only manages the snapshot
6846            // registry. Statements running outside a TxnContext still behave
6847            // as autocommit (xid=0 → visible to every snapshot).
6848            QueryExpr::TransactionControl(ref ctl) => {
6849                use crate::storage::query::ast::TxnControl;
6850                use crate::storage::transaction::snapshot::{TxnContext, Xid};
6851                use crate::storage::transaction::IsolationLevel;
6852
6853                // Phase 2.3 keys transactions by a thread-local connection id.
6854                // The stdio/gRPC paths wire a real per-connection id later;
6855                // for embedded use (one RedDBRuntime per process-ish caller)
6856                // we fall back to a deterministic placeholder.
6857                let conn_id = current_connection_id();
6858
6859                let (kind, msg) = match ctl {
6860                    TxnControl::Begin => {
6861                        let mgr = Arc::clone(&self.inner.snapshot_manager);
6862                        let xid = mgr.begin();
6863                        let snapshot = mgr.snapshot(xid);
6864                        let ctx = TxnContext {
6865                            xid,
6866                            isolation: IsolationLevel::SnapshotIsolation,
6867                            snapshot,
6868                            savepoints: Vec::new(),
6869                            released_sub_xids: Vec::new(),
6870                        };
6871                        self.inner.tx_contexts.write().insert(conn_id, ctx);
6872                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
6873                    }
6874                    TxnControl::Commit => {
6875                        // SET LOCAL TENANT ends with the transaction.
6876                        self.inner.tx_local_tenants.write().remove(&conn_id);
6877                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
6878                        match ctx {
6879                            Some(ctx) => {
6880                                let mut own_xids = std::collections::HashSet::new();
6881                                own_xids.insert(ctx.xid);
6882                                for (_, sub) in &ctx.savepoints {
6883                                    own_xids.insert(*sub);
6884                                }
6885                                for sub in &ctx.released_sub_xids {
6886                                    own_xids.insert(*sub);
6887                                }
6888                                if let Err(err) = self.check_table_row_write_conflicts(
6889                                    conn_id,
6890                                    &ctx.snapshot,
6891                                    &own_xids,
6892                                ) {
6893                                    for (_, sub) in &ctx.savepoints {
6894                                        self.inner.snapshot_manager.rollback(*sub);
6895                                    }
6896                                    for sub in &ctx.released_sub_xids {
6897                                        self.inner.snapshot_manager.rollback(*sub);
6898                                    }
6899                                    self.inner.snapshot_manager.rollback(ctx.xid);
6900                                    self.revive_pending_versioned_updates(conn_id);
6901                                    self.revive_pending_tombstones(conn_id);
6902                                    self.discard_pending_kv_watch_events(conn_id);
6903                                    self.discard_pending_queue_wakes(conn_id);
6904                                    self.discard_pending_store_wal_actions(conn_id);
6905                                    return Err(err);
6906                                }
6907                                self.restore_pending_write_stamps(conn_id);
6908                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
6909                                    for (_, sub) in &ctx.savepoints {
6910                                        self.inner.snapshot_manager.rollback(*sub);
6911                                    }
6912                                    for sub in &ctx.released_sub_xids {
6913                                        self.inner.snapshot_manager.rollback(*sub);
6914                                    }
6915                                    self.inner.snapshot_manager.rollback(ctx.xid);
6916                                    self.revive_pending_versioned_updates(conn_id);
6917                                    self.revive_pending_tombstones(conn_id);
6918                                    self.discard_pending_kv_watch_events(conn_id);
6919                                    return Err(err);
6920                                }
6921                                // Phase 2.3.2e: commit every open sub-xid
6922                                // so they also become visible. Their
6923                                // work is promoted to the parent txn's
6924                                // result exactly like a RELEASE would
6925                                // have done.
6926                                for (_, sub) in &ctx.savepoints {
6927                                    self.inner.snapshot_manager.commit(*sub);
6928                                }
6929                                for sub in &ctx.released_sub_xids {
6930                                    self.inner.snapshot_manager.commit(*sub);
6931                                }
6932                                self.inner.snapshot_manager.commit(ctx.xid);
6933                                self.finalize_pending_versioned_updates(conn_id);
6934                                self.finalize_pending_tombstones(conn_id);
6935                                self.finalize_pending_kv_watch_events(conn_id);
6936                                self.finalize_pending_queue_wakes(conn_id);
6937                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
6938                            }
6939                            None => (
6940                                "commit",
6941                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
6942                            ),
6943                        }
6944                    }
6945                    TxnControl::Rollback => {
6946                        self.inner.tx_local_tenants.write().remove(&conn_id);
6947                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
6948                        match ctx {
6949                            Some(ctx) => {
6950                                // Phase 2.3.2e: abort every open sub-xid
6951                                // too so their writes stay hidden.
6952                                for (_, sub) in &ctx.savepoints {
6953                                    self.inner.snapshot_manager.rollback(*sub);
6954                                }
6955                                for sub in &ctx.released_sub_xids {
6956                                    self.inner.snapshot_manager.rollback(*sub);
6957                                }
6958                                self.inner.snapshot_manager.rollback(ctx.xid);
6959                                // Phase 2.3.2b: tuples that the txn had
6960                                // xmax-stamped become live again — wipe xmax
6961                                // back to 0 so later snapshots see them.
6962                                self.revive_pending_versioned_updates(conn_id);
6963                                self.revive_pending_tombstones(conn_id);
6964                                self.discard_pending_kv_watch_events(conn_id);
6965                                self.discard_pending_queue_wakes(conn_id);
6966                                self.discard_pending_store_wal_actions(conn_id);
6967                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
6968                            }
6969                            None => (
6970                                "rollback",
6971                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
6972                            ),
6973                        }
6974                    }
6975                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
6976                    // SAVEPOINT allocates a fresh xid and pushes it
6977                    // onto the per-txn stack so subsequent writes can
6978                    // be selectively rolled back. RELEASE pops without
6979                    // aborting; ROLLBACK TO aborts the sub-xid (and
6980                    // any nested ones) + revives their tombstones.
6981                    TxnControl::Savepoint(name) => {
6982                        let mgr = Arc::clone(&self.inner.snapshot_manager);
6983                        let mut guard = self.inner.tx_contexts.write();
6984                        match guard.get_mut(&conn_id) {
6985                            Some(ctx) => {
6986                                let sub = mgr.begin();
6987                                ctx.savepoints.push((name.clone(), sub));
6988                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
6989                            }
6990                            None => (
6991                                "savepoint",
6992                                "SAVEPOINT outside transaction — no-op".to_string(),
6993                            ),
6994                        }
6995                    }
6996                    TxnControl::ReleaseSavepoint(name) => {
6997                        let mut guard = self.inner.tx_contexts.write();
6998                        match guard.get_mut(&conn_id) {
6999                            Some(ctx) => {
7000                                let pos = ctx
7001                                    .savepoints
7002                                    .iter()
7003                                    .position(|(n, _)| n == name)
7004                                    .ok_or_else(|| {
7005                                        RedDBError::Internal(format!(
7006                                            "savepoint {name} does not exist"
7007                                        ))
7008                                    })?;
7009                                // RELEASE pops the named savepoint and
7010                                // any nested ones. Their sub-xids move
7011                                // to `released_sub_xids` so they commit
7012                                // (or roll back) alongside the parent
7013                                // xid — PG semantics: released
7014                                // savepoints still contribute their
7015                                // work, but their names are gone.
7016                                let released = ctx.savepoints.len() - pos;
7017                                let popped: Vec<Xid> = ctx
7018                                    .savepoints
7019                                    .split_off(pos)
7020                                    .into_iter()
7021                                    .map(|(_, x)| x)
7022                                    .collect();
7023                                ctx.released_sub_xids.extend(popped);
7024                                (
7025                                    "release_savepoint",
7026                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
7027                                )
7028                            }
7029                            None => (
7030                                "release_savepoint",
7031                                "RELEASE outside transaction — no-op".to_string(),
7032                            ),
7033                        }
7034                    }
7035                    TxnControl::RollbackToSavepoint(name) => {
7036                        let mgr = Arc::clone(&self.inner.snapshot_manager);
7037                        // Splice out the savepoint + nested ones under
7038                        // a narrow lock, then run the snapshot-manager
7039                        // + tombstone side-effects without the tx map
7040                        // held so nothing re-enters.
7041                        let drop_result: Option<(Xid, Vec<Xid>)> = {
7042                            let mut guard = self.inner.tx_contexts.write();
7043                            if let Some(ctx) = guard.get_mut(&conn_id) {
7044                                let pos = ctx
7045                                    .savepoints
7046                                    .iter()
7047                                    .position(|(n, _)| n == name)
7048                                    .ok_or_else(|| {
7049                                        RedDBError::Internal(format!(
7050                                            "savepoint {name} does not exist"
7051                                        ))
7052                                    })?;
7053                                let savepoint_xid = ctx.savepoints[pos].1;
7054                                let aborted: Vec<Xid> = ctx
7055                                    .savepoints
7056                                    .split_off(pos)
7057                                    .into_iter()
7058                                    .map(|(_, x)| x)
7059                                    .collect();
7060                                Some((savepoint_xid, aborted))
7061                            } else {
7062                                None
7063                            }
7064                        };
7065
7066                        match drop_result {
7067                            Some((savepoint_xid, aborted)) => {
7068                                for x in &aborted {
7069                                    mgr.rollback(*x);
7070                                }
7071                                let reverted_updates =
7072                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
7073                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
7074                                (
7075                                    "rollback_to_savepoint",
7076                                    format!(
7077                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
7078                                        aborted.len(),
7079                                    ),
7080                                )
7081                            }
7082                            None => (
7083                                "rollback_to_savepoint",
7084                                "ROLLBACK TO outside transaction — no-op".to_string(),
7085                            ),
7086                        }
7087                    }
7088                };
7089                Ok(RuntimeQueryResult::ok_message(
7090                    query.to_string(),
7091                    &msg,
7092                    kind,
7093                ))
7094            }
7095            // Schema + Sequence DDL (Phase 1.3 PG parity).
7096            //
7097            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
7098            // just registers the name in `red_config` under `schema.{name}`.
7099            // Table lookups still happen by collection name; clients using
7100            // `schema.table` qualified names collapse to collection `schema.table`.
7101            //
7102            // Sequences persist a 64-bit counter + metadata (start, increment)
7103            // in `red_config` under `sequence.{name}.*`. Scalar callers
7104            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
7105            // once we have a proper mutating-function dispatch path; for now the
7106            // DDL just establishes the catalog entry so clients don't error.
7107            QueryExpr::CreateSchema(ref q) => {
7108                let store = self.inner.db.store();
7109                let key = format!("schema.{}", q.name);
7110                if store.get_config(&key).is_some() {
7111                    if q.if_not_exists {
7112                        return Ok(RuntimeQueryResult::ok_message(
7113                            query.to_string(),
7114                            &format!("schema {} already exists — skipped", q.name),
7115                            "create_schema",
7116                        ));
7117                    }
7118                    return Err(RedDBError::Internal(format!(
7119                        "schema {} already exists",
7120                        q.name
7121                    )));
7122                }
7123                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
7124                Ok(RuntimeQueryResult::ok_message(
7125                    query.to_string(),
7126                    &format!("schema {} created", q.name),
7127                    "create_schema",
7128                ))
7129            }
7130            QueryExpr::DropSchema(ref q) => {
7131                let store = self.inner.db.store();
7132                let key = format!("schema.{}", q.name);
7133                let existed = store.get_config(&key).is_some();
7134                if !existed && !q.if_exists {
7135                    return Err(RedDBError::Internal(format!(
7136                        "schema {} does not exist",
7137                        q.name
7138                    )));
7139                }
7140                // Remove marker from red_config via set to null.
7141                store.set_config_tree(&key, &crate::serde_json::Value::Null);
7142                let suffix = if q.cascade {
7143                    " (CASCADE accepted — tables untouched)"
7144                } else {
7145                    ""
7146                };
7147                Ok(RuntimeQueryResult::ok_message(
7148                    query.to_string(),
7149                    &format!("schema {} dropped{}", q.name, suffix),
7150                    "drop_schema",
7151                ))
7152            }
7153            QueryExpr::CreateSequence(ref q) => {
7154                let store = self.inner.db.store();
7155                let base = format!("sequence.{}", q.name);
7156                let start_key = format!("{base}.start");
7157                let incr_key = format!("{base}.increment");
7158                let curr_key = format!("{base}.current");
7159                if store.get_config(&start_key).is_some() {
7160                    if q.if_not_exists {
7161                        return Ok(RuntimeQueryResult::ok_message(
7162                            query.to_string(),
7163                            &format!("sequence {} already exists — skipped", q.name),
7164                            "create_sequence",
7165                        ));
7166                    }
7167                    return Err(RedDBError::Internal(format!(
7168                        "sequence {} already exists",
7169                        q.name
7170                    )));
7171                }
7172                // Persist start + increment, and set current so the first
7173                // nextval returns `start`.
7174                let initial_current = q.start - q.increment;
7175                store.set_config_tree(
7176                    &start_key,
7177                    &crate::serde_json::Value::Number(q.start as f64),
7178                );
7179                store.set_config_tree(
7180                    &incr_key,
7181                    &crate::serde_json::Value::Number(q.increment as f64),
7182                );
7183                store.set_config_tree(
7184                    &curr_key,
7185                    &crate::serde_json::Value::Number(initial_current as f64),
7186                );
7187                Ok(RuntimeQueryResult::ok_message(
7188                    query.to_string(),
7189                    &format!(
7190                        "sequence {} created (start={}, increment={})",
7191                        q.name, q.start, q.increment
7192                    ),
7193                    "create_sequence",
7194                ))
7195            }
7196            QueryExpr::DropSequence(ref q) => {
7197                let store = self.inner.db.store();
7198                let base = format!("sequence.{}", q.name);
7199                let existed = store.get_config(&format!("{base}.start")).is_some();
7200                if !existed && !q.if_exists {
7201                    return Err(RedDBError::Internal(format!(
7202                        "sequence {} does not exist",
7203                        q.name
7204                    )));
7205                }
7206                for k in ["start", "increment", "current"] {
7207                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
7208                }
7209                Ok(RuntimeQueryResult::ok_message(
7210                    query.to_string(),
7211                    &format!("sequence {} dropped", q.name),
7212                    "drop_sequence",
7213                ))
7214            }
7215            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
7216            //
7217            // The view definition is stored in-memory on RuntimeInner (not
7218            // persisted). SELECTs that reference the view name will substitute
7219            // the stored `QueryExpr` via `resolve_view_reference` during
7220            // planning (same entry point used by table-name resolution).
7221            //
7222            // Materialized views additionally allocate a slot in
7223            // `MaterializedViewCache`; a REFRESH repopulates that slot.
7224            QueryExpr::CreateView(ref q) => {
7225                let mut views = self.inner.views.write();
7226                if views.contains_key(&q.name) && !q.or_replace {
7227                    if q.if_not_exists {
7228                        return Ok(RuntimeQueryResult::ok_message(
7229                            query.to_string(),
7230                            &format!("view {} already exists — skipped", q.name),
7231                            "create_view",
7232                        ));
7233                    }
7234                    return Err(RedDBError::Internal(format!(
7235                        "view {} already exists",
7236                        q.name
7237                    )));
7238                }
7239                views.insert(q.name.clone(), Arc::new(q.clone()));
7240                drop(views);
7241
7242                // Materialized view: register cache slot (data is empty until REFRESH).
7243                if q.materialized {
7244                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
7245                    let refresh = match q.refresh_every_ms {
7246                        Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
7247                        None => RefreshPolicy::Manual,
7248                    };
7249                    let dependencies = collect_table_refs(&q.query);
7250                    let def = MaterializedViewDef {
7251                        name: q.name.clone(),
7252                        query: format!("<parsed view {}>", q.name),
7253                        dependencies: dependencies.clone(),
7254                        refresh,
7255                        retention_duration_ms: q.retention_duration_ms,
7256                    };
7257                    self.inner.materialized_views.write().register(def);
7258
7259                    // Issue #593 slice 9a — persist the descriptor to
7260                    // the system catalog so the definition survives a
7261                    // restart. Upsert semantics (delete-then-insert by
7262                    // name) keep the catalog free of duplicate rows
7263                    // across `CREATE OR REPLACE` churn.
7264                    let descriptor =
7265                        crate::runtime::continuous_materialized_view::MaterializedViewDescriptor {
7266                            name: q.name.clone(),
7267                            source_sql: query.to_string(),
7268                            source_collections: dependencies,
7269                            refresh_every_ms: q.refresh_every_ms,
7270                            retention_duration_ms: q.retention_duration_ms,
7271                        };
7272                    let store = self.inner.db.store();
7273                    crate::runtime::continuous_materialized_view::persist_descriptor(
7274                        store.as_ref(),
7275                        &descriptor,
7276                    )?;
7277
7278                    // Issue #594 slice 9b — provision a Table-shaped
7279                    // backing collection named after the view. The
7280                    // rewriter skips materialized views (see
7281                    // `rewrite_view_refs_inner`) so `SELECT FROM v`
7282                    // resolves to this collection directly. Empty
7283                    // until REFRESH wires through it in 9c.
7284                    self.ensure_materialized_view_backing(&q.name)?;
7285                }
7286                // Plan cache may have cached a plan that didn't know about this
7287                // view — invalidate so future references pick up the new binding.
7288                // Result cache gets flushed too: OR REPLACE must not serve a
7289                // prior execution of the obsolete body.
7290                self.invalidate_plan_cache();
7291                self.invalidate_result_cache();
7292
7293                Ok(RuntimeQueryResult::ok_message(
7294                    query.to_string(),
7295                    &format!(
7296                        "{}view {} created",
7297                        if q.materialized { "materialized " } else { "" },
7298                        q.name
7299                    ),
7300                    "create_view",
7301                ))
7302            }
7303            QueryExpr::DropView(ref q) => {
7304                let mut views = self.inner.views.write();
7305                let removed = views.remove(&q.name);
7306                let existed = removed.is_some();
7307                let removed_materialized =
7308                    removed.as_ref().map(|v| v.materialized).unwrap_or(false);
7309                drop(views);
7310                if q.materialized || existed {
7311                    // Try the materialised cache too — silent if absent.
7312                    self.inner.materialized_views.write().remove(&q.name);
7313                    // Issue #593 slice 9a — remove any persisted
7314                    // catalog row. Idempotent: a no-op when the view
7315                    // was never materialized (no row was ever written).
7316                    let store = self.inner.db.store();
7317                    crate::runtime::continuous_materialized_view::remove_by_name(
7318                        store.as_ref(),
7319                        &q.name,
7320                    )?;
7321                }
7322                // Issue #594 slice 9b — drop the backing collection
7323                // that was provisioned at CREATE time. Only mat views
7324                // ever had one; regular views never did.
7325                if removed_materialized || q.materialized {
7326                    self.drop_materialized_view_backing(&q.name)?;
7327                }
7328                // Drop any plan / result cache entries that baked the
7329                // view body into their QueryExpr.
7330                self.invalidate_plan_cache();
7331                self.invalidate_result_cache();
7332                if !existed && !q.if_exists {
7333                    return Err(RedDBError::Internal(format!(
7334                        "view {} does not exist",
7335                        q.name
7336                    )));
7337                }
7338                self.invalidate_plan_cache();
7339                Ok(RuntimeQueryResult::ok_message(
7340                    query.to_string(),
7341                    &format!("view {} dropped", q.name),
7342                    "drop_view",
7343                ))
7344            }
7345            QueryExpr::RefreshMaterializedView(ref q) => {
7346                // Look up the view definition, execute its underlying query,
7347                // and stash the serialized result in the materialised cache.
7348                let view = {
7349                    let views = self.inner.views.read();
7350                    views.get(&q.name).cloned()
7351                };
7352                let view = match view {
7353                    Some(v) => v,
7354                    None => {
7355                        return Err(RedDBError::Internal(format!(
7356                            "view {} does not exist",
7357                            q.name
7358                        )))
7359                    }
7360                };
7361                if !view.materialized {
7362                    return Err(RedDBError::Internal(format!(
7363                        "view {} is not materialized — REFRESH requires \
7364                         CREATE MATERIALIZED VIEW",
7365                        q.name
7366                    )));
7367                }
7368                // Execute the underlying query fresh.
7369                let started = std::time::Instant::now();
7370                let now_ms = std::time::SystemTime::now()
7371                    .duration_since(std::time::UNIX_EPOCH)
7372                    .map(|d| d.as_millis() as u64)
7373                    .unwrap_or(0);
7374                match self.execute_query_expr((*view.query).clone()) {
7375                    Ok(inner_result) => {
7376                        // Issue #595 slice 9c — atomically replace the
7377                        // backing collection's contents under a single
7378                        // WAL group. Concurrent SELECT from the view
7379                        // sees either the prior or new contents, never
7380                        // partial. A crash before the WAL commit lands
7381                        // leaves the prior contents intact on recovery.
7382                        let entities =
7383                            view_records_to_entities(&q.name, &inner_result.result.records);
7384                        let row_count = entities.len() as u64;
7385                        let store = self.inner.db.store();
7386                        let serialized_records = match store.refresh_collection(&q.name, entities) {
7387                            Ok(records) => records,
7388                            Err(err) => {
7389                                let duration_ms = started.elapsed().as_millis() as u64;
7390                                let msg = err.to_string();
7391                                self.inner
7392                                    .materialized_views
7393                                    .write()
7394                                    .record_refresh_failure(
7395                                        &q.name,
7396                                        msg.clone(),
7397                                        duration_ms,
7398                                        now_ms,
7399                                    );
7400                                return Err(RedDBError::Internal(format!(
7401                                    "REFRESH MATERIALIZED VIEW {}: {msg}",
7402                                    q.name
7403                                )));
7404                            }
7405                        };
7406
7407                        // Issue #596 slice 9d — emit a Refresh
7408                        // ChangeRecord into the logical-WAL spool so
7409                        // replicas deterministically replay the same
7410                        // backing-collection contents via
7411                        // `LogicalChangeApplier::apply_record`.
7412                        if let Some(ref primary) = self.inner.db.replication {
7413                            let lsn = self.inner.cdc.emit(
7414                                crate::replication::cdc::ChangeOperation::Refresh,
7415                                &q.name,
7416                                0,
7417                                "refresh",
7418                            );
7419                            self.invalidate_result_cache_for_table(&q.name);
7420                            let timestamp = std::time::SystemTime::now()
7421                                .duration_since(std::time::UNIX_EPOCH)
7422                                .unwrap_or_default()
7423                                .as_millis() as u64;
7424                            let record = ChangeRecord::for_refresh(
7425                                lsn,
7426                                timestamp,
7427                                q.name.clone(),
7428                                serialized_records,
7429                            )
7430                            .with_term(self.current_replication_term());
7431                            let encoded = record.encode();
7432                            primary.append_logical_record(record.lsn, encoded);
7433                        }
7434
7435                        let duration_ms = started.elapsed().as_millis() as u64;
7436                        let serialized = format!("{:?}", inner_result.result);
7437                        self.inner
7438                            .materialized_views
7439                            .write()
7440                            .record_refresh_success(
7441                                &q.name,
7442                                serialized.into_bytes(),
7443                                row_count,
7444                                duration_ms,
7445                                now_ms,
7446                            );
7447                        // SELECT FROM v now reads through the rewriter
7448                        // skip into the backing collection — drop the
7449                        // result cache so prior empty-backing reads
7450                        // don't shadow the new contents.
7451                        self.invalidate_result_cache();
7452                        Ok(RuntimeQueryResult::ok_message(
7453                            query.to_string(),
7454                            &format!("materialized view {} refreshed", q.name),
7455                            "refresh_materialized_view",
7456                        ))
7457                    }
7458                    Err(err) => {
7459                        let duration_ms = started.elapsed().as_millis() as u64;
7460                        let msg = err.to_string();
7461                        self.inner
7462                            .materialized_views
7463                            .write()
7464                            .record_refresh_failure(&q.name, msg.clone(), duration_ms, now_ms);
7465                        Err(err)
7466                    }
7467                }
7468            }
7469            // Row Level Security (Phase 2.5 PG parity).
7470            //
7471            // Policies live in an in-memory registry keyed by (table, name).
7472            // Enforcement (AND-ing the policy's USING clause into every
7473            // query's WHERE for the table) arrives in Phase 2.5.2 via the
7474            // filter compiler; this dispatch only manages the catalog.
7475            QueryExpr::CreatePolicy(ref q) => {
7476                let key = (q.table.clone(), q.name.clone());
7477                self.inner
7478                    .rls_policies
7479                    .write()
7480                    .insert(key, Arc::new(q.clone()));
7481                self.invalidate_plan_cache();
7482                // Issue #120 — surface policy names in the
7483                // schema-vocabulary so AskPipeline (#121) can resolve
7484                // a policy reference back to its table.
7485                self.schema_vocabulary_apply(
7486                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
7487                        collection: q.table.clone(),
7488                        policy: q.name.clone(),
7489                    },
7490                );
7491                Ok(RuntimeQueryResult::ok_message(
7492                    query.to_string(),
7493                    &format!("policy {} on {} created", q.name, q.table),
7494                    "create_policy",
7495                ))
7496            }
7497            QueryExpr::DropPolicy(ref q) => {
7498                let removed = self
7499                    .inner
7500                    .rls_policies
7501                    .write()
7502                    .remove(&(q.table.clone(), q.name.clone()))
7503                    .is_some();
7504                if !removed && !q.if_exists {
7505                    return Err(RedDBError::Internal(format!(
7506                        "policy {} on {} does not exist",
7507                        q.name, q.table
7508                    )));
7509                }
7510                self.invalidate_plan_cache();
7511                // Issue #120 — keep the schema-vocabulary policy
7512                // entry in sync.
7513                self.schema_vocabulary_apply(
7514                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
7515                        collection: q.table.clone(),
7516                        policy: q.name.clone(),
7517                    },
7518                );
7519                Ok(RuntimeQueryResult::ok_message(
7520                    query.to_string(),
7521                    &format!("policy {} on {} dropped", q.name, q.table),
7522                    "drop_policy",
7523                ))
7524            }
7525            // Foreign Data Wrappers (Phase 3.2 PG parity).
7526            //
7527            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
7528            // `ForeignTableRegistry`. The read path consults that registry
7529            // before dispatching a SELECT — when the table name matches a
7530            // registered foreign table, we forward the scan to the wrapper
7531            // and skip the normal collection lookup.
7532            //
7533            // Phase 3.2 is in-memory only; persistence across restarts is a
7534            // 3.2.2 follow-up that mirrors the view registry pattern.
7535            QueryExpr::CreateServer(ref q) => {
7536                use crate::storage::fdw::FdwOptions;
7537                let registry = Arc::clone(&self.inner.foreign_tables);
7538                if registry.server(&q.name).is_some() {
7539                    if q.if_not_exists {
7540                        return Ok(RuntimeQueryResult::ok_message(
7541                            query.to_string(),
7542                            &format!("server {} already exists — skipped", q.name),
7543                            "create_server",
7544                        ));
7545                    }
7546                    return Err(RedDBError::Internal(format!(
7547                        "server {} already exists",
7548                        q.name
7549                    )));
7550                }
7551                let mut opts = FdwOptions::new();
7552                for (k, v) in &q.options {
7553                    opts.values.insert(k.clone(), v.clone());
7554                }
7555                registry
7556                    .create_server(&q.name, &q.wrapper, opts)
7557                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
7558                Ok(RuntimeQueryResult::ok_message(
7559                    query.to_string(),
7560                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
7561                    "create_server",
7562                ))
7563            }
7564            QueryExpr::DropServer(ref q) => {
7565                let existed = self.inner.foreign_tables.drop_server(&q.name);
7566                if !existed && !q.if_exists {
7567                    return Err(RedDBError::Internal(format!(
7568                        "server {} does not exist",
7569                        q.name
7570                    )));
7571                }
7572                Ok(RuntimeQueryResult::ok_message(
7573                    query.to_string(),
7574                    &format!(
7575                        "server {} dropped{}",
7576                        q.name,
7577                        if q.cascade { " (cascade)" } else { "" }
7578                    ),
7579                    "drop_server",
7580                ))
7581            }
7582            QueryExpr::CreateForeignTable(ref q) => {
7583                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
7584                let registry = Arc::clone(&self.inner.foreign_tables);
7585                if registry.foreign_table(&q.name).is_some() {
7586                    if q.if_not_exists {
7587                        return Ok(RuntimeQueryResult::ok_message(
7588                            query.to_string(),
7589                            &format!("foreign table {} already exists — skipped", q.name),
7590                            "create_foreign_table",
7591                        ));
7592                    }
7593                    return Err(RedDBError::Internal(format!(
7594                        "foreign table {} already exists",
7595                        q.name
7596                    )));
7597                }
7598                let mut opts = FdwOptions::new();
7599                for (k, v) in &q.options {
7600                    opts.values.insert(k.clone(), v.clone());
7601                }
7602                let columns: Vec<ForeignColumn> = q
7603                    .columns
7604                    .iter()
7605                    .map(|c| ForeignColumn {
7606                        name: c.name.clone(),
7607                        data_type: c.data_type.clone(),
7608                        not_null: c.not_null,
7609                    })
7610                    .collect();
7611                registry
7612                    .create_foreign_table(ForeignTable {
7613                        name: q.name.clone(),
7614                        server_name: q.server.clone(),
7615                        columns,
7616                        options: opts,
7617                    })
7618                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
7619                self.invalidate_plan_cache();
7620                Ok(RuntimeQueryResult::ok_message(
7621                    query.to_string(),
7622                    &format!("foreign table {} created (server {})", q.name, q.server),
7623                    "create_foreign_table",
7624                ))
7625            }
7626            QueryExpr::DropForeignTable(ref q) => {
7627                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
7628                if !existed && !q.if_exists {
7629                    return Err(RedDBError::Internal(format!(
7630                        "foreign table {} does not exist",
7631                        q.name
7632                    )));
7633                }
7634                self.invalidate_plan_cache();
7635                Ok(RuntimeQueryResult::ok_message(
7636                    query.to_string(),
7637                    &format!("foreign table {} dropped", q.name),
7638                    "drop_foreign_table",
7639                ))
7640            }
7641            // COPY table FROM 'path' (Phase 1.5 PG parity).
7642            //
7643            // Stream CSV rows through the shared `CsvImporter`. The collection
7644            // is auto-created on first insert (via `insert_auto`-style path);
7645            // VACUUM/ANALYZE afterwards is up to the caller.
7646            QueryExpr::CopyFrom(ref q) => {
7647                use crate::storage::import::{CsvConfig, CsvImporter};
7648                let store = self.inner.db.store();
7649                let cfg = CsvConfig {
7650                    collection: q.table.clone(),
7651                    has_header: q.has_header,
7652                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
7653                    ..CsvConfig::default()
7654                };
7655                let importer = CsvImporter::new(cfg);
7656                let stats = importer
7657                    .import_file(&q.path, store.as_ref())
7658                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
7659                // Tables are written → invalidate cached plans / result cache.
7660                self.note_table_write(&q.table);
7661                Ok(RuntimeQueryResult::ok_message(
7662                    query.to_string(),
7663                    &format!(
7664                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
7665                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
7666                    ),
7667                    "copy_from",
7668                ))
7669            }
7670            // Maintenance commands (Phase 1.2 PG parity).
7671            //
7672            // - VACUUM [FULL] [table]: refreshes planner stats for the target
7673            //   collection(s) and — when FULL — triggers a full pager persist
7674            //   (flushes dirty pages + fsync). Also invalidates the result cache
7675            //   so subsequent reads re-execute against the freshly compacted
7676            //   storage. RedDB's segment/btree GC runs continuously via the
7677            //   background lifecycle; explicit space reclamation for sealed
7678            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
7679            // - ANALYZE [table]: reruns `analyze_collection` +
7680            //   `persist_table_stats` via `refresh_table_planner_stats` so the
7681            //   planner has fresh histograms, distinct estimates, null counts.
7682            //
7683            // Both commands accept an optional target; omitting the target
7684            // iterates every collection in the store.
7685            QueryExpr::MaintenanceCommand(ref cmd) => {
7686                use crate::storage::query::ast::MaintenanceCommand as Mc;
7687                let store = self.inner.db.store();
7688                let (kind, msg) = match cmd {
7689                    Mc::Analyze { target } => {
7690                        let targets: Vec<String> = match target {
7691                            Some(t) => vec![t.clone()],
7692                            None => store.list_collections(),
7693                        };
7694                        for t in &targets {
7695                            self.refresh_table_planner_stats(t);
7696                        }
7697                        (
7698                            "analyze",
7699                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
7700                        )
7701                    }
7702                    Mc::Vacuum { target, full } => {
7703                        let targets: Vec<String> = match target {
7704                            Some(t) => vec![t.clone()],
7705                            None => store.list_collections(),
7706                        };
7707                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
7708                        let mut vacuum_stats =
7709                            crate::storage::unified::store::MvccVacuumStats::default();
7710                        for t in &targets {
7711                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
7712                                RedDBError::Internal(format!(
7713                                    "VACUUM MVCC history failed for {t}: {e}"
7714                                ))
7715                            })?;
7716                            if stats.reclaimed_versions > 0 {
7717                                self.rebuild_runtime_indexes_for_table(t)?;
7718                            }
7719                            vacuum_stats.add(&stats);
7720                        }
7721                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
7722                        // Stats refresh covers every target (same as ANALYZE).
7723                        for t in &targets {
7724                            self.refresh_table_planner_stats(t);
7725                        }
7726                        // FULL forces a pager persist (dirty-page flush + fsync).
7727                        // Regular VACUUM relies on the background writer / segment
7728                        // lifecycle so the command is non-blocking.
7729                        let persisted = if *full {
7730                            match store.persist() {
7731                                Ok(()) => true,
7732                                Err(e) => {
7733                                    return Err(RedDBError::Internal(format!(
7734                                        "VACUUM FULL persist failed: {e:?}"
7735                                    )));
7736                                }
7737                            }
7738                        } else {
7739                            false
7740                        };
7741                        // Result cache depended on pre-vacuum state.
7742                        self.invalidate_result_cache();
7743                        (
7744                            "vacuum",
7745                            format!(
7746                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
7747                                if *full { " FULL" } else { "" },
7748                                targets.len(),
7749                                vacuum_stats.scanned_versions,
7750                                vacuum_stats.retained_versions,
7751                                vacuum_stats.reclaimed_versions,
7752                                vacuum_stats.retained_history_versions,
7753                                vacuum_stats.reclaimed_history_versions,
7754                                vacuum_stats.retained_tombstones,
7755                                vacuum_stats.reclaimed_tombstones,
7756                                if persisted {
7757                                    " (pages flushed to disk)"
7758                                } else {
7759                                    ""
7760                                }
7761                            ),
7762                        )
7763                    }
7764                };
7765                Ok(RuntimeQueryResult::ok_message(
7766                    query.to_string(),
7767                    &msg,
7768                    kind,
7769                ))
7770            }
7771            // GRANT / REVOKE / ALTER USER (RBAC milestone).
7772            //
7773            // These hit the AuthStore directly. The privilege-check
7774            // gate at the top of `execute_query_expr` already decided
7775            // whether the caller may even run the statement; here we
7776            // just translate the AST into AuthStore calls.
7777            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
7778            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
7779            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
7780            QueryExpr::CreateIamPolicy { ref id, ref json } => {
7781                self.execute_create_iam_policy(query, id, json)
7782            }
7783            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
7784            QueryExpr::AttachPolicy {
7785                ref policy_id,
7786                ref principal,
7787            } => self.execute_attach_policy(query, policy_id, principal),
7788            QueryExpr::DetachPolicy {
7789                ref policy_id,
7790                ref principal,
7791            } => self.execute_detach_policy(query, policy_id, principal),
7792            QueryExpr::ShowPolicies { ref filter } => {
7793                self.execute_show_policies(query, filter.as_ref())
7794            }
7795            QueryExpr::ShowEffectivePermissions {
7796                ref user,
7797                ref resource,
7798            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
7799            QueryExpr::SimulatePolicy {
7800                ref user,
7801                ref action,
7802                ref resource,
7803            } => self.execute_simulate_policy(query, user, action, resource),
7804            QueryExpr::LintPolicy { ref source } => self.execute_lint_policy(query, source),
7805            QueryExpr::MigratePolicyMode {
7806                ref target,
7807                dry_run,
7808            } => self.execute_migrate_policy_mode(query, target, dry_run),
7809            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
7810            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
7811            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
7812            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
7813        };
7814
7815        if !control_event_specs.is_empty() {
7816            let (outcome, reason) = match &query_result {
7817                Ok(_) => (crate::runtime::control_events::Outcome::Allowed, None),
7818                Err(err) => (control_event_outcome_for_error(err), Some(err.to_string())),
7819            };
7820            for spec in &control_event_specs {
7821                self.emit_control_event(
7822                    spec.kind,
7823                    outcome,
7824                    spec.action,
7825                    spec.resource.clone(),
7826                    reason.clone(),
7827                    spec.fields.clone(),
7828                )?;
7829            }
7830        }
7831
7832        if let (Some(plan), Ok(result)) = (&query_audit_plan, &query_result) {
7833            self.emit_query_audit(
7834                query,
7835                plan,
7836                query_audit_started.elapsed().as_millis() as u64,
7837                result,
7838            );
7839        }
7840
7841        // Decrypt Value::Secret columns in-place before caching, so
7842        // cached results match the post-decrypt shape and repeat
7843        // queries skip the per-row AES-GCM pass.
7844        let mut query_result = query_result;
7845        if let Ok(ref mut result) = query_result {
7846            if result.statement_type == "select" {
7847                self.apply_secret_decryption(result);
7848            }
7849        }
7850
7851        // Cache SELECT results for 30s.
7852        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
7853        // Large multi-row results (range scans, filtered scans) are rarely
7854        // repeated with the same literal values so the cache hit rate is near
7855        // zero while the clone cost (100 records × ~16 fields each) is high.
7856        // Aggregations (1 row) and point lookups (1 row) still benefit.
7857        if let Ok(ref result) = query_result {
7858            frame.write_result_cache(self, result, result_cache_scopes);
7859        }
7860
7861        query_result
7862    }
7863
7864    /// Snapshot of every registered materialized view's runtime
7865    /// state — feeds the `red.materialized_views` virtual table.
7866    /// Issue #583 slice 10.
7867    pub fn materialized_view_metadata(
7868        &self,
7869    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
7870        // Issue #595 slice 9c — `current_row_count` is now scraped
7871        // live from the backing collection rather than read from the
7872        // cache slot. Mirrors the slice-10 invariant on
7873        // `queue_pending_gauge` in #527: the live store is the source
7874        // of truth, the cache slot only carries last-refresh telemetry
7875        // (timing, error, refresh cadence).
7876        let store = self.inner.db.store();
7877        let mut entries = self.inner.materialized_views.read().metadata();
7878        for entry in &mut entries {
7879            if let Some(manager) = store.get_collection(&entry.name) {
7880                entry.current_row_count = manager.count() as u64;
7881            }
7882        }
7883        entries
7884    }
7885
// NOTE: the paragraph below documents `refresh_due_materialized_views`
// (defined after `sweep_retention_tick`), not the function that follows.
// It is kept as a plain comment so rustdoc does not attach it to
// `retention_sweeper_snapshot`.
//
// Drive scheduled refreshes for materialized views with a
// `REFRESH EVERY <duration>` clause. Called from the background
// scheduler thread (and from unit tests with a fake clock via
// `claim_due_at`). Each invocation atomically claims the set of
// due views (so two concurrent ticks never double-fire the same
// view) and runs each refresh through the standard execution
// path — failures are captured in `last_error` and the prior
// content stays intact. Issue #583 slice 10.
7894    /// Snapshot of every tracked retention sweeper state — feeds the
7895    /// three extra columns on `red.retention`. Issue #584 slice 12.
7896    pub(crate) fn retention_sweeper_snapshot(
7897        &self,
7898    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
7899        self.inner.retention_sweeper.read().snapshot()
7900    }
7901
7902    /// Drive one tick of the retention sweeper. Iterates collections
7903    /// with a retention policy set, physically deletes at most
7904    /// `batch_size` expired rows per collection, and records the
7905    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
7906    /// `red.retention` exposes. Called from the background sweeper
7907    /// thread; safe to invoke directly from tests with a small batch
7908    /// size to drain rows deterministically. Issue #584 slice 12.
7909    ///
7910    /// Deletes are issued as `DELETE FROM <collection> WHERE
7911    /// <ts_column> < <cutoff>` through the standard `execute_query`
7912    /// chokepoint so WAL participation and snapshot guards apply
7913    /// exactly as for a user-issued DELETE — replicas replay the
7914    /// sweeper's deletes via the same WAL stream with no special
7915    /// handling on the replication side.
7916    ///
7917    /// Batching is enforced by tightening the cutoff: if more than
7918    /// `batch_size` rows are expired, the cutoff is dropped to the
7919    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
7920    /// matches roughly `batch_size` rows; the remainder is reported
7921    /// as `current_rows_pending_sweep_estimate` and drained on the
7922    /// next tick.
7923    pub fn sweep_retention_tick(&self, batch_size: usize) {
7924        if batch_size == 0 {
7925            return;
7926        }
7927        let now_ms = std::time::SystemTime::now()
7928            .duration_since(std::time::UNIX_EPOCH)
7929            .map(|d| d.as_millis() as u64)
7930            .unwrap_or(0);
7931
7932        let store = self.inner.db.store();
7933        let collections = store.list_collections();
7934        for name in collections {
7935            let Some(contract) = self.inner.db.collection_contract(&name) else {
7936                continue;
7937            };
7938            let Some(retention_ms) = contract.retention_duration_ms else {
7939                continue;
7940            };
7941            let Some(ts_column) =
7942                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
7943            else {
7944                continue;
7945            };
7946            let Some(manager) = store.get_collection(&name) else {
7947                continue;
7948            };
7949            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
7950
7951            // Single pass: collect expired timestamps. We keep the
7952            // full Vec rather than a bounded heap because the partial
7953            // sort below is the simplest correct way to find the
7954            // batch-th oldest; for the slice's "1000-row default
7955            // batch" target this is bounded enough for production
7956            // operation, and the alternative (in-place heap of size
7957            // batch+1) is a follow-up optimisation.
7958            let mut expired_ts: Vec<i64> = Vec::new();
7959            manager.for_each_entity(|entity| {
7960                let ts = match ts_column.as_str() {
7961                    "created_at" => Some(entity.created_at as i64),
7962                    "updated_at" => Some(entity.updated_at as i64),
7963                    other => entity
7964                        .data
7965                        .as_row()
7966                        .and_then(|row| row.get_field(other))
7967                        .and_then(|v| match v {
7968                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
7969                            crate::storage::schema::Value::Timestamp(t) => {
7970                                Some(t.saturating_mul(1_000))
7971                            }
7972                            crate::storage::schema::Value::BigInt(t) => Some(*t),
7973                            crate::storage::schema::Value::UnsignedInteger(t) => {
7974                                i64::try_from(*t).ok()
7975                            }
7976                            crate::storage::schema::Value::Integer(t) => Some(*t),
7977                            _ => None,
7978                        }),
7979                };
7980                if let Some(t) = ts {
7981                    if t < cutoff {
7982                        expired_ts.push(t);
7983                    }
7984                }
7985                true
7986            });
7987
7988            let total_expired = expired_ts.len() as u64;
7989            if total_expired == 0 {
7990                self.inner
7991                    .retention_sweeper
7992                    .write()
7993                    .record_tick(&name, 0, 0, now_ms);
7994                continue;
7995            }
7996
7997            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
7998                (cutoff, 0u64)
7999            } else {
8000                // Tighten the cutoff to the (batch_size)-th oldest
8001                // expired timestamp + 1 so DELETE matches roughly
8002                // `batch_size` rows.
8003                expired_ts.sort_unstable();
8004                let nth = expired_ts[batch_size - 1];
8005                (
8006                    nth.saturating_add(1),
8007                    total_expired.saturating_sub(batch_size as u64),
8008                )
8009            };
8010
8011            let stmt = format!(
8012                "DELETE FROM {} WHERE {} < {}",
8013                name, ts_column, effective_cutoff
8014            );
8015            let deleted = match self.execute_query(&stmt) {
8016                Ok(r) => r.affected_rows,
8017                Err(_) => 0,
8018            };
8019
8020            self.inner
8021                .retention_sweeper
8022                .write()
8023                .record_tick(&name, deleted, pending, now_ms);
8024        }
8025    }
8026
8027    pub fn refresh_due_materialized_views(&self) {
8028        let due = {
8029            let mut cache = self.inner.materialized_views.write();
8030            cache.claim_due_at(std::time::Instant::now())
8031        };
8032        for name in due {
8033            // Round-trip through `execute_query` (rather than the
8034            // prepared-statement `execute_query_expr` fast path, which
8035            // explicitly rejects DDL/maintenance statements). Failures
8036            // are captured inside the RefreshMaterializedView handler
8037            // via `record_refresh_failure`; the scheduler ignores the
8038            // Result so one bad view doesn't halt the loop.
8039            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
8040            let _ = self.execute_query(&stmt);
8041        }
8042    }
8043
8044    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
8045    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
8046    /// calls pay zero parse + cache overhead.
8047    ///
8048    /// Applies secret decryption on SELECT results, identical to `execute_query`.
8049    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
8050        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
8051        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
8052        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
8053        // whose `tq.table` matches a registered view with the view's
8054        // underlying query. Safe to call even when no views are registered.
8055        let expr = self.rewrite_view_refs(expr);
8056
8057        self.validate_model_operations_before_auth(&expr)?;
8058        // Granular RBAC privilege check. Runs before dispatch so a
8059        // denied caller never reaches storage. Fail-closed: any error
8060        // resolving the action / resource produces PermissionDenied.
8061        if let Err(err) = self.check_query_privilege(&expr) {
8062            return Err(RedDBError::Query(format!("permission denied: {err}")));
8063        }
8064
8065        let statement = query_expr_name(&expr);
8066        let mode = detect_mode(statement);
8067        let query_str = statement;
8068
8069        let result = self.dispatch_expr(expr, query_str, mode)?;
8070        let mut r = result;
8071        if r.statement_type == "select" {
8072            self.apply_secret_decryption(&mut r);
8073        }
8074        Ok(r)
8075    }
8076
8077    pub(super) fn validate_model_operations_before_auth(
8078        &self,
8079        expr: &QueryExpr,
8080    ) -> RedDBResult<()> {
8081        use crate::catalog::CollectionModel;
8082        use crate::runtime::ddl::polymorphic_resolver;
8083        use crate::storage::query::ast::KvCommand;
8084
8085        let system_schema_target = match expr {
8086            QueryExpr::DropTable(q) => Some(q.name.as_str()),
8087            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
8088            QueryExpr::DropVector(q) => Some(q.name.as_str()),
8089            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
8090            QueryExpr::DropKv(q) => Some(q.name.as_str()),
8091            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
8092            QueryExpr::Truncate(q) => Some(q.name.as_str()),
8093            _ => None,
8094        };
8095        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
8096            return Err(RedDBError::Query("system schema is read-only".to_string()));
8097        }
8098
8099        let expected = match expr {
8100            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
8101            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
8102            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
8103            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
8104            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
8105            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
8106            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
8107            QueryExpr::KvCommand(cmd) => {
8108                let (collection, model) = match cmd {
8109                    KvCommand::Put {
8110                        collection, model, ..
8111                    }
8112                    | KvCommand::Get {
8113                        collection, model, ..
8114                    }
8115                    | KvCommand::Incr {
8116                        collection, model, ..
8117                    }
8118                    | KvCommand::Cas {
8119                        collection, model, ..
8120                    }
8121                    | KvCommand::Delete {
8122                        collection, model, ..
8123                    } => (collection.as_str(), *model),
8124                    KvCommand::Rotate { collection, .. }
8125                    | KvCommand::History { collection, .. }
8126                    | KvCommand::List { collection, .. }
8127                    | KvCommand::Purge { collection, .. } => {
8128                        (collection.as_str(), CollectionModel::Vault)
8129                    }
8130                    KvCommand::InvalidateTags { collection, .. } => {
8131                        (collection.as_str(), CollectionModel::Kv)
8132                    }
8133                    KvCommand::Watch {
8134                        collection, model, ..
8135                    } => (collection.as_str(), *model),
8136                    KvCommand::Unseal { collection, .. } => {
8137                        (collection.as_str(), CollectionModel::Vault)
8138                    }
8139                };
8140                Some((collection, model))
8141            }
8142            QueryExpr::ConfigCommand(cmd) => {
8143                self.validate_config_command_before_auth(cmd)?;
8144                None
8145            }
8146            _ => None,
8147        };
8148
8149        let Some((name, expected_model)) = expected else {
8150            return Ok(());
8151        };
8152        let snapshot = self.inner.db.catalog_model_snapshot();
8153        let Some(actual_model) = snapshot
8154            .collections
8155            .iter()
8156            .find(|collection| collection.name == name)
8157            .map(|collection| collection.declared_model.unwrap_or(collection.model))
8158        else {
8159            return Ok(());
8160        };
8161        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
8162    }
8163
8164    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
8165    /// `tq.table` matches a registered view name with the view's stored
8166    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
8167    /// resolves correctly. Pure operation — no side effects.
8168    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
8169        // Fast path: no views registered → return original expression.
8170        if self.inner.views.read().is_empty() {
8171            return expr;
8172        }
8173        self.rewrite_view_refs_inner(expr)
8174    }
8175
8176    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
8177        use crate::storage::query::ast::{Filter, TableSource};
8178        match expr {
8179            QueryExpr::Table(mut tq) => {
8180                // 1. If the TableSource is a subquery, recurse into it so
8181                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
8182                //    The legacy `table` field (set to a synthetic
8183                //    "__subq_NNNN" sentinel) stays as-is so callers that
8184                //    read it keep compiling.
8185                if let Some(TableSource::Subquery(body)) = tq.source.take() {
8186                    tq.source = Some(TableSource::Subquery(Box::new(
8187                        self.rewrite_view_refs_inner(*body),
8188                    )));
8189                    return QueryExpr::Table(tq);
8190                }
8191
8192                // 2. Restore the source field (took it above for match).
8193                // When the source was `None` or `TableSource::Name(_)`, the
8194                // real lookup key is `tq.table` — check the view registry.
8195                let maybe_view = {
8196                    let views = self.inner.views.read();
8197                    views.get(&tq.table).cloned()
8198                };
8199                let Some(view) = maybe_view else {
8200                    return QueryExpr::Table(tq);
8201                };
8202
8203                // Issue #594 slice 9b — materialized views are read
8204                // from their backing collection, not by substituting
8205                // the body. Returning the TableQuery as-is lets the
8206                // normal table-read path resolve `SELECT FROM v`
8207                // against the collection provisioned at CREATE time.
8208                if view.materialized {
8209                    return QueryExpr::Table(tq);
8210                }
8211
8212                // Recurse into the view body — views may reference other
8213                // views. The recursion yields the final QueryExpr we need
8214                // to merge the outer's filter / limit / offset into.
8215                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
8216
8217                // Phase 5: when the body is a Table we merge the outer
8218                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
8219                // views filter recursively. Non-table bodies (Search,
8220                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
8221                // with an outer Table query today — return the body
8222                // verbatim; outer predicates are lost. Full projection
8223                // merge lands in Phase 5.2.
8224                match inner_expr {
8225                    QueryExpr::Table(mut inner_tq) => {
8226                        if let Some(outer_filter) = tq.filter.take() {
8227                            inner_tq.filter = Some(match inner_tq.filter.take() {
8228                                Some(existing) => {
8229                                    Filter::And(Box::new(existing), Box::new(outer_filter))
8230                                }
8231                                None => outer_filter,
8232                            });
8233                            // Keep the `Expr` form in lock-step with the
8234                            // merged `Filter`. The executor prefers
8235                            // `where_expr` and nulls `filter` when it is
8236                            // present (see `execute_query_inner`), so a
8237                            // stacked view whose outer predicate was only
8238                            // merged into `filter` would silently drop that
8239                            // predicate at eval time (#635).
8240                            inner_tq.where_expr = inner_tq
8241                                .filter
8242                                .as_ref()
8243                                .map(crate::storage::query::sql_lowering::filter_to_expr);
8244                        }
8245                        if let Some(outer_limit) = tq.limit {
8246                            inner_tq.limit = Some(match inner_tq.limit {
8247                                Some(existing) => existing.min(outer_limit),
8248                                None => outer_limit,
8249                            });
8250                        }
8251                        if let Some(outer_offset) = tq.offset {
8252                            inner_tq.offset = Some(match inner_tq.offset {
8253                                Some(existing) => existing + outer_offset,
8254                                None => outer_offset,
8255                            });
8256                        }
8257                        QueryExpr::Table(inner_tq)
8258                    }
8259                    other => other,
8260                }
8261            }
8262            QueryExpr::Join(mut jq) => {
8263                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
8264                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
8265                QueryExpr::Join(jq)
8266            }
8267            // Other variants don't carry nested QueryExpr that can reference
8268            // a view by table name. Return as-is.
8269            other => other,
8270        }
8271    }
8272
8273    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
8274    /// Shared by `execute_query` (after parse/cache) and `execute_query_expr`
8275    /// (direct call from prepared-statement handler).
8276    fn authorize_relational_table_select(
8277        &self,
8278        mut table: TableQuery,
8279        frame: &dyn super::statement_frame::ReadFrame,
8280    ) -> RedDBResult<Option<TableQuery>> {
8281        if let Some(TableSource::Subquery(inner)) = table.source.take() {
8282            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
8283            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
8284            return Ok(Some(table));
8285        }
8286
8287        self.check_table_column_projection_authz(&table, frame)?;
8288
8289        if self.inner.rls_enabled_tables.read().contains(&table.table) {
8290            return Ok(inject_rls_filters(self, frame, table));
8291        }
8292
8293        Ok(Some(table))
8294    }
8295
8296    fn authorize_relational_join_select(
8297        &self,
8298        mut join: JoinQuery,
8299        frame: &dyn super::statement_frame::ReadFrame,
8300    ) -> RedDBResult<Option<JoinQuery>> {
8301        self.check_join_column_projection_authz(&join, frame)?;
8302        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
8303        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
8304        Ok(inject_rls_into_join(self, frame, join))
8305    }
8306
8307    fn authorize_relational_join_child(
8308        &self,
8309        expr: QueryExpr,
8310        frame: &dyn super::statement_frame::ReadFrame,
8311    ) -> RedDBResult<QueryExpr> {
8312        match expr {
8313            QueryExpr::Table(mut table) => {
8314                if let Some(TableSource::Subquery(inner)) = table.source.take() {
8315                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
8316                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
8317                }
8318                Ok(QueryExpr::Table(table))
8319            }
8320            QueryExpr::Join(join) => self
8321                .authorize_relational_join_select(join, frame)?
8322                .map(QueryExpr::Join)
8323                .ok_or_else(|| {
8324                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
8325                }),
8326            other => Ok(other),
8327        }
8328    }
8329
8330    fn authorize_relational_select_expr(
8331        &self,
8332        expr: QueryExpr,
8333        frame: &dyn super::statement_frame::ReadFrame,
8334    ) -> RedDBResult<QueryExpr> {
8335        match expr {
8336            QueryExpr::Table(table) => self
8337                .authorize_relational_table_select(table, frame)?
8338                .map(QueryExpr::Table)
8339                .ok_or_else(|| {
8340                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
8341                }),
8342            QueryExpr::Join(join) => self
8343                .authorize_relational_join_select(join, frame)?
8344                .map(QueryExpr::Join)
8345                .ok_or_else(|| {
8346                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
8347                }),
8348            other => Ok(other),
8349        }
8350    }
8351
8352    fn check_table_column_projection_authz(
8353        &self,
8354        table: &TableQuery,
8355        frame: &dyn super::statement_frame::ReadFrame,
8356    ) -> RedDBResult<()> {
8357        let Some((username, role)) = frame.identity() else {
8358            return Ok(());
8359        };
8360        let Some(auth_store) = self.inner.auth_store.read().clone() else {
8361            return Ok(());
8362        };
8363
8364        let columns = self.resolved_table_projection_columns(table)?;
8365        let request = ColumnAccessRequest::select(table.table.clone(), columns);
8366        let principal = UserId::from_parts(frame.effective_scope(), username);
8367        let ctx = runtime_iam_context(
8368            role,
8369            frame.effective_scope(),
8370            auth_store.principal_is_system_owned(&principal),
8371        );
8372        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
8373        if outcome.allowed() {
8374            return Ok(());
8375        }
8376
8377        if let Some(denied) = outcome.first_denied_column() {
8378            return Err(RedDBError::Query(format!(
8379                "permission denied: principal=`{username}` cannot select column `{}`",
8380                denied.resource.name
8381            )));
8382        }
8383        Err(RedDBError::Query(format!(
8384            "permission denied: principal=`{username}` cannot select table `{}`",
8385            table.table
8386        )))
8387    }
8388
8389    fn check_join_column_projection_authz(
8390        &self,
8391        join: &JoinQuery,
8392        frame: &dyn super::statement_frame::ReadFrame,
8393    ) -> RedDBResult<()> {
8394        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
8395        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
8396        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
8397
8398        for (table, columns) in by_table {
8399            let query = TableQuery {
8400                table,
8401                source: None,
8402                alias: None,
8403                select_items: Vec::new(),
8404                columns: columns.into_iter().map(Projection::Column).collect(),
8405                where_expr: None,
8406                filter: None,
8407                group_by_exprs: Vec::new(),
8408                group_by: Vec::new(),
8409                having_expr: None,
8410                having: None,
8411                order_by: Vec::new(),
8412                limit: None,
8413                limit_param: None,
8414                offset: None,
8415                offset_param: None,
8416                expand: None,
8417                as_of: None,
8418                sessionize: None,
8419            };
8420            self.check_table_column_projection_authz(&query, frame)?;
8421        }
8422        Ok(())
8423    }
8424
8425    fn collect_join_projection_columns(
8426        &self,
8427        join: &JoinQuery,
8428        projections: &[Projection],
8429        out: &mut HashMap<String, BTreeSet<String>>,
8430    ) -> RedDBResult<()> {
8431        let left = table_side_context(join.left.as_ref());
8432        let right = table_side_context(join.right.as_ref());
8433
8434        if projections
8435            .iter()
8436            .any(|projection| matches!(projection, Projection::All))
8437        {
8438            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
8439                out.entry(side.table.clone())
8440                    .or_default()
8441                    .extend(self.table_all_projection_columns(&side.table)?);
8442            }
8443            return Ok(());
8444        }
8445
8446        for projection in projections {
8447            collect_projection_columns_for_join_side(
8448                projection,
8449                left.as_ref(),
8450                right.as_ref(),
8451                out,
8452            )?;
8453        }
8454        Ok(())
8455    }
8456
8457    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
8458        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
8459        if projections
8460            .iter()
8461            .any(|projection| matches!(projection, Projection::All))
8462        {
8463            return self.table_all_projection_columns(&table.table);
8464        }
8465
8466        let mut columns = BTreeSet::new();
8467        for projection in &projections {
8468            collect_projection_columns_for_table(
8469                projection,
8470                &table.table,
8471                table.alias.as_deref(),
8472                &mut columns,
8473            );
8474        }
8475        Ok(columns.into_iter().collect())
8476    }
8477
8478    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
8479        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
8480            let columns: Vec<String> = contract
8481                .declared_columns
8482                .iter()
8483                .map(|column| column.name.clone())
8484                .collect();
8485            if !columns.is_empty() {
8486                return Ok(columns);
8487            }
8488        }
8489
8490        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
8491        Ok(records
8492            .first()
8493            .map(|record| {
8494                record
8495                    .column_names()
8496                    .into_iter()
8497                    .map(|column| column.to_string())
8498                    .collect()
8499            })
8500            .unwrap_or_default())
8501    }
8502
8503    fn resolve_table_expr_subqueries(
8504        &self,
8505        mut table: TableQuery,
8506        frame: &dyn super::statement_frame::ReadFrame,
8507    ) -> RedDBResult<TableQuery> {
8508        // Only a `Subquery` source needs recursive resolution. `.take()`
8509        // would otherwise drop a `Name` / `Function` source on the floor
8510        // (the `if let` skips the body but the take already cleared it),
8511        // which silently broke `SELECT * FROM components(g)` — the TVF
8512        // dispatch downstream keys off `TableSource::Function` and never
8513        // fired. Restore any non-subquery source unchanged (issue #795).
8514        match table.source.take() {
8515            Some(TableSource::Subquery(inner)) => {
8516                let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
8517                table.source = Some(TableSource::Subquery(Box::new(inner)));
8518            }
8519            other => table.source = other,
8520        }
8521
8522        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
8523        for item in &mut table.select_items {
8524            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
8525                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
8526            }
8527        }
8528        if let Some(where_expr) = table.where_expr.take() {
8529            table.where_expr =
8530                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
8531            table.filter = None;
8532        }
8533        if let Some(having_expr) = table.having_expr.take() {
8534            table.having_expr =
8535                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
8536            table.having = None;
8537        }
8538        for expr in &mut table.group_by_exprs {
8539            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
8540        }
8541        for clause in &mut table.order_by {
8542            if let Some(expr) = clause.expr.take() {
8543                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
8544            }
8545        }
8546        Ok(table)
8547    }
8548
8549    fn resolve_select_expr_subqueries(
8550        &self,
8551        expr: QueryExpr,
8552        frame: &dyn super::statement_frame::ReadFrame,
8553    ) -> RedDBResult<QueryExpr> {
8554        match expr {
8555            QueryExpr::Table(table) => self
8556                .resolve_table_expr_subqueries(table, frame)
8557                .map(QueryExpr::Table),
8558            QueryExpr::Join(mut join) => {
8559                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
8560                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
8561                Ok(QueryExpr::Join(join))
8562            }
8563            other => Ok(other),
8564        }
8565    }
8566
8567    fn resolve_expr_subqueries(
8568        &self,
8569        expr: crate::storage::query::ast::Expr,
8570        outer_scopes: &[String],
8571        frame: &dyn super::statement_frame::ReadFrame,
8572    ) -> RedDBResult<crate::storage::query::ast::Expr> {
8573        use crate::storage::query::ast::Expr;
8574
8575        match expr {
8576            Expr::Subquery { query, span } => {
8577                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
8578                if values.len() > 1 {
8579                    return Err(RedDBError::Query(
8580                        "scalar subquery returned more than one row".to_string(),
8581                    ));
8582                }
8583                Ok(Expr::Literal {
8584                    value: values.into_iter().next().unwrap_or(Value::Null),
8585                    span,
8586                })
8587            }
8588            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
8589                op,
8590                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
8591                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
8592                span,
8593            }),
8594            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
8595                op,
8596                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
8597                span,
8598            }),
8599            Expr::Cast {
8600                inner,
8601                target,
8602                span,
8603            } => Ok(Expr::Cast {
8604                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
8605                target,
8606                span,
8607            }),
8608            Expr::FunctionCall { name, args, span } => {
8609                let args = args
8610                    .into_iter()
8611                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
8612                    .collect::<RedDBResult<Vec<_>>>()?;
8613                Ok(Expr::FunctionCall { name, args, span })
8614            }
8615            Expr::Case {
8616                branches,
8617                else_,
8618                span,
8619            } => {
8620                let branches = branches
8621                    .into_iter()
8622                    .map(|(cond, value)| {
8623                        Ok((
8624                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
8625                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
8626                        ))
8627                    })
8628                    .collect::<RedDBResult<Vec<_>>>()?;
8629                let else_ = else_
8630                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
8631                    .transpose()?
8632                    .map(Box::new);
8633                Ok(Expr::Case {
8634                    branches,
8635                    else_,
8636                    span,
8637                })
8638            }
8639            Expr::IsNull {
8640                operand,
8641                negated,
8642                span,
8643            } => Ok(Expr::IsNull {
8644                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
8645                negated,
8646                span,
8647            }),
8648            Expr::InList {
8649                target,
8650                values,
8651                negated,
8652                span,
8653            } => {
8654                let target =
8655                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
8656                let mut resolved = Vec::new();
8657                for value in values {
8658                    if let Expr::Subquery { query, .. } = value {
8659                        resolved.extend(
8660                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
8661                                .into_iter()
8662                                .map(Expr::lit),
8663                        );
8664                    } else {
8665                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
8666                    }
8667                }
8668                Ok(Expr::InList {
8669                    target,
8670                    values: resolved,
8671                    negated,
8672                    span,
8673                })
8674            }
8675            Expr::Between {
8676                target,
8677                low,
8678                high,
8679                negated,
8680                span,
8681            } => Ok(Expr::Between {
8682                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
8683                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
8684                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
8685                negated,
8686                span,
8687            }),
8688            other => Ok(other),
8689        }
8690    }
8691
8692    fn execute_expr_subquery_values(
8693        &self,
8694        subquery: crate::storage::query::ast::ExprSubquery,
8695        outer_scopes: &[String],
8696        frame: &dyn super::statement_frame::ReadFrame,
8697    ) -> RedDBResult<Vec<Value>> {
8698        let query = *subquery.query;
8699        if query_references_outer_scope(&query, outer_scopes) {
8700            return Err(RedDBError::Query(
8701                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
8702            ));
8703        }
8704        let query = self.rewrite_view_refs(query);
8705        let query = self.resolve_select_expr_subqueries(query, frame)?;
8706        let query = self.authorize_relational_select_expr(query, frame)?;
8707        let result = match query {
8708            QueryExpr::Table(table) => {
8709                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
8710            }
8711            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
8712            other => {
8713                return Err(RedDBError::Query(format!(
8714                    "expression subquery must be a SELECT query, got {}",
8715                    query_expr_name(&other)
8716                )))
8717            }
8718        };
8719        first_column_values(result)
8720    }
8721
8722    fn dispatch_expr(
8723        &self,
8724        expr: QueryExpr,
8725        query_str: &str,
8726        mode: QueryMode,
8727    ) -> RedDBResult<RuntimeQueryResult> {
8728        let statement = query_expr_name(&expr);
8729        match expr {
8730            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
8731                // Graph queries are not cacheable as prepared statements.
8732                Err(RedDBError::Query(
8733                    "graph queries cannot be used as prepared statements".to_string(),
8734                ))
8735            }
8736            QueryExpr::Table(table) => {
8737                let scope = self.ai_scope();
8738                let table = self.resolve_table_expr_subqueries(
8739                    table,
8740                    &scope as &dyn super::statement_frame::ReadFrame,
8741                )?;
8742                // Table-valued functions (e.g. components(g)) dispatch to a
8743                // read-only executor before any catalog/virtual-table routing
8744                // (issue #795).
8745                if let Some(TableSource::Function {
8746                    name,
8747                    args,
8748                    named_args,
8749                }) = table.source.clone()
8750                {
8751                    return Ok(RuntimeQueryResult {
8752                        query: query_str.to_string(),
8753                        mode,
8754                        statement,
8755                        engine: "runtime-graph-tvf",
8756                        result: self.execute_table_function(&name, &args, &named_args)?,
8757                        affected_rows: 0,
8758                        statement_type: "select",
8759                        bookmark: None,
8760                    });
8761                }
8762                // Inline-graph TVF (issue #799) on the prepared-statement /
8763                // direct-expr path. Result caching is wired on the
8764                // `execute_query_inner` path; here we just compute and return.
8765                if let Some(TableSource::InlineGraphFunction {
8766                    name,
8767                    nodes,
8768                    edges,
8769                    named_args,
8770                }) = table.source.clone()
8771                {
8772                    return Ok(RuntimeQueryResult {
8773                        query: query_str.to_string(),
8774                        mode,
8775                        statement,
8776                        engine: "runtime-graph-tvf-inline",
8777                        result: self.execute_inline_graph_function(
8778                            &name,
8779                            &nodes,
8780                            &edges,
8781                            &named_args,
8782                        )?,
8783                        affected_rows: 0,
8784                        statement_type: "select",
8785                        bookmark: None,
8786                    });
8787                }
8788                if super::red_schema::is_virtual_table(&table.table) {
8789                    return Ok(RuntimeQueryResult {
8790                        query: query_str.to_string(),
8791                        mode,
8792                        statement,
8793                        engine: "runtime-red-schema",
8794                        result: super::red_schema::red_query(
8795                            self,
8796                            &table.table,
8797                            &table,
8798                            &scope as &dyn super::statement_frame::ReadFrame,
8799                        )?,
8800                        affected_rows: 0,
8801                        statement_type: "select",
8802                        bookmark: None,
8803                    });
8804                }
8805                // `<graph>.<output>` analytics virtual view (issue #800).
8806                if let Some(view_result) = self.try_resolve_analytics_view(
8807                    &table,
8808                    &scope as &dyn super::statement_frame::ReadFrame,
8809                )? {
8810                    return Ok(RuntimeQueryResult {
8811                        query: query_str.to_string(),
8812                        mode,
8813                        statement,
8814                        engine: "runtime-graph-analytics-view",
8815                        result: view_result,
8816                        affected_rows: 0,
8817                        statement_type: "select",
8818                        bookmark: None,
8819                    });
8820                }
8821                let Some(table_with_rls) = self.authorize_relational_table_select(
8822                    table,
8823                    &scope as &dyn super::statement_frame::ReadFrame,
8824                )?
8825                else {
8826                    return Ok(RuntimeQueryResult {
8827                        query: query_str.to_string(),
8828                        mode,
8829                        statement,
8830                        engine: "runtime-table-rls",
8831                        result: crate::storage::query::unified::UnifiedResult::empty(),
8832                        affected_rows: 0,
8833                        statement_type: "select",
8834                        bookmark: None,
8835                    });
8836                };
8837                Ok(RuntimeQueryResult {
8838                    query: query_str.to_string(),
8839                    mode,
8840                    statement,
8841                    engine: "runtime-table",
8842                    result: execute_runtime_table_query(
8843                        &self.inner.db,
8844                        &table_with_rls,
8845                        Some(&self.inner.index_store),
8846                    )?,
8847                    affected_rows: 0,
8848                    statement_type: "select",
8849                    bookmark: None,
8850                })
8851            }
8852            QueryExpr::Join(join) => {
8853                let scope = self.ai_scope();
8854                let Some(join_with_rls) = self.authorize_relational_join_select(
8855                    join,
8856                    &scope as &dyn super::statement_frame::ReadFrame,
8857                )?
8858                else {
8859                    return Ok(RuntimeQueryResult {
8860                        query: query_str.to_string(),
8861                        mode,
8862                        statement,
8863                        engine: "runtime-join-rls",
8864                        result: crate::storage::query::unified::UnifiedResult::empty(),
8865                        affected_rows: 0,
8866                        statement_type: "select",
8867                        bookmark: None,
8868                    });
8869                };
8870                Ok(RuntimeQueryResult {
8871                    query: query_str.to_string(),
8872                    mode,
8873                    statement,
8874                    engine: "runtime-join",
8875                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
8876                    affected_rows: 0,
8877                    statement_type: "select",
8878                    bookmark: None,
8879                })
8880            }
8881            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
8882                query: query_str.to_string(),
8883                mode,
8884                statement,
8885                engine: "runtime-vector",
8886                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
8887                affected_rows: 0,
8888                statement_type: "select",
8889                bookmark: None,
8890            }),
8891            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
8892                query: query_str.to_string(),
8893                mode,
8894                statement,
8895                engine: "runtime-hybrid",
8896                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
8897                affected_rows: 0,
8898                statement_type: "select",
8899                bookmark: None,
8900            }),
8901            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
8902                Err(RedDBError::Query(
8903                    super::red_schema::READ_ONLY_ERROR.to_string(),
8904                ))
8905            }
8906            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
8907                Err(RedDBError::Query(
8908                    super::red_schema::READ_ONLY_ERROR.to_string(),
8909                ))
8910            }
8911            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
8912                Err(RedDBError::Query(
8913                    super::red_schema::READ_ONLY_ERROR.to_string(),
8914                ))
8915            }
8916            QueryExpr::Insert(ref insert) => self
8917                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
8918                    self.execute_insert(query_str, insert)
8919                }),
8920            QueryExpr::Update(ref update) => self
8921                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
8922                    self.execute_update(query_str, update)
8923                }),
8924            QueryExpr::Delete(ref delete) => self
8925                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
8926                    self.execute_delete(query_str, delete)
8927                }),
8928            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
8929            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
8930            _ => Err(RedDBError::Query(format!(
8931                "prepared-statement execution does not support {statement} statements"
8932            ))),
8933        }
8934    }
8935
8936    /// Dispatch a graph-collection table-valued function call in FROM
8937    /// position (e.g. `SELECT * FROM components(g)`).
8938    ///
8939    /// Validates the function name and arity here, materializes the whole
8940    /// active graph read-only, then runs the algorithm via the shared
8941    /// `dispatch_graph_algorithm` path. Never mutates the catalog or store.
8942    fn execute_table_function(
8943        &self,
8944        name: &str,
8945        args: &[String],
8946        named_args: &[(String, f64)],
8947    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
8948        if !is_graph_tvf_name(name) {
8949            return Err(RedDBError::Query(format!("unknown table function: {name}")));
8950        }
8951        // Every graph-collection TVF takes exactly one graph argument.
8952        if args.len() != 1 {
8953            return Err(RedDBError::Query(format!(
8954                "table function '{name}' takes exactly 1 graph argument, got {}",
8955                args.len()
8956            )));
8957        }
8958
8959        // Read-only materialization of the full active graph. Passing `None`
8960        // for the projection uses the full graph store. Like #795/#796, the
8961        // v0 form runs over the whole graph store regardless of the collection
8962        // argument value. Materialization never mutates any store.
8963        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
8964        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
8965    }
8966
8967    /// Dispatch an inline-graph table-valued function call in FROM position
8968    /// (e.g. `SELECT * FROM components(nodes => (…), edges => (…))`, issue
8969    /// #799).
8970    ///
8971    /// Materializes the two subqueries through the normal read path (so RLS,
8972    /// column authz, and MVCC visibility all apply), constructs the abstract
8973    /// graph — the first column of `nodes` is the node id; the first two-or-
8974    /// three columns of `edges` are `(source, target [, weight])` — then runs
8975    /// the same algorithm path used by the graph-collection form. Read-only.
8976    fn execute_inline_graph_function(
8977        &self,
8978        name: &str,
8979        nodes_query: &QueryExpr,
8980        edges_query: &QueryExpr,
8981        named_args: &[(String, f64)],
8982    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
8983        if !is_graph_tvf_name(name) {
8984            return Err(RedDBError::Query(format!("unknown table function: {name}")));
8985        }
8986
8987        let node_result = self.execute_query_expr(nodes_query.clone())?.result;
8988        let nodes = inline_node_ids(name, &node_result)?;
8989
8990        let edge_result = self.execute_query_expr(edges_query.clone())?.result;
8991        let edges = inline_edges(name, &edge_result)?;
8992
8993        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
8994    }
8995
8996    /// Materialize the whole active graph read-only into the abstract
8997    /// `(nodes, edges)` inputs the pure graph algorithms consume.
8998    fn materialize_whole_graph_abstract(
8999        &self,
9000    ) -> RedDBResult<(
9001        Vec<String>,
9002        Vec<(
9003            String,
9004            String,
9005            crate::storage::engine::graph_algorithms::Weight,
9006        )>,
9007    )> {
9008        use crate::storage::engine::graph_algorithms;
9009
9010        let graph = super::graph_dsl::materialize_graph_with_projection(
9011            self.inner.db.store().as_ref(),
9012            None,
9013        )?;
9014        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
9015        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
9016            .iter_all_edges()
9017            .into_iter()
9018            .map(|e| (e.source_id, e.target_id, e.weight))
9019            .collect();
9020        Ok((nodes, edges))
9021    }
9022
9023    /// Resolve a `<graph>.<output>` analytics virtual view (issue #800).
9024    ///
9025    /// Returns `Ok(None)` when `table` is not an analytics view — either the
9026    /// name is not dotted, a real collection of that exact name exists (a real
9027    /// collection always wins; no shadowing), the suffix is not a recognised
9028    /// analytics output, or the parent is not a graph. Returns `Ok(Some(_))`
9029    /// with the freshly computed result when it does resolve, and an error when
9030    /// the parent graph exists but the output is not enabled, a declared
9031    /// algorithm is unsupported, or the parent collection's policy denies the
9032    /// read.
9033    ///
9034    /// The view is recomputed on every call (no result-cache write) so it
9035    /// always reflects the current graph data, satisfying the on-demand
9036    /// recompute contract for this slice.
9037    fn try_resolve_analytics_view(
9038        &self,
9039        table: &TableQuery,
9040        frame: &dyn super::statement_frame::ReadFrame,
9041    ) -> RedDBResult<Option<crate::storage::query::unified::UnifiedResult>> {
9042        let full = table.table.as_str();
9043        let Some(dot) = full.rfind('.') else {
9044            return Ok(None);
9045        };
9046        // A real collection literally named `g.communities` always wins.
9047        if self.inner.db.store().get_collection(full).is_some() {
9048            return Ok(None);
9049        }
9050        let graph_name = &full[..dot];
9051        let output_name = &full[dot + 1..];
9052        let Some(output) = crate::catalog::AnalyticsOutput::from_str(output_name) else {
9053            return Ok(None);
9054        };
9055
9056        let contracts = self.inner.db.collection_contracts();
9057        let Some(contract) = contracts.iter().find(|c| c.name == graph_name) else {
9058            return Ok(None);
9059        };
9060        if contract.declared_model != crate::catalog::CollectionModel::Graph {
9061            return Ok(None);
9062        }
9063        let Some(view) = contract
9064            .analytics_config
9065            .iter()
9066            .find(|view| view.output == output)
9067        else {
9068            // The parent graph exists but this output was not declared — a
9069            // clear error beats the misleading "collection not found".
9070            return Err(RedDBError::Query(format!(
9071                "analytics output '{output_name}' is not enabled on graph '{graph_name}'; declare it with WITH ANALYTICS (...)"
9072            )));
9073        };
9074
9075        // Policy inheritance (AC5): route through the parent graph collection's
9076        // read authorization. A policy or RLS rule that denies the parent
9077        // denies its analytics views transitively.
9078        let parent_query = TableQuery::new(graph_name);
9079        if self
9080            .authorize_relational_table_select(parent_query, frame)?
9081            .is_none()
9082        {
9083            return Err(RedDBError::Query(format!(
9084                "permission denied: policy on graph '{graph_name}' denies analytics view '{output_name}'"
9085            )));
9086        }
9087
9088        let (algorithm, named_args) = analytics_view_algorithm(graph_name, view)?;
9089        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
9090        let result = self.dispatch_graph_algorithm(&algorithm, nodes, edges, &named_args)?;
9091        Ok(Some(result))
9092    }
9093
9094    /// Shared algorithm dispatch over abstract `(nodes, edges)` inputs.
9095    ///
9096    /// Both the graph-collection form and the inline-graph form route here so
9097    /// named-argument validation and the projected row shape stay identical
9098    /// across the two signatures (issue #799). Projects each algorithm's
9099    /// native output shape.
9100    fn dispatch_graph_algorithm(
9101        &self,
9102        name: &str,
9103        nodes: Vec<String>,
9104        edges: Vec<(
9105            String,
9106            String,
9107            crate::storage::engine::graph_algorithms::Weight,
9108        )>,
9109        named_args: &[(String, f64)],
9110    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
9111        use crate::storage::engine::graph_algorithms;
9112        use crate::storage::query::unified::UnifiedResult;
9113        use crate::storage::schema::Value;
9114
9115        if name.eq_ignore_ascii_case("components") {
9116            reject_named_args(name, named_args)?;
9117            let assignment = graph_algorithms::connected_components(&nodes, &edges);
9118            let mut result =
9119                UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
9120            for (node_id, island_id) in assignment {
9121                let mut record = UnifiedRecord::new();
9122                record.set("node_id", Value::text(node_id));
9123                record.set("island_id", Value::Integer(island_id as i64));
9124                result.push(record);
9125            }
9126            return Ok(result);
9127        }
9128
9129        if name.eq_ignore_ascii_case("louvain") {
9130            // The only supported named argument is `resolution` (γ). It
9131            // defaults to 1.0 (classic modularity) and must be a finite,
9132            // strictly positive number — a non-positive (or NaN/inf)
9133            // resolution has no sensible meaning.
9134            let resolution = louvain_resolution(named_args)?;
9135            let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
9136            let mut result =
9137                UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
9138            for (node_id, community_id) in assignment {
9139                let mut record = UnifiedRecord::new();
9140                record.set("node_id", Value::text(node_id));
9141                record.set("community_id", Value::Integer(community_id as i64));
9142                result.push(record);
9143            }
9144            return Ok(result);
9145        }
9146
9147        if name.eq_ignore_ascii_case("degree_centrality") {
9148            reject_named_args(name, named_args)?;
9149            let assignment = abstract_degree_centrality(&nodes, &edges);
9150            let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "degree".into()]);
9151            for (node_id, degree) in assignment {
9152                let mut record = UnifiedRecord::new();
9153                record.set("node_id", Value::text(node_id));
9154                record.set("degree", Value::Integer(degree as i64));
9155                result.push(record);
9156            }
9157            return Ok(result);
9158        }
9159
9160        if name.eq_ignore_ascii_case("shortest_path") {
9161            // Scalar named arguments: `src` and `dst` are required node ids,
9162            // `max_hops` is an optional non-negative edge-count cap. Node ids
9163            // in the graph store are integer entity ids rendered as strings, so
9164            // each id arg must be a non-negative whole number; reject anything
9165            // else (fractional, negative, NaN/inf) with a clear message.
9166            let mut src: Option<String> = None;
9167            let mut dst: Option<String> = None;
9168            let mut max_hops: Option<usize> = None;
9169            let as_node_id = |key: &str, value: f64| -> RedDBResult<String> {
9170                if !value.is_finite() || value < 0.0 || value.fract() != 0.0 {
9171                    return Err(RedDBError::Query(format!(
9172                        "table function 'shortest_path' argument '{key}' must be a non-negative integer node id, got {value}"
9173                    )));
9174                }
9175                Ok((value as i64).to_string())
9176            };
9177            for (key, value) in named_args {
9178                if key.eq_ignore_ascii_case("src") {
9179                    src = Some(as_node_id("src", *value)?);
9180                } else if key.eq_ignore_ascii_case("dst") {
9181                    dst = Some(as_node_id("dst", *value)?);
9182                } else if key.eq_ignore_ascii_case("max_hops") {
9183                    if !value.is_finite() || *value < 0.0 || value.fract() != 0.0 {
9184                        return Err(RedDBError::Query(format!(
9185                            "table function 'shortest_path' max_hops must be a non-negative integer, got {value}"
9186                        )));
9187                    }
9188                    max_hops = Some(*value as usize);
9189                } else {
9190                    return Err(RedDBError::Query(format!(
9191                        "table function 'shortest_path' has no named argument '{key}' (expected 'src', 'dst', 'max_hops')"
9192                    )));
9193                }
9194            }
9195            let src = src.ok_or_else(|| {
9196                RedDBError::Query(
9197                    "table function 'shortest_path' requires named argument 'src'".to_string(),
9198                )
9199            })?;
9200            let dst = dst.ok_or_else(|| {
9201                RedDBError::Query(
9202                    "table function 'shortest_path' requires named argument 'dst'".to_string(),
9203                )
9204            })?;
9205
9206            // Columns are always present; an unreachable pair (within the
9207            // optional `max_hops` budget) simply yields zero rows — never an
9208            // error. `hop` is the 0-based index from the source;
9209            // `cumulative_weight` is the running path weight (0 at the source,
9210            // the total at the destination). Edges are treated as undirected,
9211            // consistent with `components` / `louvain`.
9212            let mut result = UnifiedResult::with_columns(vec![
9213                "hop".into(),
9214                "node_id".into(),
9215                "cumulative_weight".into(),
9216            ]);
9217            if let Some(path) =
9218                graph_algorithms::shortest_path(&nodes, &edges, &src, &dst, max_hops)
9219            {
9220                for (hop, (node_id, cumulative_weight)) in path.into_iter().enumerate() {
9221                    let mut record = UnifiedRecord::new();
9222                    record.set("hop", Value::Integer(hop as i64));
9223                    record.set("node_id", Value::text(node_id));
9224                    record.set("cumulative_weight", Value::Float(cumulative_weight));
9225                    result.push(record);
9226                }
9227            }
9228            return Ok(result);
9229        }
9230        // ── Centrality family (issue #797): each returns rows `(node_id,
9231        // score)` over the abstract `(nodes, edges)` graph. Like the other
9232        // graph TVFs the graph is treated as undirected and scores are
9233        // deterministic; the inline-graph form shares this dispatch. ──
9234        if name.eq_ignore_ascii_case("betweenness") {
9235            reject_named_args(name, named_args)?;
9236            return Ok(Self::centrality_result(graph_algorithms::betweenness(
9237                &nodes, &edges,
9238            )));
9239        }
9240        if name.eq_ignore_ascii_case("eigenvector") {
9241            // Optional `max_iterations` (positive integer, default 100) and
9242            // `tolerance` (finite, strictly positive, default 1e-6).
9243            let mut max_iterations = 100_usize;
9244            let mut tolerance = 1e-6_f64;
9245            for (key, value) in named_args {
9246                if key.eq_ignore_ascii_case("max_iterations") {
9247                    max_iterations = parse_positive_iterations("eigenvector", value)?;
9248                } else if key.eq_ignore_ascii_case("tolerance") {
9249                    if !value.is_finite() || *value <= 0.0 {
9250                        return Err(RedDBError::Query(format!(
9251                            "table function 'eigenvector' tolerance must be > 0, got {value}"
9252                        )));
9253                    }
9254                    tolerance = *value;
9255                } else {
9256                    return Err(RedDBError::Query(format!(
9257                        "table function 'eigenvector' has no named argument '{key}' (expected 'max_iterations' or 'tolerance')"
9258                    )));
9259                }
9260            }
9261            return Ok(Self::centrality_result(graph_algorithms::eigenvector(
9262                &nodes,
9263                &edges,
9264                max_iterations,
9265                tolerance,
9266            )));
9267        }
9268        if name.eq_ignore_ascii_case("pagerank") {
9269            // Optional `damping` (in (0, 1), default 0.85) and `max_iterations`
9270            // (positive integer, default 100).
9271            let mut damping = 0.85_f64;
9272            let mut max_iterations = 100_usize;
9273            for (key, value) in named_args {
9274                if key.eq_ignore_ascii_case("damping") {
9275                    if !value.is_finite() || *value <= 0.0 || *value >= 1.0 {
9276                        return Err(RedDBError::Query(format!(
9277                            "table function 'pagerank' damping must be in (0, 1), got {value}"
9278                        )));
9279                    }
9280                    damping = *value;
9281                } else if key.eq_ignore_ascii_case("max_iterations") {
9282                    max_iterations = parse_positive_iterations("pagerank", value)?;
9283                } else {
9284                    return Err(RedDBError::Query(format!(
9285                        "table function 'pagerank' has no named argument '{key}' (expected 'damping' or 'max_iterations')"
9286                    )));
9287                }
9288            }
9289            return Ok(Self::centrality_result(graph_algorithms::pagerank(
9290                &nodes,
9291                &edges,
9292                damping,
9293                max_iterations,
9294            )));
9295        }
9296        Err(RedDBError::Query(format!("unknown table function: {name}")))
9297    }
9298
9299    /// `components(<graph_collection>)` — returns rows `(node_id, island_id)`.
9300    ///
9301    /// Materializes the active graph (nodes + weighted edges) read-only and
9302    /// runs the pure `graph_algorithms::connected_components`. Edges are
9303    /// treated as undirected; island ids are deterministic (ascending order of
9304    /// each component's smallest node).
9305    fn execute_components_tvf(
9306        &self,
9307        _collection: &str,
9308    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
9309        use crate::storage::engine::graph_algorithms;
9310        use crate::storage::query::unified::UnifiedResult;
9311        use crate::storage::schema::Value;
9312
9313        // Read-only materialization of the full active graph. The named
9314        // collection identifies the active graph scope; passing `None` for the
9315        // projection uses the full graph store (the same result
9316        // `active_graph_projection` yields when no projection is registered).
9317        // Materialization never mutates any store.
9318        let graph = super::graph_dsl::materialize_graph_with_projection(
9319            self.inner.db.store().as_ref(),
9320            None,
9321        )?;
9322
9323        // Materialize abstract inputs for the pure algorithm.
9324        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
9325        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
9326            .iter_all_edges()
9327            .into_iter()
9328            .map(|e| (e.source_id, e.target_id, e.weight))
9329            .collect();
9330
9331        let assignment = graph_algorithms::connected_components(&nodes, &edges);
9332
9333        // Project into a UnifiedResult with columns ["node_id", "island_id"].
9334        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
9335        for (node_id, island_id) in assignment {
9336            let mut record = UnifiedRecord::new();
9337            record.set("node_id", Value::text(node_id));
9338            record.set("island_id", Value::Integer(island_id as i64));
9339            result.push(record);
9340        }
9341        Ok(result)
9342    }
9343
9344    /// `louvain(<graph> [, resolution => <f64>])` — returns rows
9345    /// `(node_id, community_id)` (issue #796).
9346    ///
9347    /// Materializes the active graph (nodes + weighted edges) read-only and
9348    /// runs the pure, deterministic `graph_algorithms::louvain`. Edges are
9349    /// treated as undirected; community ids are assigned in ascending order of
9350    /// each community's smallest node, so identical input + resolution always
9351    /// yields identical rows. Like `components`, the v0 form runs over the
9352    /// whole graph store regardless of the collection argument value.
9353    fn execute_louvain_tvf(
9354        &self,
9355        _collection: &str,
9356        resolution: f64,
9357    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
9358        use crate::storage::engine::graph_algorithms;
9359        use crate::storage::query::unified::UnifiedResult;
9360        use crate::storage::schema::Value;
9361
9362        let graph = super::graph_dsl::materialize_graph_with_projection(
9363            self.inner.db.store().as_ref(),
9364            None,
9365        )?;
9366
9367        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
9368        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
9369            .iter_all_edges()
9370            .into_iter()
9371            .map(|e| (e.source_id, e.target_id, e.weight))
9372            .collect();
9373
9374        let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
9375
9376        // Project into a UnifiedResult with columns ["node_id", "community_id"].
9377        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
9378        for (node_id, community_id) in assignment {
9379            let mut record = UnifiedRecord::new();
9380            record.set("node_id", Value::text(node_id));
9381            record.set("community_id", Value::Integer(community_id as i64));
9382            result.push(record);
9383        }
9384        Ok(result)
9385    }
9386
9387    /// Project `(node_id, score)` centrality rows into a `UnifiedResult` with
9388    /// columns `["node_id", "score"]`; scores are `Value::Float`.
9389    fn centrality_result(
9390        rows: Vec<(String, f64)>,
9391    ) -> crate::storage::query::unified::UnifiedResult {
9392        use crate::storage::query::unified::UnifiedResult;
9393        use crate::storage::schema::Value;
9394        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "score".into()]);
9395        for (node_id, score) in rows {
9396            let mut record = UnifiedRecord::new();
9397            record.set("node_id", Value::text(node_id));
9398            record.set("score", Value::Float(score));
9399            result.push(record);
9400        }
9401        result
9402    }
9403
9404    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
9405    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
9406    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
9407        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
9408        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
9409        let q = query.trim();
9410        if !q.starts_with("SELECT") && !q.starts_with("select") {
9411            return None;
9412        }
9413
9414        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
9415        let where_pos = q
9416            .find("WHERE _entity_id")
9417            .or_else(|| q.find("where _entity_id"))?;
9418        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
9419        let after_eq = after_field.strip_prefix('=')?.trim_start();
9420
9421        // Parse the entity ID number
9422        let id_str = after_eq.trim();
9423        let entity_id: u64 = id_str.parse().ok()?;
9424
9425        // Extract table name: between "FROM " and " WHERE"
9426        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
9427        let table = q[from_pos..where_pos].trim();
9428        if table.is_empty()
9429            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
9430        {
9431            return None; // complex query, fall through
9432        }
9433        let table_name = table.split_whitespace().next()?;
9434
9435        // Direct entity lookup — skips SQL parse, plan cache, result
9436        // cache, view rewriter, RLS gate. Safe because the gating in
9437        // `execute_query` guarantees no scope override / no
9438        // transaction context is active. MVCC visibility is still
9439        // honoured against the current snapshot.
9440        let store = self.inner.db.store();
9441        let entity = store
9442            .get(
9443                table_name,
9444                crate::storage::unified::EntityId::new(entity_id),
9445            )
9446            .filter(entity_visible_under_current_snapshot)
9447            .filter(|entity| {
9448                self.inner
9449                    .db
9450                    .replica_allows_entity_at_read(table_name, entity)
9451            });
9452
9453        let count = if entity.is_some() { 1u64 } else { 0 };
9454
9455        // Materialize a record so downstream consumers that walk
9456        // `result.records` (embedded runtime API, decrypt pass, CLI)
9457        // see the row. Previously only `pre_serialized_json` was
9458        // filled, which caused those consumers to see zero rows and
9459        // skewed benchmarks.
9460        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
9461            .as_ref()
9462            .and_then(|e| runtime_table_record_from_entity(e.clone()))
9463            .into_iter()
9464            .collect();
9465
9466        let json = match entity {
9467            Some(ref e) => execute_runtime_serialize_single_entity(e),
9468            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
9469                .to_string(),
9470        };
9471
9472        Some(Ok(RuntimeQueryResult {
9473            query: query.to_string(),
9474            mode: crate::storage::query::modes::QueryMode::Sql,
9475            statement: "select",
9476            engine: "fast-entity-lookup",
9477            result: crate::storage::query::unified::UnifiedResult {
9478                columns: Vec::new(),
9479                records,
9480                stats: crate::storage::query::unified::QueryStats {
9481                    rows_scanned: count,
9482                    ..Default::default()
9483                },
9484                pre_serialized_json: Some(json),
9485            },
9486            affected_rows: 0,
9487            statement_type: "select",
9488            bookmark: None,
9489        }))
9490    }
9491
9492    fn result_cache_backend(&self) -> RuntimeResultCacheBackend {
9493        match self
9494            .config_string(RESULT_CACHE_BACKEND_KEY, RESULT_CACHE_DEFAULT_BACKEND)
9495            .as_str()
9496        {
9497            "blob_cache" => RuntimeResultCacheBackend::BlobCache,
9498            "shadow" => RuntimeResultCacheBackend::Shadow,
9499            _ => RuntimeResultCacheBackend::Legacy,
9500        }
9501    }
9502
9503    /// Result-cache kill-switch (issue #802). When `false`, reads and
9504    /// writes are short-circuited so every query recomputes — used for
9505    /// debugging and to bound staleness without a restart.
9506    fn result_cache_enabled(&self) -> bool {
9507        self.config_bool(RESULT_CACHE_ENABLED_KEY, true)
9508    }
9509
9510    /// Configurable per-entry TTL in seconds (issue #802), defaulting to
9511    /// the former `RESULT_CACHE_TTL_SECS` constant.
9512    fn result_cache_ttl_secs(&self) -> u64 {
9513        self.config_u64(RESULT_CACHE_TTL_KEY, RESULT_CACHE_TTL_SECS)
9514    }
9515
9516    /// Configurable LRU capacity in entries (issue #802), defaulting to
9517    /// the former `RESULT_CACHE_MAX_ENTRIES` constant. Zero is treated as
9518    /// "no cached entries retained".
9519    fn result_cache_capacity(&self) -> usize {
9520        self.config_u64(RESULT_CACHE_CAPACITY_KEY, RESULT_CACHE_MAX_ENTRIES as u64) as usize
9521    }
9522
9523    /// Snapshot of the result-cache observability counters (issue #802):
9524    /// `(hits, misses, evictions)`. Surfaced under `red.metrics`.
9525    pub fn result_cache_metrics(&self) -> (u64, u64, u64) {
9526        use std::sync::atomic::Ordering::Relaxed;
9527        (
9528            self.inner.result_cache_hits.load(Relaxed),
9529            self.inner.result_cache_misses.load(Relaxed),
9530            self.inner.result_cache_evictions.load(Relaxed),
9531        )
9532    }
9533
9534    fn record_result_cache_evictions(&self, evicted: u64) {
9535        if evicted > 0 {
9536            self.inner
9537                .result_cache_evictions
9538                .fetch_add(evicted, std::sync::atomic::Ordering::Relaxed);
9539        }
9540    }
9541
9542    pub(super) fn get_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
9543        if !self.result_cache_enabled() {
9544            return None;
9545        }
9546        let hit = self.get_result_cache_entry_inner(key);
9547        let counter = if hit.is_some() {
9548            &self.inner.result_cache_hits
9549        } else {
9550            &self.inner.result_cache_misses
9551        };
9552        counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
9553        hit
9554    }
9555
9556    fn get_result_cache_entry_inner(&self, key: &str) -> Option<RuntimeQueryResult> {
9557        match self.result_cache_backend() {
9558            RuntimeResultCacheBackend::Legacy => self.get_legacy_result_cache_entry(key),
9559            RuntimeResultCacheBackend::BlobCache => self.get_blob_result_cache_entry(key),
9560            RuntimeResultCacheBackend::Shadow => {
9561                let legacy = self.get_legacy_result_cache_entry(key);
9562                let blob = self.get_blob_result_cache_entry(key);
9563                if let (Some(ref legacy), Some(ref blob)) = (&legacy, &blob) {
9564                    if result_cache_fingerprint(legacy) != result_cache_fingerprint(blob) {
9565                        self.inner
9566                            .result_cache_shadow_divergences
9567                            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
9568                        tracing::warn!(
9569                            key,
9570                            metric = crate::runtime::METRIC_CACHE_SHADOW_DIVERGENCE_TOTAL,
9571                            "result cache shadow backend diverged from legacy"
9572                        );
9573                    }
9574                }
9575                legacy
9576            }
9577        }
9578    }
9579
9580    fn get_legacy_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
9581        let ttl = self.result_cache_ttl_secs();
9582        let cache = self.inner.result_cache.read();
9583        cache.0.get(key).and_then(|entry| {
9584            if entry.cached_at.elapsed().as_secs() < ttl {
9585                Some(entry.result.clone())
9586            } else {
9587                None
9588            }
9589        })
9590    }
9591
9592    fn get_blob_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
9593        let hit = self
9594            .inner
9595            .result_blob_cache
9596            .get(RESULT_CACHE_BLOB_NAMESPACE, key)?;
9597        {
9598            let cache = self.inner.result_blob_entries.read();
9599            if let Some(entry) = cache.0.get(key) {
9600                return Some(entry.result.clone());
9601            }
9602        }
9603
9604        let (result, scopes) = decode_result_cache_payload(hit.value())?;
9605        let mut cache = self.inner.result_blob_entries.write();
9606        let (ref mut map, ref mut order) = *cache;
9607        if !map.contains_key(key) {
9608            order.push_back(key.to_string());
9609        }
9610        map.insert(
9611            key.to_string(),
9612            RuntimeResultCacheEntry {
9613                result: result.clone(),
9614                cached_at: std::time::Instant::now(),
9615                scopes,
9616            },
9617        );
9618        let evicted = trim_result_cache(map, order, self.result_cache_capacity());
9619        drop(cache);
9620        self.record_result_cache_evictions(evicted);
9621        Some(result)
9622    }
9623
9624    pub(super) fn put_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
9625        if !self.result_cache_enabled() {
9626            return;
9627        }
9628        match self.result_cache_backend() {
9629            RuntimeResultCacheBackend::Legacy => self.put_legacy_result_cache_entry(key, entry),
9630            RuntimeResultCacheBackend::BlobCache => self.put_blob_result_cache_entry(key, entry),
9631            RuntimeResultCacheBackend::Shadow => {
9632                self.put_legacy_result_cache_entry(key, entry.clone());
9633                self.put_blob_result_cache_entry(key, entry);
9634            }
9635        }
9636    }
9637
9638    fn put_legacy_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
9639        let capacity = self.result_cache_capacity();
9640        let mut cache = self.inner.result_cache.write();
9641        let (ref mut map, ref mut order) = *cache;
9642        if !map.contains_key(key) {
9643            order.push_back(key.to_string());
9644        }
9645        map.insert(key.to_string(), entry);
9646        let evicted = trim_result_cache(map, order, capacity);
9647        drop(cache);
9648        self.record_result_cache_evictions(evicted);
9649    }
9650
9651    fn put_blob_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
9652        let policy = crate::storage::cache::BlobCachePolicy::default()
9653            .ttl_ms(self.result_cache_ttl_secs() * 1000)
9654            .priority(200);
9655        let dependencies = entry.scopes.iter().cloned().collect::<Vec<_>>();
9656        let bytes = encode_result_cache_payload(&entry)
9657            .unwrap_or_else(|| result_cache_fingerprint(&entry.result).into_bytes());
9658        let put = crate::storage::cache::BlobCachePut::new(bytes)
9659            .with_dependencies(dependencies)
9660            .with_policy(policy);
9661        if self
9662            .inner
9663            .result_blob_cache
9664            .put(RESULT_CACHE_BLOB_NAMESPACE, key, put)
9665            .is_err()
9666        {
9667            return;
9668        }
9669
9670        let capacity = self.result_cache_capacity();
9671        let mut cache = self.inner.result_blob_entries.write();
9672        let (ref mut map, ref mut order) = *cache;
9673        if !map.contains_key(key) {
9674            order.push_back(key.to_string());
9675        }
9676        map.insert(key.to_string(), entry);
9677        let evicted = trim_result_cache(map, order, capacity);
9678        drop(cache);
9679        self.record_result_cache_evictions(evicted);
9680    }
9681
9682    pub fn result_cache_shadow_divergences(&self) -> u64 {
9683        self.inner
9684            .result_cache_shadow_divergences
9685            .load(std::sync::atomic::Ordering::Relaxed)
9686    }
9687
9688    /// Invalidate the result cache (call after any write operation).
9689    /// Full clear — use for DDL (DROP TABLE, schema changes) or when table is unknown.
9690    pub fn invalidate_result_cache(&self) {
9691        let mut cache = self.inner.result_cache.write();
9692        cache.0.clear();
9693        cache.1.clear();
9694        let mut blob_entries = self.inner.result_blob_entries.write();
9695        blob_entries.0.clear();
9696        blob_entries.1.clear();
9697        self.inner
9698            .result_blob_cache
9699            .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
9700        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
9701        ask_entries.0.clear();
9702        ask_entries.1.clear();
9703        self.inner
9704            .result_blob_cache
9705            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
9706    }
9707
9708    /// Invalidate only result cache entries that declared a dependency on `table`.
9709    /// Cheaper than a full clear: unrelated tables keep their cached results.
9710    pub(crate) fn invalidate_result_cache_for_table(&self, table: &str) {
9711        // Hot-path probe both backends before taking write locks. The blob
9712        // backend is node-local, same as the legacy result cache.
9713        let legacy_has_match = {
9714            let cache = self.inner.result_cache.read();
9715            let (ref map, _) = *cache;
9716            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
9717        };
9718        let blob_has_match = {
9719            let cache = self.inner.result_blob_entries.read();
9720            let (ref map, _) = *cache;
9721            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
9722        };
9723        if legacy_has_match {
9724            let mut cache = self.inner.result_cache.write();
9725            let (ref mut map, ref mut order) = *cache;
9726            map.retain(|_, entry| !entry.scopes.contains(table));
9727            order.retain(|key| map.contains_key(key));
9728        }
9729
9730        if matches!(
9731            self.result_cache_backend(),
9732            RuntimeResultCacheBackend::BlobCache | RuntimeResultCacheBackend::Shadow
9733        ) {
9734            let mut blob_entries = self.inner.result_blob_entries.write();
9735            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
9736            blob_map.clear();
9737            blob_order.clear();
9738            self.inner
9739                .result_blob_cache
9740                .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
9741        } else if blob_has_match {
9742            let mut blob_entries = self.inner.result_blob_entries.write();
9743            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
9744            blob_map.retain(|_, entry| !entry.scopes.contains(table));
9745            blob_order.retain(|key| blob_map.contains_key(key));
9746        }
9747        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
9748        ask_entries.0.clear();
9749        ask_entries.1.clear();
9750        self.inner
9751            .result_blob_cache
9752            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
9753    }
9754
9755    pub(crate) fn invalidate_plan_cache(&self) {
9756        self.inner.query_cache.write().clear();
9757        self.inner
9758            .ddl_epoch
9759            .fetch_add(1, std::sync::atomic::Ordering::Release);
9760    }
9761
9762    /// Read the monotonic DDL epoch counter. Bumped by every
9763    /// `invalidate_plan_cache` call so prepared-statement holders can
9764    /// detect schema drift between PREPARE and EXECUTE.
9765    pub fn ddl_epoch(&self) -> u64 {
9766        self.inner
9767            .ddl_epoch
9768            .load(std::sync::atomic::Ordering::Acquire)
9769    }
9770
9771    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
9772        let store = self.inner.db.store();
9773        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
9774        self.invalidate_plan_cache();
9775    }
9776
9777    /// Replay `tenant_tables.*.column` keys from red_config at boot so
9778    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
9779    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
9780    /// collection, picks the keys matching the tenant-marker shape,
9781    /// and calls `register_tenant_table` for each.
9782    ///
9783    /// Safe no-op when `red_config` doesn't exist (first boot on a
9784    /// fresh datadir).
9785    pub(crate) fn rehydrate_tenant_tables(&self) {
9786        let store = self.inner.db.store();
9787        let Some(manager) = store.get_collection("red_config") else {
9788            return;
9789        };
9790        // Replay in insertion order (SegmentManager iteration). Multiple
9791        // toggles on the same table leave several rows behind — the
9792        // last one processed wins because each register/unregister
9793        // call overwrites the in-memory state.
9794        for entity in manager.query_all(|_| true) {
9795            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
9796                continue;
9797            };
9798            let Some(named) = &row.named else { continue };
9799            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
9800                continue;
9801            };
9802            // Shape: tenant_tables.{table}.column
9803            let Some(rest) = key.strip_prefix("tenant_tables.") else {
9804                continue;
9805            };
9806            let Some((table, suffix)) = rest.rsplit_once('.') else {
9807                // Issue #205 — a `tenant_tables.*` row that doesn't
9808                // split cleanly is a schema-shape regression: the
9809                // metadata writer must always emit the `.column`
9810                // suffix, so reaching this branch means an upgrade
9811                // with incompatible state or external tampering.
9812                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
9813                    collection: "red_config".to_string(),
9814                    detail: format!("malformed tenant_tables key: {key}"),
9815                }
9816                .emit_global();
9817                continue;
9818            };
9819            if suffix != "column" {
9820                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
9821                    collection: "red_config".to_string(),
9822                    detail: format!("unexpected tenant_tables suffix: {key}"),
9823                }
9824                .emit_global();
9825                continue;
9826            }
9827            match named.get("value") {
9828                Some(crate::storage::schema::Value::Text(column)) => {
9829                    self.register_tenant_table(table, column);
9830                }
9831                // Null / missing value = DISABLE TENANCY marker.
9832                Some(crate::storage::schema::Value::Null) | None => {
9833                    self.unregister_tenant_table(table);
9834                }
9835                _ => {}
9836            }
9837        }
9838    }
9839
9840    /// Replay every persisted `MaterializedViewDescriptor` from the
9841    /// `red_materialized_view_defs` system collection (issue #593
9842    /// slice 9a). For each descriptor, re-parse the original SQL,
9843    /// extract the `QueryExpr::CreateView` it produced, and populate
9844    /// the in-memory registries (`inner.views` and
9845    /// `inner.materialized_views`) directly — no write paths run, so
9846    /// rehydrate does not re-persist what it just read.
9847    ///
9848    /// Malformed rows (missing `name`/`source_sql`, parse errors) are
9849    /// skipped with a `SchemaCorruption` operator event so a single
9850    /// bad entry does not block startup.
9851    pub(crate) fn rehydrate_materialized_view_descriptors(&self) {
9852        let store = self.inner.db.store();
9853        let descriptors = crate::runtime::continuous_materialized_view::load_all(store.as_ref());
9854        for descriptor in descriptors {
9855            let parsed = match crate::storage::query::parser::parse(&descriptor.source_sql) {
9856                Ok(qc) => qc,
9857                Err(err) => {
9858                    crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
9859                        collection:
9860                            crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
9861                                .to_string(),
9862                        detail: format!(
9863                            "failed to re-parse materialized-view source for {}: {err}",
9864                            descriptor.name
9865                        ),
9866                    }
9867                    .emit_global();
9868                    continue;
9869                }
9870            };
9871            let crate::storage::query::ast::QueryExpr::CreateView(create) = parsed.query else {
9872                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
9873                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
9874                        .to_string(),
9875                    detail: format!(
9876                        "materialized-view source for {} did not re-parse as CREATE VIEW",
9877                        descriptor.name
9878                    ),
9879                }
9880                .emit_global();
9881                continue;
9882            };
9883            // Populate in-memory view registry.
9884            let view_name = create.name.clone();
9885            self.inner
9886                .views
9887                .write()
9888                .insert(view_name.clone(), Arc::new(create));
9889            // Materialized cache slot (data empty until next REFRESH).
9890            use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
9891            let refresh = match descriptor.refresh_every_ms {
9892                Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
9893                None => RefreshPolicy::Manual,
9894            };
9895            let def = MaterializedViewDef {
9896                name: view_name.clone(),
9897                query: format!("<parsed view {}>", view_name),
9898                dependencies: descriptor.source_collections.clone(),
9899                refresh,
9900                retention_duration_ms: descriptor.retention_duration_ms,
9901            };
9902            self.inner.materialized_views.write().register(def);
9903        }
9904        // A rehydrated view shape may differ from any plans the cache
9905        // bootstrapped before this method ran — flush to be safe.
9906        self.invalidate_plan_cache();
9907    }
9908
9909    pub(crate) fn rehydrate_declared_column_schemas(&self) {
9910        let store = self.inner.db.store();
9911        for contract in self.inner.db.collection_contracts() {
9912            let columns: Vec<String> = contract
9913                .declared_columns
9914                .iter()
9915                .map(|column| column.name.clone())
9916                .collect();
9917            let Some(manager) = store.get_collection(&contract.name) else {
9918                continue;
9919            };
9920            manager.set_column_schema_if_empty(columns);
9921        }
9922    }
9923
9924    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
9925    /// in-memory column mapping, the implicit RLS policy, and enables
9926    /// row-level security on the table. Idempotent — re-registering
9927    /// the same `(table, column)` replaces the prior auto-policy.
9928    pub fn register_tenant_table(&self, table: &str, column: &str) {
9929        use crate::storage::query::ast::{
9930            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
9931        };
9932        self.inner
9933            .tenant_tables
9934            .write()
9935            .insert(table.to_string(), column.to_string());
9936
9937        // Build the policy: col = CURRENT_TENANT()
9938        // Uses CompareExpr so the comparison happens at runtime against
9939        // the thread-local tenant value read by the CURRENT_TENANT
9940        // scalar. Spans are synthetic — there's no source location for
9941        // an auto-generated policy.
9942        let lhs = Expr::Column {
9943            field: FieldRef::TableColumn {
9944                table: table.to_string(),
9945                column: column.to_string(),
9946            },
9947            span: Span::synthetic(),
9948        };
9949        let rhs = Expr::FunctionCall {
9950            name: "CURRENT_TENANT".to_string(),
9951            args: Vec::new(),
9952            span: Span::synthetic(),
9953        };
9954        let policy_filter = Filter::CompareExpr {
9955            lhs,
9956            op: CompareOp::Eq,
9957            rhs,
9958        };
9959
9960        let policy = CreatePolicyQuery {
9961            name: "__tenant_iso".to_string(),
9962            table: table.to_string(),
9963            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
9964            role: None,   // None = every role
9965            using: Box::new(policy_filter),
9966            // Auto-tenancy defaults to Table targets. Collections of
9967            // other kinds (graph / vector / queue / timeseries) that
9968            // opt in via `ALTER ... ENABLE TENANCY` should use the
9969            // matching kind — but for now we keep the auto-policy
9970            // kind-agnostic so the evaluator can apply it to any
9971            // entity living in the collection.
9972            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
9973        };
9974
9975        // Replace any prior auto-policy for this table (column rename).
9976        self.inner.rls_policies.write().insert(
9977            (table.to_string(), "__tenant_iso".to_string()),
9978            Arc::new(policy),
9979        );
9980        self.inner
9981            .rls_enabled_tables
9982            .write()
9983            .insert(table.to_string());
9984
9985        // Auto-build a hash index on the tenant column. Every read/write
9986        // against a tenant-scoped table carries an implicit
9987        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
9988        // index on that column is on the hot path of every query. Without
9989        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
9990        self.ensure_tenant_index(table, column);
9991    }
9992
9993    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
9994    /// Skipped when:
9995    ///   * the column is dotted (nested path — flat secondary indices
9996    ///     don't cover those today; RLS still works via the policy)
9997    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
9998    ///   * the user already registered an index whose first column matches
9999    ///     (avoids redundant duplicates of a user-defined composite)
10000    fn ensure_tenant_index(&self, table: &str, column: &str) {
10001        if column.contains('.') {
10002            return;
10003        }
10004        let index_name = format!("__tenant_idx_{table}");
10005        let registry = self.inner.index_store.list_indices(table);
10006        if registry.iter().any(|idx| idx.name == index_name) {
10007            return;
10008        }
10009        if registry
10010            .iter()
10011            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
10012        {
10013            return;
10014        }
10015
10016        let store = self.inner.db.store();
10017        let Some(manager) = store.get_collection(table) else {
10018            return;
10019        };
10020        let entities = manager.query_all(|_| true);
10021        let entity_fields: Vec<(
10022            crate::storage::unified::EntityId,
10023            Vec<(String, crate::storage::schema::Value)>,
10024        )> = entities
10025            .iter()
10026            .map(|e| {
10027                let fields = match &e.data {
10028                    crate::storage::EntityData::Row(row) => {
10029                        if let Some(ref named) = row.named {
10030                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
10031                        } else if let Some(ref schema) = row.schema {
10032                            schema
10033                                .iter()
10034                                .zip(row.columns.iter())
10035                                .map(|(k, v)| (k.clone(), v.clone()))
10036                                .collect()
10037                        } else {
10038                            Vec::new()
10039                        }
10040                    }
10041                    crate::storage::EntityData::Node(node) => node
10042                        .properties
10043                        .iter()
10044                        .map(|(k, v)| (k.clone(), v.clone()))
10045                        .collect(),
10046                    _ => Vec::new(),
10047                };
10048                (e.id, fields)
10049            })
10050            .collect();
10051
10052        let columns = vec![column.to_string()];
10053        if self
10054            .inner
10055            .index_store
10056            .create_index(
10057                &index_name,
10058                table,
10059                &columns,
10060                super::index_store::IndexMethodKind::Hash,
10061                false,
10062                &entity_fields,
10063            )
10064            .is_err()
10065        {
10066            return;
10067        }
10068        self.inner
10069            .index_store
10070            .register(super::index_store::RegisteredIndex {
10071                name: index_name,
10072                collection: table.to_string(),
10073                columns,
10074                method: super::index_store::IndexMethodKind::Hash,
10075                unique: false,
10076            });
10077        self.invalidate_plan_cache();
10078    }
10079
10080    /// Drop the auto-generated tenant index, if one exists. Called from
10081    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
10082    fn drop_tenant_index(&self, table: &str) {
10083        let index_name = format!("__tenant_idx_{table}");
10084        self.inner.index_store.drop_index(&index_name, table);
10085    }
10086
10087    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
10088    /// Used by the INSERT auto-fill path to know which column to
10089    /// populate with `current_tenant()` when the user didn't name it.
10090    pub fn tenant_column(&self, table: &str) -> Option<String> {
10091        self.inner.tenant_tables.read().get(table).cloned()
10092    }
10093
10094    /// Remove a table's tenant registration (Phase 2.5.4). Called by
10095    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
10096    /// but leaves any user-installed explicit policies intact.
10097    pub fn unregister_tenant_table(&self, table: &str) {
10098        self.inner.tenant_tables.write().remove(table);
10099        self.inner
10100            .rls_policies
10101            .write()
10102            .remove(&(table.to_string(), "__tenant_iso".to_string()));
10103        self.drop_tenant_index(table);
10104        // Only clear RLS enablement if no other policies remain.
10105        let has_other_policies = self
10106            .inner
10107            .rls_policies
10108            .read()
10109            .keys()
10110            .any(|(t, _)| t == table);
10111        if !has_other_policies {
10112            self.inner.rls_enabled_tables.write().remove(table);
10113        }
10114    }
10115
10116    /// Record that the running transaction has marked `id` in `collection`
10117    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
10118    /// xid that was written into `xmax` — either the parent txn xid or
10119    /// the innermost savepoint sub-xid. Savepoint rollback filters by
10120    /// this xid to revive only its own tombstones.
10121    pub(crate) fn record_pending_tombstone(
10122        &self,
10123        conn_id: u64,
10124        collection: &str,
10125        id: crate::storage::unified::entity::EntityId,
10126        stamper_xid: crate::storage::transaction::snapshot::Xid,
10127        previous_xmax: crate::storage::transaction::snapshot::Xid,
10128    ) {
10129        self.inner
10130            .pending_tombstones
10131            .write()
10132            .entry(conn_id)
10133            .or_default()
10134            .push((collection.to_string(), id, stamper_xid, previous_xmax));
10135    }
10136
10137    pub(crate) fn record_pending_versioned_update(
10138        &self,
10139        conn_id: u64,
10140        collection: &str,
10141        old_id: crate::storage::unified::entity::EntityId,
10142        new_id: crate::storage::unified::entity::EntityId,
10143        stamper_xid: crate::storage::transaction::snapshot::Xid,
10144        previous_xmax: crate::storage::transaction::snapshot::Xid,
10145    ) {
10146        self.inner
10147            .pending_versioned_updates
10148            .write()
10149            .entry(conn_id)
10150            .or_default()
10151            .push((
10152                collection.to_string(),
10153                old_id,
10154                new_id,
10155                stamper_xid,
10156                previous_xmax,
10157            ));
10158    }
10159
10160    fn with_deferred_store_wal_if_transaction<T>(
10161        &self,
10162        f: impl FnOnce() -> RedDBResult<T>,
10163    ) -> RedDBResult<T> {
10164        let conn_id = current_connection_id();
10165        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
10166            return f();
10167        }
10168
10169        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
10170        let result = f();
10171        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
10172        match result {
10173            Ok(value) => {
10174                self.record_pending_store_wal_actions(conn_id, captured);
10175                Ok(value)
10176            }
10177            Err(err) => Err(err),
10178        }
10179    }
10180
10181    fn with_deferred_store_wal_for_dml<T>(
10182        &self,
10183        capture_autocommit_events: bool,
10184        f: impl FnOnce() -> RedDBResult<T>,
10185    ) -> RedDBResult<T> {
10186        let conn_id = current_connection_id();
10187        if self.inner.tx_contexts.read().contains_key(&conn_id) {
10188            return self.with_deferred_store_wal_if_transaction(f);
10189        }
10190        if !capture_autocommit_events {
10191            return f();
10192        }
10193
10194        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
10195        let result = f();
10196        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
10197        self.inner
10198            .db
10199            .store()
10200            .append_deferred_store_wal_actions(captured)
10201            .map_err(|err| RedDBError::Internal(err.to_string()))?;
10202        result
10203    }
10204
10205    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
10206        !query.suppress_events
10207            && self.collection_has_event_subscriptions_for_operation(
10208                &query.table,
10209                crate::catalog::SubscriptionOperation::Insert,
10210            )
10211    }
10212
10213    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
10214        !query.suppress_events
10215            && self.collection_has_event_subscriptions_for_operation(
10216                &query.table,
10217                crate::catalog::SubscriptionOperation::Update,
10218            )
10219    }
10220
10221    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
10222        !query.suppress_events
10223            && self.collection_has_event_subscriptions_for_operation(
10224                &query.table,
10225                crate::catalog::SubscriptionOperation::Delete,
10226            )
10227    }
10228
10229    fn collection_has_event_subscriptions_for_operation(
10230        &self,
10231        collection: &str,
10232        operation: crate::catalog::SubscriptionOperation,
10233    ) -> bool {
10234        let Some(contract) = self.db().collection_contract_arc(collection) else {
10235            return false;
10236        };
10237        contract.subscriptions.iter().any(|subscription| {
10238            subscription.enabled
10239                && (subscription.ops_filter.is_empty()
10240                    || subscription.ops_filter.contains(&operation))
10241        })
10242    }
10243
10244    fn record_pending_store_wal_actions(
10245        &self,
10246        conn_id: u64,
10247        actions: crate::storage::unified::DeferredStoreWalActions,
10248    ) {
10249        if actions.is_empty() {
10250            return;
10251        }
10252        let mut guard = self.inner.pending_store_wal_actions.write();
10253        guard.entry(conn_id).or_default().extend(actions);
10254    }
10255
10256    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
10257        let Some(actions) = self
10258            .inner
10259            .pending_store_wal_actions
10260            .write()
10261            .remove(&conn_id)
10262        else {
10263            return Ok(());
10264        };
10265        self.inner
10266            .db
10267            .store()
10268            .append_deferred_store_wal_actions(actions)
10269            .map_err(|err| RedDBError::Internal(err.to_string()))
10270    }
10271
10272    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
10273        self.inner
10274            .pending_store_wal_actions
10275            .write()
10276            .remove(&conn_id);
10277    }
10278
10279    fn xid_conflicts_with_snapshot(
10280        &self,
10281        xid: crate::storage::transaction::snapshot::Xid,
10282        snapshot: &crate::storage::transaction::snapshot::Snapshot,
10283        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
10284    ) -> bool {
10285        xid != 0
10286            && !own_xids.contains(&xid)
10287            && !self.inner.snapshot_manager.is_aborted(xid)
10288            && !self.inner.snapshot_manager.is_active(xid)
10289            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
10290    }
10291
10292    fn conflict_error(
10293        collection: &str,
10294        logical_id: crate::storage::unified::entity::EntityId,
10295        xid: crate::storage::transaction::snapshot::Xid,
10296    ) -> RedDBError {
10297        RedDBError::Query(format!(
10298            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
10299            logical_id.raw()
10300        ))
10301    }
10302
10303    fn check_logical_row_conflict(
10304        &self,
10305        collection: &str,
10306        logical_id: crate::storage::unified::entity::EntityId,
10307        excluded_ids: &[crate::storage::unified::entity::EntityId],
10308        snapshot: &crate::storage::transaction::snapshot::Snapshot,
10309        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
10310    ) -> RedDBResult<()> {
10311        let store = self.inner.db.store();
10312        let Some(manager) = store.get_collection(collection) else {
10313            return Ok(());
10314        };
10315
10316        for candidate in manager.query_all(|_| true) {
10317            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
10318                continue;
10319            }
10320            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
10321                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
10322            }
10323            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
10324                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
10325            }
10326        }
10327        Ok(())
10328    }
10329
10330    pub(crate) fn check_table_row_write_conflicts(
10331        &self,
10332        conn_id: u64,
10333        snapshot: &crate::storage::transaction::snapshot::Snapshot,
10334        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
10335    ) -> RedDBResult<()> {
10336        let versioned_updates = self
10337            .inner
10338            .pending_versioned_updates
10339            .read()
10340            .get(&conn_id)
10341            .cloned()
10342            .unwrap_or_default();
10343        let tombstones = self
10344            .inner
10345            .pending_tombstones
10346            .read()
10347            .get(&conn_id)
10348            .cloned()
10349            .unwrap_or_default();
10350
10351        let store = self.inner.db.store();
10352        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
10353            let Some(manager) = store.get_collection(&collection) else {
10354                continue;
10355            };
10356            let Some(old) = manager.get(old_id) else {
10357                continue;
10358            };
10359            let logical_id = old.logical_id();
10360            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
10361                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
10362            }
10363            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
10364                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
10365            }
10366            self.check_logical_row_conflict(
10367                &collection,
10368                logical_id,
10369                &[old_id, new_id],
10370                snapshot,
10371                own_xids,
10372            )?;
10373        }
10374
10375        for (collection, id, xid, previous_xmax) in tombstones {
10376            let Some(manager) = store.get_collection(&collection) else {
10377                continue;
10378            };
10379            let Some(entity) = manager.get(id) else {
10380                continue;
10381            };
10382            let logical_id = entity.logical_id();
10383            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
10384                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
10385            }
10386            if entity.xmax != xid
10387                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
10388            {
10389                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
10390            }
10391            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
10392        }
10393
10394        Ok(())
10395    }
10396
10397    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
10398        let versioned_updates = self
10399            .inner
10400            .pending_versioned_updates
10401            .read()
10402            .get(&conn_id)
10403            .cloned()
10404            .unwrap_or_default();
10405        let tombstones = self
10406            .inner
10407            .pending_tombstones
10408            .read()
10409            .get(&conn_id)
10410            .cloned()
10411            .unwrap_or_default();
10412
10413        let store = self.inner.db.store();
10414        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
10415            if let Some(manager) = store.get_collection(&collection) {
10416                if let Some(mut entity) = manager.get(old_id) {
10417                    entity.set_xmax(xid);
10418                    let _ = manager.update(entity);
10419                }
10420            }
10421        }
10422        for (collection, id, xid, _previous_xmax) in tombstones {
10423            if let Some(manager) = store.get_collection(&collection) {
10424                if let Some(mut entity) = manager.get(id) {
10425                    entity.set_xmax(xid);
10426                    let _ = manager.update(entity);
10427                }
10428            }
10429        }
10430    }
10431
10432    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
10433        self.inner
10434            .pending_versioned_updates
10435            .write()
10436            .remove(&conn_id);
10437    }
10438
10439    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
10440        let Some(pending) = self
10441            .inner
10442            .pending_versioned_updates
10443            .write()
10444            .remove(&conn_id)
10445        else {
10446            return;
10447        };
10448
10449        let store = self.inner.db.store();
10450        for (collection, old_id, new_id, xid, previous_xmax) in pending {
10451            if let Some(manager) = store.get_collection(&collection) {
10452                if let Some(mut old) = manager.get(old_id) {
10453                    if old.xmax == xid {
10454                        old.set_xmax(previous_xmax);
10455                        let _ = manager.update(old);
10456                    }
10457                }
10458            }
10459            let _ = store.delete_batch(&collection, &[new_id]);
10460        }
10461    }
10462
10463    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
10464        let mut guard = self.inner.pending_versioned_updates.write();
10465        let Some(pending) = guard.get_mut(&conn_id) else {
10466            return 0;
10467        };
10468
10469        let store = self.inner.db.store();
10470        let mut reverted = 0usize;
10471        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
10472            if *xid < stamper_xid {
10473                return true;
10474            }
10475            if let Some(manager) = store.get_collection(collection) {
10476                if let Some(mut old) = manager.get(*old_id) {
10477                    if old.xmax == *xid {
10478                        old.set_xmax(*previous_xmax);
10479                        let _ = manager.update(old);
10480                    }
10481                }
10482            }
10483            let _ = store.delete_batch(collection, &[*new_id]);
10484            reverted += 1;
10485            false
10486        });
10487        if pending.is_empty() {
10488            guard.remove(&conn_id);
10489        }
10490        reverted
10491    }
10492
10493    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
10494    /// delete marker; commit only drops the rollback journal and emits
10495    /// side effects. Physical reclamation is left for VACUUM so old
10496    /// snapshots can still resolve the pre-delete row version.
10497    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
10498        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
10499            return;
10500        };
10501        if pending.is_empty() {
10502            return;
10503        }
10504
10505        let store = self.inner.db.store();
10506        for (collection, id, _xid, _previous_xmax) in pending {
10507            store.context_index().remove_entity(id);
10508            self.cdc_emit(
10509                crate::replication::cdc::ChangeOperation::Delete,
10510                &collection,
10511                id.raw(),
10512                "entity",
10513            );
10514        }
10515    }
10516
10517    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
10518    /// become visible again to future snapshots. Best-effort: a row
10519    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
10520    /// never reclaims tuples whose xmax is still referenced by any
10521    /// active snapshot, so this case is only reachable via external
10522    /// storage corruption.
10523    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
10524        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
10525            return;
10526        };
10527
10528        let store = self.inner.db.store();
10529        for (collection, id, xid, previous_xmax) in pending {
10530            let Some(manager) = store.get_collection(&collection) else {
10531                continue;
10532            };
10533            if let Some(mut entity) = manager.get(id) {
10534                if entity.xmax == xid {
10535                    entity.set_xmax(previous_xmax);
10536                    let _ = manager.update(entity);
10537                }
10538            }
10539        }
10540    }
10541
10542    /// Slice C of PRD #718 — accessor for the local wait registry.
10543    pub fn queue_wait_registry(
10544        &self,
10545    ) -> std::sync::Arc<crate::runtime::queue_wait_registry::QueueWaitRegistry> {
10546        self.inner.queue_wait_registry.clone()
10547    }
10548
10549    /// Buffer a `(scope, queue)` wake on the current connection so it
10550    /// fires post-COMMIT, or notify immediately if no transaction is
10551    /// open (autocommit path). The wait registry only ever observes
10552    /// notifies for committed work — rollback drops the buffer.
10553    pub(crate) fn record_queue_wake(&self, scope: &str, queue: &str) {
10554        if self.current_xid().is_some() {
10555            let conn_id = current_connection_id();
10556            self.inner
10557                .pending_queue_wakes
10558                .write()
10559                .entry(conn_id)
10560                .or_default()
10561                .push((scope.to_string(), queue.to_string()));
10562            return;
10563        }
10564        self.inner.queue_wait_registry.notify(scope, queue);
10565    }
10566
10567    pub(crate) fn finalize_pending_queue_wakes(&self, conn_id: u64) {
10568        let Some(pending) = self.inner.pending_queue_wakes.write().remove(&conn_id) else {
10569            return;
10570        };
10571        for (scope, queue) in pending {
10572            self.inner.queue_wait_registry.notify(&scope, &queue);
10573        }
10574    }
10575
10576    pub(crate) fn discard_pending_queue_wakes(&self, conn_id: u64) {
10577        self.inner.pending_queue_wakes.write().remove(&conn_id);
10578    }
10579
10580    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
10581        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
10582            return;
10583        };
10584        for event in pending {
10585            self.cdc_emit_kv(
10586                event.op,
10587                &event.collection,
10588                &event.key,
10589                0,
10590                event.before,
10591                event.after,
10592            );
10593        }
10594    }
10595
10596    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
10597        self.inner.pending_kv_watch_events.write().remove(&conn_id);
10598    }
10599
10600    /// Materialise the entire graph store while applying MVCC visibility
10601    /// AND per-collection RLS to each candidate node and edge. Mirrors
10602    /// `materialize_graph` but routes every entity through the same
10603    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
10604    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
10605    /// edges). Returns the filtered `GraphStore` plus the
10606    /// `node_id → properties` map the executor needs for `RETURN n.*`
10607    /// projections.
10608    fn materialize_graph_with_rls(
10609        &self,
10610    ) -> RedDBResult<(
10611        crate::storage::engine::GraphStore,
10612        std::collections::HashMap<
10613            String,
10614            std::collections::HashMap<String, crate::storage::schema::Value>,
10615        >,
10616        crate::storage::query::unified::EdgeProperties,
10617    )> {
10618        use crate::storage::engine::GraphStore;
10619        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
10620        use crate::storage::unified::entity::{EntityData, EntityKind};
10621        use std::collections::{HashMap, HashSet};
10622
10623        let store = self.inner.db.store();
10624        let snap_ctx = capture_current_snapshot();
10625        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
10626
10627        let graph = GraphStore::new();
10628        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
10629            HashMap::new();
10630        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
10631        let mut allowed_nodes: HashSet<String> = HashSet::new();
10632
10633        // Per-collection cached compiled filters — Nodes-kind for
10634        // first pass, Edges-kind for the second. None entries mean
10635        // "RLS enabled, zero matching policy → deny all of this kind".
10636        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
10637            HashMap::new();
10638        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
10639            HashMap::new();
10640
10641        let collections = store.list_collections();
10642
10643        // First pass — gather nodes.
10644        for collection in &collections {
10645            let Some(manager) = store.get_collection(collection) else {
10646                continue;
10647            };
10648            let entities = manager.query_all(|_| true);
10649            for entity in entities {
10650                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
10651                    continue;
10652                }
10653                let EntityKind::GraphNode(ref node) = entity.kind else {
10654                    continue;
10655                };
10656                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
10657                    continue;
10658                }
10659                let id_str = entity.id.raw().to_string();
10660                graph
10661                    .add_node_with_label(
10662                        &id_str,
10663                        &node.label,
10664                        &super::graph_node_label(&node.node_type),
10665                    )
10666                    .map_err(|err| RedDBError::Query(err.to_string()))?;
10667                allowed_nodes.insert(id_str.clone());
10668                if let EntityData::Node(node_data) = &entity.data {
10669                    node_properties.insert(id_str, node_data.properties.clone());
10670                }
10671            }
10672        }
10673
10674        // Second pass — gather edges. An edge appears only when both
10675        // endpoint nodes survived the RLS pass AND the edge itself
10676        // passes its own RLS gate.
10677        for collection in &collections {
10678            let Some(manager) = store.get_collection(collection) else {
10679                continue;
10680            };
10681            let entities = manager.query_all(|_| true);
10682            for entity in entities {
10683                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
10684                    continue;
10685                }
10686                let EntityKind::GraphEdge(ref edge) = entity.kind else {
10687                    continue;
10688                };
10689                if !allowed_nodes.contains(&edge.from_node)
10690                    || !allowed_nodes.contains(&edge.to_node)
10691                {
10692                    continue;
10693                }
10694                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
10695                    continue;
10696                }
10697                let weight = match &entity.data {
10698                    EntityData::Edge(e) => e.weight,
10699                    _ => edge.weight as f32 / 1000.0,
10700                };
10701                let edge_label = super::graph_edge_label(&edge.label);
10702                graph
10703                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
10704                    .map_err(|err| RedDBError::Query(err.to_string()))?;
10705                if let EntityData::Edge(edge_data) = &entity.data {
10706                    edge_properties.insert(
10707                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
10708                        edge_data.properties.clone(),
10709                    );
10710                }
10711            }
10712        }
10713
10714        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
10715        // are used inside the helper closures via the per-kind helpers
10716        // declared at the bottom of this file.
10717        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
10718
10719        Ok((graph, node_properties, edge_properties))
10720    }
10721
10722    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
10723    /// freshly-inserted entity when the current connection holds an
10724    /// open transaction. Used by graph / vector / queue / timeseries
10725    /// write paths that go through the DevX builder API (`db.node(...)
10726    /// .save()` and friends) — those live in the storage crate and
10727    /// can't reach `current_xid()` without crossing layers, so the
10728    /// application layer calls this helper right after `save()` to
10729    /// finalise the MVCC stamp.
10730    ///
10731    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
10732    /// write, so the non-transactional hot path stays untouched.
10733    ///
10734    /// Best-effort: if the collection or entity disappears between
10735    /// the save and the stamp (concurrent DROP), we silently skip.
10736    pub(crate) fn stamp_xmin_if_in_txn(
10737        &self,
10738        collection: &str,
10739        id: crate::storage::unified::entity::EntityId,
10740    ) {
10741        let Some(xid) = self.current_xid() else {
10742            return;
10743        };
10744        let store = self.inner.db.store();
10745        let Some(manager) = store.get_collection(collection) else {
10746            return;
10747        };
10748        if let Some(mut entity) = manager.get(id) {
10749            entity.set_xmin(xid);
10750            let _ = manager.update(entity);
10751        }
10752    }
10753
10754    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
10755    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
10756    /// pending entries with `xid < stamper_xid` stay queued because
10757    /// they belong to the enclosing scope — they'll either flush on
10758    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
10759    ///
10760    /// Returns the number of tuples whose `xmax` was wiped back to 0.
10761    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
10762        let mut guard = self.inner.pending_tombstones.write();
10763        let Some(pending) = guard.get_mut(&conn_id) else {
10764            return 0;
10765        };
10766
10767        let store = self.inner.db.store();
10768        let mut revived = 0usize;
10769        pending.retain(|(collection, id, xid, previous_xmax)| {
10770            if *xid < stamper_xid {
10771                // Stamped before the savepoint — keep in queue.
10772                return true;
10773            }
10774            if let Some(manager) = store.get_collection(collection) {
10775                if let Some(mut entity) = manager.get(*id) {
10776                    if entity.xmax == *xid {
10777                        entity.set_xmax(*previous_xmax);
10778                        let _ = manager.update(entity);
10779                        revived += 1;
10780                    }
10781                }
10782            }
10783            false
10784        });
10785        if pending.is_empty() {
10786            guard.remove(&conn_id);
10787        }
10788        revived
10789    }
10790
10791    /// Return the snapshot the current connection should use for visibility
10792    /// checks (Phase 2.3 PG parity).
10793    ///
10794    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
10795    ///   the snapshot stored in its `TxnContext`.
10796    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
10797    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
10798    ///   visible so this degrades to "see everything committed".
10799    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
10800        let conn_id = current_connection_id();
10801        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
10802            return ctx.snapshot;
10803        }
10804        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
10805        // every already-committed xid (which is strictly less) passes the
10806        // `xmin <= snap.xid` gate, while concurrently-active xids land in
10807        // the `in_progress` set and stay hidden until they commit. Using
10808        // xid=0 would incorrectly hide every MVCC-stamped tuple.
10809        let high_water = self.inner.snapshot_manager.peek_next_xid();
10810        self.inner.snapshot_manager.snapshot(high_water)
10811    }
10812
10813    /// Xid of the current connection's active transaction, or `None` when
10814    /// running outside a BEGIN/COMMIT block. Write paths call this to
10815    /// decide whether to stamp `xmin`/`xmax` on tuples.
10816    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
10817    /// sub-xid so new writes can be selectively rolled back. Otherwise
10818    /// the parent txn's xid is returned, matching pre-savepoint
10819    /// behaviour. Callers that need the enclosing *transaction* xid
10820    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
10821    /// directly.
10822    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
10823        let conn_id = current_connection_id();
10824        self.inner
10825            .tx_contexts
10826            .read()
10827            .get(&conn_id)
10828            .map(|ctx| ctx.writer_xid())
10829    }
10830
10831    /// `true` when the given connection id has an open `BEGIN`. Issue
10832    /// #760 — `OpenStream` consults this to refuse output streams that
10833    /// would otherwise collide with an interactive transaction (see
10834    /// ADR 0029 "Transaction interaction"). HTTP requests pre-dating the
10835    /// connection-id plumbing run with id `0`, which never carries a
10836    /// transaction context, so this returns `false` on those paths.
10837    pub fn connection_in_transaction(&self, conn_id: u64) -> bool {
10838        self.inner.tx_contexts.read().contains_key(&conn_id)
10839    }
10840
10841    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
10842    /// the oldest-active xid when reclaiming dead tuples.
10843    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
10844        Arc::clone(&self.inner.snapshot_manager)
10845    }
10846
10847    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
10848        let manager = &self.inner.snapshot_manager;
10849        let next_xid = manager.peek_next_xid();
10850        let mut cutoff = next_xid;
10851        if let Some(oldest_active) = manager.oldest_active_xid() {
10852            cutoff = cutoff.min(oldest_active);
10853        }
10854        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
10855            cutoff = cutoff.min(oldest_pinned);
10856        }
10857        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
10858        if retention_xids > 0 {
10859            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
10860        }
10861        cutoff
10862    }
10863
10864    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
10865        let registered = self.inner.index_store.list_indices(table);
10866        if registered.is_empty() {
10867            return Ok(());
10868        }
10869        let store = self.inner.db.store();
10870        let Some(manager) = store.get_collection(table) else {
10871            return Ok(());
10872        };
10873        let entity_fields = manager
10874            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
10875            .into_iter()
10876            .map(|entity| (entity.id, table_row_index_fields(&entity)))
10877            .collect::<Vec<_>>();
10878
10879        for index in registered {
10880            self.inner.index_store.drop_index(&index.name, table);
10881            self.inner
10882                .index_store
10883                .create_index(
10884                    &index.name,
10885                    table,
10886                    &index.columns,
10887                    index.method,
10888                    index.unique,
10889                    &entity_fields,
10890                )
10891                .map_err(RedDBError::Internal)?;
10892            self.inner.index_store.register(index);
10893        }
10894        self.invalidate_plan_cache();
10895        Ok(())
10896    }
10897
10898    /// Own-tx xids (parent + open/released savepoints) for the current
10899    /// connection. Transports + tests that build a `SnapshotContext`
10900    /// manually (outside the `execute_query` scope) need this set so
10901    /// the writer's own uncommitted tuples stay visible to self.
10902    pub fn current_txn_own_xids(
10903        &self,
10904    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
10905        let mut set = std::collections::HashSet::new();
10906        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
10907            set.insert(ctx.xid);
10908            for (_, sub) in &ctx.savepoints {
10909                set.insert(*sub);
10910            }
10911            for sub in &ctx.released_sub_xids {
10912                set.insert(*sub);
10913            }
10914        }
10915        set
10916    }
10917
10918    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
10919    ///
10920    /// Callers use this to check whether a table name is a registered
10921    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
10922    /// scan it (`registry.scan(name)`). The read-path rewriter consults
10923    /// this before dispatching into native-collection lookup.
10924    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
10925        Arc::clone(&self.inner.foreign_tables)
10926    }
10927
10928    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
10929    pub fn is_rls_enabled(&self, table: &str) -> bool {
10930        self.inner.rls_enabled_tables.read().contains(table)
10931    }
10932
10933    /// Collect the USING predicates that apply to this `(table, role, action)`.
10934    ///
10935    /// Returned filters should be OR-combined (a row passes RLS when *any*
10936    /// matching policy accepts it) and then AND-ed into the query's WHERE.
10937    /// When the table has RLS disabled this returns an empty Vec — callers
10938    /// can fast-path back to the unfiltered read.
10939    pub fn matching_rls_policies(
10940        &self,
10941        table: &str,
10942        role: Option<&str>,
10943        action: crate::storage::query::ast::PolicyAction,
10944    ) -> Vec<crate::storage::query::ast::Filter> {
10945        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
10946        // callers that don't name a kind only see Table-scoped
10947        // policies (which is what execute SELECT / UPDATE / DELETE
10948        // expect).
10949        self.matching_rls_policies_for_kind(
10950            table,
10951            role,
10952            action,
10953            crate::storage::query::ast::PolicyTargetKind::Table,
10954        )
10955    }
10956
10957    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
10958    ///
10959    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
10960    /// `Vectors`, queue consumers request `Messages`, and timeseries
10961    /// range scans request `Points`. Policies tagged with a
10962    /// different kind are skipped so a graph-scoped policy doesn't
10963    /// accidentally gate a table SELECT on the same collection.
10964    pub fn matching_rls_policies_for_kind(
10965        &self,
10966        table: &str,
10967        role: Option<&str>,
10968        action: crate::storage::query::ast::PolicyAction,
10969        kind: crate::storage::query::ast::PolicyTargetKind,
10970    ) -> Vec<crate::storage::query::ast::Filter> {
10971        if !self.is_rls_enabled(table) {
10972            return Vec::new();
10973        }
10974        let policies = self.inner.rls_policies.read();
10975        policies
10976            .iter()
10977            .filter_map(|((t, _), p)| {
10978                if t != table {
10979                    return None;
10980                }
10981                // Kind gate — Table policies also apply to every
10982                // other kind *iff* the policy predicate evaluates
10983                // against entity fields that exist uniformly; the
10984                // caller's kind filter is the stricter check, so
10985                // match literally. Auto-tenancy policies stamp
10986                // Table and the caller passes the concrete kind —
10987                // we allow Table policies to apply cross-kind for
10988                // backwards compat.
10989                if p.target_kind != kind
10990                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
10991                {
10992                    return None;
10993                }
10994                // Action gate — `None` means "ALL" actions.
10995                if let Some(a) = p.action {
10996                    if a != action {
10997                        return None;
10998                    }
10999                }
11000                // Role gate — `None` means "any role".
11001                if let Some(p_role) = p.role.as_deref() {
11002                    match role {
11003                        Some(r) if r == p_role => {}
11004                        _ => return None,
11005                    }
11006                }
11007                Some((*p.using).clone())
11008            })
11009            .collect()
11010    }
11011
11012    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
11013        let store = self.inner.db.store();
11014        if let Some(stats) =
11015            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
11016        {
11017            crate::storage::query::planner::stats_catalog::persist_table_stats(
11018                store.as_ref(),
11019                &stats,
11020            );
11021        } else {
11022            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
11023        }
11024        self.invalidate_plan_cache();
11025    }
11026
11027    pub(crate) fn note_table_write(&self, table: &str) {
11028        // Skip the write lock when the table is already marked
11029        // dirty. With single-row UPDATEs in a loop this used to
11030        // grab the planner_dirty_tables write lock N times even
11031        // though the first call already flipped the flag.
11032        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
11033        if !already_dirty {
11034            self.inner
11035                .planner_dirty_tables
11036                .write()
11037                .insert(table.to_string());
11038        }
11039        self.invalidate_result_cache_for_table(table);
11040    }
11041
11042    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
11043    /// `RuntimeQueryResult` so callers over the SQL interface see the
11044    /// plan tree in the same shape a SELECT produces.
11045    ///
11046    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
11047    /// Nodes are walked depth-first; `depth` counts from 0 at the
11048    /// root so a text renderer can indent without re-walking.
11049    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
11050        let explain = self.explain_query(inner_sql)?;
11051
11052        let columns = vec![
11053            "op".to_string(),
11054            "source".to_string(),
11055            "est_rows".to_string(),
11056            "est_cost".to_string(),
11057            "depth".to_string(),
11058        ];
11059
11060        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
11061
11062        // Prepend `CteScan` markers when the query carried a leading
11063        // WITH clause. The CTE bodies are already inlined into the
11064        // main plan tree, but operators reading EXPLAIN need to see
11065        // which named CTEs were resolved — without this row the plan
11066        // would look indistinguishable from a hand-inlined query.
11067        for name in &explain.cte_materializations {
11068            use std::sync::Arc;
11069            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
11070            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
11071            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
11072            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
11073            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
11074            rec.set_arc(Arc::from("depth"), Value::Integer(0));
11075            records.push(rec);
11076        }
11077
11078        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
11079
11080        let result = crate::storage::query::unified::UnifiedResult {
11081            columns,
11082            records,
11083            stats: Default::default(),
11084            pre_serialized_json: None,
11085        };
11086
11087        Ok(RuntimeQueryResult {
11088            query: raw_query.to_string(),
11089            mode: explain.mode,
11090            statement: "explain",
11091            engine: "runtime-explain",
11092            result,
11093            affected_rows: 0,
11094            statement_type: "select",
11095            bookmark: None,
11096        })
11097    }
11098
11099    // -----------------------------------------------------------------
11100    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
11101    // -----------------------------------------------------------------
11102
11103    /// Project a `QueryExpr` to the (action, resource) pair the
11104    /// privilege engine cares about. Returns `Ok(())` for statements
11105    /// that don't touch user data (transaction control, SHOW, SET, etc.).
11106    pub(super) fn check_query_privilege(
11107        &self,
11108        expr: &crate::storage::query::ast::QueryExpr,
11109    ) -> Result<(), String> {
11110        use crate::auth::privileges::{Action, AuthzContext, Resource};
11111        use crate::auth::UserId;
11112        use crate::storage::query::ast::QueryExpr;
11113
11114        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
11115        // The bootstrap path itself goes through `execute_query` so this
11116        // is the only sensible default; once auth is wired, the gate
11117        // becomes active.
11118        let auth_store = match self.inner.auth_store.read().clone() {
11119            Some(s) => s,
11120            None => return Ok(()),
11121        };
11122
11123        // Resolve principal + role from the thread-local identity.
11124        // Anonymous (no identity) is allowed to read the bootstrap path
11125        // only when auth_store says so; we treat missing identity as
11126        // platform-admin-equivalent here so embedded test harnesses
11127        // continue to work without setting an identity.
11128        let (username, role) = match current_auth_identity() {
11129            Some(p) => p,
11130            None => return Ok(()),
11131        };
11132        let tenant = current_tenant();
11133
11134        let ctx = AuthzContext {
11135            principal: &username,
11136            effective_role: role,
11137            tenant: tenant.as_deref(),
11138        };
11139        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
11140
11141        // Map QueryExpr → (Action, Resource).
11142        let (action, resource) = match expr {
11143            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
11144            QueryExpr::QueueSelect(q) => {
11145                return self.check_queue_op_privilege(
11146                    &auth_store,
11147                    &principal_id,
11148                    role,
11149                    tenant.as_deref(),
11150                    "queue:peek",
11151                    &q.queue,
11152                );
11153            }
11154            QueryExpr::QueueCommand(cmd) => {
11155                use crate::storage::query::ast::QueueCommand;
11156                let (queue, action_verb) = match cmd {
11157                    QueueCommand::Push { queue, .. } => (queue.as_str(), "queue:enqueue"),
11158                    QueueCommand::Pop { queue, .. }
11159                    | QueueCommand::GroupRead { queue, .. }
11160                    | QueueCommand::Claim { queue, .. } => (queue.as_str(), "queue:read"),
11161                    QueueCommand::Peek { queue, .. }
11162                    | QueueCommand::Len { queue }
11163                    | QueueCommand::Pending { queue, .. } => (queue.as_str(), "queue:peek"),
11164                    QueueCommand::Ack { queue, .. } => (queue.as_str(), "queue:ack"),
11165                    QueueCommand::Nack {
11166                        queue, delay_ms, ..
11167                    } => {
11168                        // Per-failure retry overrides re-shape retry
11169                        // behaviour for everyone draining the queue and
11170                        // gate on the dedicated `queue:retry` verb so
11171                        // operators can grant base NACK without granting
11172                        // the override capability.
11173                        let verb = if delay_ms.is_some() {
11174                            "queue:retry"
11175                        } else {
11176                            "queue:nack"
11177                        };
11178                        (queue.as_str(), verb)
11179                    }
11180                    QueueCommand::Purge { queue } => (queue.as_str(), "queue:purge"),
11181                    // `GroupCreate` is part of the consumer-setup
11182                    // surface — read-side, never destructive.
11183                    QueueCommand::GroupCreate { queue, .. } => (queue.as_str(), "queue:read"),
11184                    QueueCommand::Move { source, .. } => (source.as_str(), "queue:dlq:move"),
11185                };
11186                return self.check_queue_op_privilege(
11187                    &auth_store,
11188                    &principal_id,
11189                    role,
11190                    tenant.as_deref(),
11191                    action_verb,
11192                    queue,
11193                );
11194            }
11195            QueryExpr::Graph(g) => {
11196                // MATCH … RETURN is the explorer's pattern-traversal
11197                // surface — gate on `graph:traverse` (#757).
11198                self.check_graph_op_privilege(
11199                    &auth_store,
11200                    &principal_id,
11201                    role,
11202                    tenant.as_deref(),
11203                    "graph:traverse",
11204                )?;
11205                if auth_store.iam_authorization_enabled() {
11206                    self.check_graph_property_projection_privilege(
11207                        &auth_store,
11208                        &principal_id,
11209                        role,
11210                        tenant.as_deref(),
11211                        g,
11212                    )?;
11213                    return Ok(());
11214                }
11215                return Ok(());
11216            }
11217            QueryExpr::Path(_) => {
11218                // PATH FROM … TO … is a path-traversal query — gates
11219                // on `graph:traverse` like neighborhood/shortest-path
11220                // (#757).
11221                return self.check_graph_op_privilege(
11222                    &auth_store,
11223                    &principal_id,
11224                    role,
11225                    tenant.as_deref(),
11226                    "graph:traverse",
11227                );
11228            }
11229            QueryExpr::GraphCommand(cmd) => {
11230                use crate::storage::query::ast::GraphCommand;
11231                let action_verb = match cmd {
11232                    // Metadata / property reads.
11233                    GraphCommand::Properties { .. } => "graph:read",
11234                    // Traversal / pattern-walk surface.
11235                    GraphCommand::Neighborhood { .. }
11236                    | GraphCommand::Traverse { .. }
11237                    | GraphCommand::ShortestPath { .. } => "graph:traverse",
11238                    // Analytics algorithms — expensive enough that Red
11239                    // UI needs to gate the runner independently of
11240                    // ordinary traversal.
11241                    GraphCommand::Centrality { .. }
11242                    | GraphCommand::Community { .. }
11243                    | GraphCommand::Components { .. }
11244                    | GraphCommand::Cycles { .. }
11245                    | GraphCommand::Clustering
11246                    | GraphCommand::TopologicalSort => "graph:algorithm:run",
11247                };
11248                return self.check_graph_op_privilege(
11249                    &auth_store,
11250                    &principal_id,
11251                    role,
11252                    tenant.as_deref(),
11253                    action_verb,
11254                );
11255            }
11256            QueryExpr::Vector(v) => {
11257                if auth_store.iam_authorization_enabled() {
11258                    self.check_vector_op_privilege(
11259                        &auth_store,
11260                        &principal_id,
11261                        role,
11262                        tenant.as_deref(),
11263                        "vector:search",
11264                        &v.collection,
11265                    )?;
11266                    self.check_table_like_column_projection_privilege(
11267                        &auth_store,
11268                        &principal_id,
11269                        role,
11270                        tenant.as_deref(),
11271                        &v.collection,
11272                        &["content".to_string()],
11273                    )?;
11274                    return Ok(());
11275                }
11276                return Ok(());
11277            }
11278            QueryExpr::SearchCommand(cmd) => {
11279                use crate::storage::query::ast::SearchCommand;
11280                if auth_store.iam_authorization_enabled() {
11281                    // `SEARCH SIMILAR [..] COLLECTION <c>` and `SEARCH
11282                    // HYBRID ... COLLECTION <c>` are the same UI
11283                    // affordances as `VECTOR SEARCH` / hybrid joins —
11284                    // Red UI must see the same `vector:search` envelope
11285                    // so a single toolbar grant is sufficient.
11286                    let collection = match cmd {
11287                        SearchCommand::Similar { collection, .. }
11288                        | SearchCommand::Hybrid { collection, .. } => Some(collection.as_str()),
11289                        _ => None,
11290                    };
11291                    if let Some(c) = collection {
11292                        self.check_vector_op_privilege(
11293                            &auth_store,
11294                            &principal_id,
11295                            role,
11296                            tenant.as_deref(),
11297                            "vector:search",
11298                            c,
11299                        )?;
11300                        return Ok(());
11301                    }
11302                }
11303                return Ok(());
11304            }
11305            QueryExpr::Hybrid(h) => {
11306                if auth_store.iam_authorization_enabled() {
11307                    // The vector half of a hybrid search is gated under
11308                    // the same `vector:search` verb as a standalone
11309                    // VECTOR SEARCH — Red UI's hybrid-search toolbar
11310                    // must surface the same UI-safe denial envelope
11311                    // when the principal lacks the grant. The
11312                    // structured half is dispatched to its own gate via
11313                    // the inner query during execution.
11314                    self.check_vector_op_privilege(
11315                        &auth_store,
11316                        &principal_id,
11317                        role,
11318                        tenant.as_deref(),
11319                        "vector:search",
11320                        &h.vector.collection,
11321                    )?;
11322                    return Ok(());
11323                }
11324                return Ok(());
11325            }
11326            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
11327            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
11328            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
11329            // Joins inherit the read privilege from any constituent
11330            // table — for now we emit a single Select on the database
11331            // (admins bypass; non-admins need a Database/Schema grant).
11332            QueryExpr::Join(_) => (Action::Select, Resource::Database),
11333            // GRANT / REVOKE / ALTER USER are authority statements;
11334            // require Admin (the helper methods enforce).
11335            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
11336                return if role == crate::auth::Role::Admin {
11337                    Ok(())
11338                } else {
11339                    Err(format!(
11340                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
11341                        username, role
11342                    ))
11343                };
11344            }
11345            QueryExpr::CreateIamPolicy { id, .. } => {
11346                return self.check_policy_management_privilege(
11347                    &auth_store,
11348                    &principal_id,
11349                    role,
11350                    tenant.as_deref(),
11351                    "policy:put",
11352                    "policy",
11353                    id,
11354                );
11355            }
11356            QueryExpr::DropIamPolicy { id } => {
11357                return self.check_policy_management_privilege(
11358                    &auth_store,
11359                    &principal_id,
11360                    role,
11361                    tenant.as_deref(),
11362                    "policy:drop",
11363                    "policy",
11364                    id,
11365                );
11366            }
11367            QueryExpr::AttachPolicy { policy_id, .. } => {
11368                return self.check_policy_management_privilege(
11369                    &auth_store,
11370                    &principal_id,
11371                    role,
11372                    tenant.as_deref(),
11373                    "policy:attach",
11374                    "policy",
11375                    policy_id,
11376                );
11377            }
11378            QueryExpr::DetachPolicy { policy_id, .. } => {
11379                return self.check_policy_management_privilege(
11380                    &auth_store,
11381                    &principal_id,
11382                    role,
11383                    tenant.as_deref(),
11384                    "policy:detach",
11385                    "policy",
11386                    policy_id,
11387                );
11388            }
11389            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
11390                return Ok(());
11391            }
11392            QueryExpr::SimulatePolicy { .. } => {
11393                return self.check_policy_management_privilege(
11394                    &auth_store,
11395                    &principal_id,
11396                    role,
11397                    tenant.as_deref(),
11398                    "policy:simulate",
11399                    "policy",
11400                    "*",
11401                );
11402            }
11403            QueryExpr::LintPolicy { .. } => {
11404                // Linting is a read-only inspection — gate it like
11405                // simulate (policy management role).
11406                return self.check_policy_management_privilege(
11407                    &auth_store,
11408                    &principal_id,
11409                    role,
11410                    tenant.as_deref(),
11411                    "policy:simulate",
11412                    "policy",
11413                    "*",
11414                );
11415            }
11416            QueryExpr::MigratePolicyMode { dry_run, .. } => {
11417                // DRY RUN is a pre-flight inspection (policy:simulate).
11418                // The actual mode flip is a privileged mutation under
11419                // the policy:put action (it persists a new enforcement
11420                // mode to the vault KV through `set_enforcement_mode`).
11421                let action = if *dry_run {
11422                    "policy:simulate"
11423                } else {
11424                    "policy:put"
11425                };
11426                return self.check_policy_management_privilege(
11427                    &auth_store,
11428                    &principal_id,
11429                    role,
11430                    tenant.as_deref(),
11431                    action,
11432                    "policy",
11433                    "*",
11434                );
11435            }
11436            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
11437            // when IAM mode is active. Other DDL stays role-only for now.
11438            QueryExpr::DropTable(q) => {
11439                return self.check_ddl_collection_privilege(
11440                    &auth_store,
11441                    &principal_id,
11442                    role,
11443                    tenant.as_deref(),
11444                    &username,
11445                    "drop",
11446                    &q.name,
11447                );
11448            }
11449            QueryExpr::DropGraph(q) => {
11450                return self.check_ddl_collection_privilege(
11451                    &auth_store,
11452                    &principal_id,
11453                    role,
11454                    tenant.as_deref(),
11455                    &username,
11456                    "drop",
11457                    &q.name,
11458                );
11459            }
11460            QueryExpr::DropVector(q) => {
11461                return self.check_ddl_collection_privilege(
11462                    &auth_store,
11463                    &principal_id,
11464                    role,
11465                    tenant.as_deref(),
11466                    &username,
11467                    "drop",
11468                    &q.name,
11469                );
11470            }
11471            QueryExpr::DropDocument(q) => {
11472                return self.check_ddl_collection_privilege(
11473                    &auth_store,
11474                    &principal_id,
11475                    role,
11476                    tenant.as_deref(),
11477                    &username,
11478                    "drop",
11479                    &q.name,
11480                );
11481            }
11482            QueryExpr::DropKv(q) => {
11483                return self.check_ddl_collection_privilege(
11484                    &auth_store,
11485                    &principal_id,
11486                    role,
11487                    tenant.as_deref(),
11488                    &username,
11489                    "drop",
11490                    &q.name,
11491                );
11492            }
11493            QueryExpr::DropCollection(q) => {
11494                return self.check_ddl_collection_privilege(
11495                    &auth_store,
11496                    &principal_id,
11497                    role,
11498                    tenant.as_deref(),
11499                    &username,
11500                    "drop",
11501                    &q.name,
11502                );
11503            }
11504            QueryExpr::Truncate(q) => {
11505                return self.check_ddl_collection_privilege(
11506                    &auth_store,
11507                    &principal_id,
11508                    role,
11509                    tenant.as_deref(),
11510                    &username,
11511                    "truncate",
11512                    &q.name,
11513                );
11514            }
11515            // Remaining DDL (#753) — hybrid policy-aware gate. Specific
11516            // create/alter/drop verbs gate operations with a clear
11517            // per-collection target so Red UI can author fine-grained
11518            // policies (`create on collection:users`). Namespace-level
11519            // and grouped DDL fall back to broader `schema:admin` /
11520            // `schema:write` verbs against a `schema:<name>` resource.
11521            // All branches share the [`check_ddl_object_privilege`]
11522            // helper so allows / denies produce the same structured
11523            // "principal=… action=… resource=<kind>:<name> denied by
11524            // IAM policy" reason the Red UI security read contracts
11525            // (#740) already render.
11526            QueryExpr::CreateTable(q) => {
11527                return self.check_ddl_object_privilege(
11528                    &auth_store,
11529                    &principal_id,
11530                    role,
11531                    tenant.as_deref(),
11532                    &username,
11533                    "create",
11534                    "collection",
11535                    &q.name,
11536                    crate::auth::Role::Write,
11537                );
11538            }
11539            QueryExpr::CreateCollection(q) => {
11540                return self.check_ddl_object_privilege(
11541                    &auth_store,
11542                    &principal_id,
11543                    role,
11544                    tenant.as_deref(),
11545                    &username,
11546                    "create",
11547                    "collection",
11548                    &q.name,
11549                    crate::auth::Role::Write,
11550                );
11551            }
11552            QueryExpr::CreateVector(q) => {
11553                return self.check_ddl_object_privilege(
11554                    &auth_store,
11555                    &principal_id,
11556                    role,
11557                    tenant.as_deref(),
11558                    &username,
11559                    "create",
11560                    "collection",
11561                    &q.name,
11562                    crate::auth::Role::Write,
11563                );
11564            }
11565            QueryExpr::AlterTable(q) => {
11566                return self.check_ddl_object_privilege(
11567                    &auth_store,
11568                    &principal_id,
11569                    role,
11570                    tenant.as_deref(),
11571                    &username,
11572                    "alter",
11573                    "collection",
11574                    &q.name,
11575                    crate::auth::Role::Write,
11576                );
11577            }
11578            QueryExpr::CreateIndex(q) => {
11579                return self.check_ddl_object_privilege(
11580                    &auth_store,
11581                    &principal_id,
11582                    role,
11583                    tenant.as_deref(),
11584                    &username,
11585                    "create",
11586                    "collection",
11587                    &q.table,
11588                    crate::auth::Role::Write,
11589                );
11590            }
11591            QueryExpr::DropIndex(q) => {
11592                return self.check_ddl_object_privilege(
11593                    &auth_store,
11594                    &principal_id,
11595                    role,
11596                    tenant.as_deref(),
11597                    &username,
11598                    "drop",
11599                    "collection",
11600                    &q.table,
11601                    crate::auth::Role::Write,
11602                );
11603            }
11604            QueryExpr::CreateSchema(q) => {
11605                return self.check_ddl_object_privilege(
11606                    &auth_store,
11607                    &principal_id,
11608                    role,
11609                    tenant.as_deref(),
11610                    &username,
11611                    "schema:admin",
11612                    "schema",
11613                    &q.name,
11614                    crate::auth::Role::Admin,
11615                );
11616            }
11617            QueryExpr::DropSchema(q) => {
11618                return self.check_ddl_object_privilege(
11619                    &auth_store,
11620                    &principal_id,
11621                    role,
11622                    tenant.as_deref(),
11623                    &username,
11624                    "schema:admin",
11625                    "schema",
11626                    &q.name,
11627                    crate::auth::Role::Admin,
11628                );
11629            }
11630            QueryExpr::CreateSequence(q) => {
11631                return self.check_ddl_object_privilege(
11632                    &auth_store,
11633                    &principal_id,
11634                    role,
11635                    tenant.as_deref(),
11636                    &username,
11637                    "create",
11638                    "collection",
11639                    &q.name,
11640                    crate::auth::Role::Write,
11641                );
11642            }
11643            QueryExpr::DropSequence(q) => {
11644                return self.check_ddl_object_privilege(
11645                    &auth_store,
11646                    &principal_id,
11647                    role,
11648                    tenant.as_deref(),
11649                    &username,
11650                    "drop",
11651                    "collection",
11652                    &q.name,
11653                    crate::auth::Role::Write,
11654                );
11655            }
11656            QueryExpr::CreateView(q) => {
11657                return self.check_ddl_object_privilege(
11658                    &auth_store,
11659                    &principal_id,
11660                    role,
11661                    tenant.as_deref(),
11662                    &username,
11663                    "create",
11664                    "collection",
11665                    &q.name,
11666                    crate::auth::Role::Write,
11667                );
11668            }
11669            QueryExpr::DropView(q) => {
11670                return self.check_ddl_object_privilege(
11671                    &auth_store,
11672                    &principal_id,
11673                    role,
11674                    tenant.as_deref(),
11675                    &username,
11676                    "drop",
11677                    "collection",
11678                    &q.name,
11679                    crate::auth::Role::Write,
11680                );
11681            }
11682            QueryExpr::RefreshMaterializedView(q) => {
11683                return self.check_ddl_object_privilege(
11684                    &auth_store,
11685                    &principal_id,
11686                    role,
11687                    tenant.as_deref(),
11688                    &username,
11689                    "alter",
11690                    "collection",
11691                    &q.name,
11692                    crate::auth::Role::Write,
11693                );
11694            }
11695            QueryExpr::CreatePolicy(q) => {
11696                return self.check_ddl_object_privilege(
11697                    &auth_store,
11698                    &principal_id,
11699                    role,
11700                    tenant.as_deref(),
11701                    &username,
11702                    "create",
11703                    "collection",
11704                    &q.table,
11705                    crate::auth::Role::Write,
11706                );
11707            }
11708            QueryExpr::DropPolicy(q) => {
11709                return self.check_ddl_object_privilege(
11710                    &auth_store,
11711                    &principal_id,
11712                    role,
11713                    tenant.as_deref(),
11714                    &username,
11715                    "drop",
11716                    "collection",
11717                    &q.table,
11718                    crate::auth::Role::Write,
11719                );
11720            }
11721            QueryExpr::CreateServer(q) => {
11722                return self.check_ddl_object_privilege(
11723                    &auth_store,
11724                    &principal_id,
11725                    role,
11726                    tenant.as_deref(),
11727                    &username,
11728                    "schema:admin",
11729                    "schema",
11730                    &q.name,
11731                    crate::auth::Role::Admin,
11732                );
11733            }
11734            QueryExpr::DropServer(q) => {
11735                return self.check_ddl_object_privilege(
11736                    &auth_store,
11737                    &principal_id,
11738                    role,
11739                    tenant.as_deref(),
11740                    &username,
11741                    "schema:admin",
11742                    "schema",
11743                    &q.name,
11744                    crate::auth::Role::Admin,
11745                );
11746            }
11747            QueryExpr::CreateForeignTable(q) => {
11748                return self.check_ddl_object_privilege(
11749                    &auth_store,
11750                    &principal_id,
11751                    role,
11752                    tenant.as_deref(),
11753                    &username,
11754                    "schema:write",
11755                    "schema",
11756                    &q.name,
11757                    crate::auth::Role::Write,
11758                );
11759            }
11760            QueryExpr::DropForeignTable(q) => {
11761                return self.check_ddl_object_privilege(
11762                    &auth_store,
11763                    &principal_id,
11764                    role,
11765                    tenant.as_deref(),
11766                    &username,
11767                    "schema:write",
11768                    "schema",
11769                    &q.name,
11770                    crate::auth::Role::Write,
11771                );
11772            }
11773            QueryExpr::CreateTimeSeries(q) => {
11774                return self.check_ddl_object_privilege(
11775                    &auth_store,
11776                    &principal_id,
11777                    role,
11778                    tenant.as_deref(),
11779                    &username,
11780                    "create",
11781                    "collection",
11782                    &q.name,
11783                    crate::auth::Role::Write,
11784                );
11785            }
11786            QueryExpr::CreateMetric(q) => {
11787                return self.check_ddl_object_privilege(
11788                    &auth_store,
11789                    &principal_id,
11790                    role,
11791                    tenant.as_deref(),
11792                    &username,
11793                    "create",
11794                    "collection",
11795                    &q.path,
11796                    crate::auth::Role::Write,
11797                );
11798            }
11799            QueryExpr::AlterMetric(q) => {
11800                return self.check_ddl_object_privilege(
11801                    &auth_store,
11802                    &principal_id,
11803                    role,
11804                    tenant.as_deref(),
11805                    &username,
11806                    "alter",
11807                    "collection",
11808                    &q.path,
11809                    crate::auth::Role::Write,
11810                );
11811            }
11812            QueryExpr::CreateSlo(q) => {
11813                return self.check_ddl_object_privilege(
11814                    &auth_store,
11815                    &principal_id,
11816                    role,
11817                    tenant.as_deref(),
11818                    &username,
11819                    "create",
11820                    "collection",
11821                    &q.path,
11822                    crate::auth::Role::Write,
11823                );
11824            }
11825            QueryExpr::DropTimeSeries(q) => {
11826                return self.check_ddl_object_privilege(
11827                    &auth_store,
11828                    &principal_id,
11829                    role,
11830                    tenant.as_deref(),
11831                    &username,
11832                    "drop",
11833                    "collection",
11834                    &q.name,
11835                    crate::auth::Role::Write,
11836                );
11837            }
11838            QueryExpr::CreateQueue(q) => {
11839                return self.check_ddl_object_privilege(
11840                    &auth_store,
11841                    &principal_id,
11842                    role,
11843                    tenant.as_deref(),
11844                    &username,
11845                    "create",
11846                    "collection",
11847                    &q.name,
11848                    crate::auth::Role::Write,
11849                );
11850            }
11851            QueryExpr::AlterQueue(q) => {
11852                return self.check_ddl_object_privilege(
11853                    &auth_store,
11854                    &principal_id,
11855                    role,
11856                    tenant.as_deref(),
11857                    &username,
11858                    "alter",
11859                    "collection",
11860                    &q.name,
11861                    crate::auth::Role::Write,
11862                );
11863            }
11864            QueryExpr::DropQueue(q) => {
11865                return self.check_ddl_object_privilege(
11866                    &auth_store,
11867                    &principal_id,
11868                    role,
11869                    tenant.as_deref(),
11870                    &username,
11871                    "drop",
11872                    "collection",
11873                    &q.name,
11874                    crate::auth::Role::Write,
11875                );
11876            }
11877            QueryExpr::CreateTree(q) => {
11878                return self.check_ddl_object_privilege(
11879                    &auth_store,
11880                    &principal_id,
11881                    role,
11882                    tenant.as_deref(),
11883                    &username,
11884                    "create",
11885                    "collection",
11886                    &q.collection,
11887                    crate::auth::Role::Write,
11888                );
11889            }
11890            QueryExpr::DropTree(q) => {
11891                return self.check_ddl_object_privilege(
11892                    &auth_store,
11893                    &principal_id,
11894                    role,
11895                    tenant.as_deref(),
11896                    &username,
11897                    "drop",
11898                    "collection",
11899                    &q.collection,
11900                    crate::auth::Role::Write,
11901                );
11902            }
11903            // Migration DDL — CREATE MIGRATION is grouped DDL on the
11904            // schema namespace; uses the `schema:write` fallback verb
11905            // (no obvious per-collection target).
11906            QueryExpr::CreateMigration(q) => {
11907                return self.check_ddl_object_privilege(
11908                    &auth_store,
11909                    &principal_id,
11910                    role,
11911                    tenant.as_deref(),
11912                    &username,
11913                    "schema:write",
11914                    "schema",
11915                    &q.name,
11916                    crate::auth::Role::Write,
11917                );
11918            }
11919            // APPLY / ROLLBACK change data and schema — require Admin.
11920            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
11921                return if role == crate::auth::Role::Admin {
11922                    Ok(())
11923                } else {
11924                    Err(format!(
11925                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
11926                        username, role
11927                    ))
11928                };
11929            }
11930            // EXPLAIN MIGRATION is read-only — any authenticated principal.
11931            QueryExpr::ExplainMigration(_) => return Ok(()),
11932            // Everything else (SET, SHOW, transaction control, graph
11933            // commands, queue/tree commands, MaintenanceCommand …)
11934            // is allowed for any authenticated principal.
11935            _ => return Ok(()),
11936        };
11937
11938        if auth_store.iam_authorization_enabled() {
11939            let iam_action = legacy_action_to_iam(action);
11940            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
11941            let iam_ctx = runtime_iam_context(
11942                role,
11943                tenant.as_deref(),
11944                auth_store.principal_is_system_owned(&principal_id),
11945            );
11946            if !auth_store.check_policy_authz_with_role(
11947                &principal_id,
11948                iam_action,
11949                &iam_resource,
11950                &iam_ctx,
11951                role,
11952            ) {
11953                return Err(format!(
11954                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
11955                    username, iam_action, iam_resource.kind, iam_resource.name
11956                ));
11957            }
11958
11959            if let QueryExpr::Table(table) = expr {
11960                self.check_table_column_projection_privilege(
11961                    &auth_store,
11962                    &principal_id,
11963                    &iam_ctx,
11964                    table,
11965                )?;
11966            }
11967
11968            if let QueryExpr::Update(update) = expr {
11969                let columns = update_set_target_columns(update);
11970                if !columns.is_empty() {
11971                    let request = column_access_request_for_table_update(&update.table, columns);
11972                    let outcome =
11973                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
11974                    if let Some(denied) = outcome.first_denied_column() {
11975                        return Err(format!(
11976                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
11977                            username, iam_action, denied.resource.kind, denied.resource.name
11978                        ));
11979                    }
11980                    if !outcome.allowed() {
11981                        return Err(format!(
11982                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
11983                            username,
11984                            iam_action,
11985                            outcome.table_resource.kind,
11986                            outcome.table_resource.name
11987                        ));
11988                    }
11989                }
11990
11991                if let Some(columns) = update_returning_columns_for_policy(self, update) {
11992                    let request = column_access_request_for_table_select(&update.table, columns);
11993                    let outcome =
11994                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
11995                    if let Some(denied) = outcome.first_denied_column() {
11996                        return Err(format!(
11997                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
11998                            username, denied.resource.kind, denied.resource.name
11999                        ));
12000                    }
12001                    if !outcome.allowed() {
12002                        return Err(format!(
12003                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
12004                            username, outcome.table_resource.kind, outcome.table_resource.name
12005                        ));
12006                    }
12007                }
12008            }
12009
12010            Ok(())
12011        } else {
12012            auth_store
12013                .check_grant(&ctx, action, &resource)
12014                .map_err(|e| e.to_string())
12015        }
12016    }
12017
12018    fn check_table_column_projection_privilege(
12019        &self,
12020        auth_store: &Arc<crate::auth::store::AuthStore>,
12021        principal: &crate::auth::UserId,
12022        ctx: &crate::auth::policies::EvalContext,
12023        table: &crate::storage::query::ast::TableQuery,
12024    ) -> Result<(), String> {
12025        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
12026
12027        let columns = requested_table_columns_for_policy(table);
12028        if columns.is_empty() {
12029            return Ok(());
12030        }
12031
12032        let request = ColumnAccessRequest::select(table.table.clone(), columns);
12033        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
12034        if outcome.allowed() {
12035            return Ok(());
12036        }
12037
12038        if !matches!(
12039            outcome.table_decision,
12040            crate::auth::policies::Decision::Allow { .. }
12041                | crate::auth::policies::Decision::AdminBypass
12042        ) {
12043            return Err(format!(
12044                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
12045                principal, outcome.table_resource.kind, outcome.table_resource.name
12046            ));
12047        }
12048
12049        let denied = outcome
12050            .first_denied_column()
12051            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
12052        match denied {
12053            Some(decision) => Err(format!(
12054                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
12055                principal, decision.resource.kind, decision.resource.name
12056            )),
12057            None => Ok(()),
12058        }
12059    }
12060
12061    fn check_graph_property_projection_privilege(
12062        &self,
12063        auth_store: &Arc<crate::auth::store::AuthStore>,
12064        principal: &crate::auth::UserId,
12065        role: crate::auth::Role,
12066        tenant: Option<&str>,
12067        query: &crate::storage::query::ast::GraphQuery,
12068    ) -> Result<(), String> {
12069        let columns = explicit_graph_projection_properties(query);
12070        if columns.is_empty() {
12071            return Ok(());
12072        }
12073        self.check_table_like_column_projection_privilege(
12074            auth_store, principal, role, tenant, "graph", &columns,
12075        )
12076    }
12077
12078    fn check_table_like_column_projection_privilege(
12079        &self,
12080        auth_store: &Arc<crate::auth::store::AuthStore>,
12081        principal: &crate::auth::UserId,
12082        role: crate::auth::Role,
12083        tenant: Option<&str>,
12084        table: &str,
12085        columns: &[String],
12086    ) -> Result<(), String> {
12087        let iam_ctx = runtime_iam_context(
12088            role,
12089            tenant,
12090            auth_store.principal_is_system_owned(principal),
12091        );
12092        let request =
12093            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
12094        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
12095        if outcome.allowed() {
12096            return Ok(());
12097        }
12098        let denied = outcome
12099            .first_denied_column()
12100            .map(|d| d.resource.name.clone())
12101            .unwrap_or_else(|| format!("{table}.<unknown>"));
12102        Err(format!(
12103            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
12104            principal, denied
12105        ))
12106    }
12107
12108    fn check_policy_management_privilege(
12109        &self,
12110        auth_store: &Arc<crate::auth::store::AuthStore>,
12111        principal: &crate::auth::UserId,
12112        role: crate::auth::Role,
12113        tenant: Option<&str>,
12114        action: &str,
12115        resource_kind: &str,
12116        resource_name: &str,
12117    ) -> Result<(), String> {
12118        let ctx = runtime_iam_context(
12119            role,
12120            tenant,
12121            auth_store.principal_is_system_owned(principal),
12122        );
12123
12124        if !auth_store.iam_authorization_enabled() {
12125            return if role == crate::auth::Role::Admin {
12126                Ok(())
12127            } else {
12128                Err(format!(
12129                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
12130                    principal, role
12131                ))
12132            };
12133        }
12134
12135        let mut resource = crate::auth::policies::ResourceRef::new(
12136            resource_kind.to_string(),
12137            resource_name.to_string(),
12138        );
12139        if let Some(t) = tenant {
12140            resource = resource.with_tenant(t.to_string());
12141        }
12142        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12143            Ok(())
12144        } else {
12145            Err(format!(
12146                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
12147                principal, action, resource.kind, resource.name
12148            ))
12149        }
12150    }
12151
12152    fn check_managed_config_write_for_set_config(&self, key: &str) -> RedDBResult<()> {
12153        let Some(auth_store) = self.inner.auth_store.read().clone() else {
12154            return Ok(());
12155        };
12156        let (username, role) = current_auth_identity()
12157            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
12158        let tenant = current_tenant();
12159        let principal = crate::auth::UserId::from_parts(tenant.as_deref(), &username);
12160        let ctx = runtime_iam_context(
12161            role,
12162            tenant.as_deref(),
12163            auth_store.principal_is_system_owned(&principal),
12164        );
12165        let gate = crate::auth::managed_config::ManagedConfigGate::new(
12166            self.inner.config_registry.as_ref(),
12167        );
12168        match gate.check_write(&auth_store, &principal, &ctx, key) {
12169            crate::auth::managed_config::ManagedConfigDecision::PassThrough { .. }
12170            | crate::auth::managed_config::ManagedConfigDecision::Allow { .. } => Ok(()),
12171            crate::auth::managed_config::ManagedConfigDecision::Deny { reason, .. } => {
12172                Err(RedDBError::Query(format!(
12173                    "permission denied: managed config mutation blocked for `{key}`: {reason}"
12174                )))
12175            }
12176        }
12177    }
12178
12179    /// IAM privilege check for a granular queue operation (issue #755 /
12180    /// PRD #735).
12181    ///
12182    /// Each queue operation maps to a stable verb in
12183    /// [`crate::auth::action_catalog`] (`queue:enqueue`, `queue:read`,
12184    /// `queue:peek`, `queue:ack`, `queue:nack`, `queue:retry`,
12185    /// `queue:dlq:move`, `queue:purge`, `queue:presence:read`). The
12186    /// resource is `queue:<name>` scoped to the current tenant. In
12187    /// legacy mode (no IAM authorization configured) the check is a
12188    /// no-op — the role gates in `execute_queue_command` still apply
12189    /// and the legacy `select` / `write` grant table continues to
12190    /// govern queue access. In IAM-enabled mode a missing granular
12191    /// grant yields a structured, UI-safe error of the form
12192    /// `principal=… action=queue:… resource=queue:… denied by IAM
12193    /// policy` so Red UI can surface the failing toolbar action.
12194    fn check_queue_op_privilege(
12195        &self,
12196        auth_store: &Arc<crate::auth::store::AuthStore>,
12197        principal: &crate::auth::UserId,
12198        role: crate::auth::Role,
12199        tenant: Option<&str>,
12200        action: &str,
12201        queue: &str,
12202    ) -> Result<(), String> {
12203        if !auth_store.iam_authorization_enabled() {
12204            return Ok(());
12205        }
12206        let mut resource =
12207            crate::auth::policies::ResourceRef::new("queue".to_string(), queue.to_string());
12208        if let Some(t) = tenant {
12209            resource = resource.with_tenant(t.to_string());
12210        }
12211        let ctx = runtime_iam_context(
12212            role,
12213            tenant,
12214            auth_store.principal_is_system_owned(principal),
12215        );
12216        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12217            Ok(())
12218        } else {
12219            Err(format!(
12220                "principal=`{}` action=`{}` resource=`queue:{}` denied by IAM policy",
12221                principal, action, queue
12222            ))
12223        }
12224    }
12225
12226    /// IAM privilege check for a graph operation (issue #757 / PRD
12227    /// #735).
12228    ///
12229    /// Each graph operation maps to a stable verb in
12230    /// [`crate::auth::action_catalog`] — `graph:read` for
12231    /// metadata/property lookups, `graph:traverse` for MATCH / PATH /
12232    /// NEIGHBORHOOD / TRAVERSE / SHORTEST_PATH, and
12233    /// `graph:algorithm:run` for analytics algorithms (centrality,
12234    /// community, components, cycles, clustering, topological sort).
12235    /// The resource is `graph:*` scoped to the current tenant — the
12236    /// runtime today operates on a singleton graph store so the name
12237    /// has no concrete identifier; policies grant the explorer
12238    /// surface by writing `graph:*` as the resource pattern.
12239    ///
12240    /// In legacy mode (no IAM authorization configured) the check is
12241    /// a no-op so the existing role-based defaults continue to
12242    /// govern. In IAM-enabled mode a missing grant produces the
12243    /// UI-safe envelope `principal=… action=graph:… resource=graph:*
12244    /// denied by IAM policy` Red UI keys on.
12245    fn check_graph_op_privilege(
12246        &self,
12247        auth_store: &Arc<crate::auth::store::AuthStore>,
12248        principal: &crate::auth::UserId,
12249        role: crate::auth::Role,
12250        tenant: Option<&str>,
12251        action: &str,
12252    ) -> Result<(), String> {
12253        if !auth_store.iam_authorization_enabled() {
12254            return Ok(());
12255        }
12256        let mut resource =
12257            crate::auth::policies::ResourceRef::new("graph".to_string(), "*".to_string());
12258        if let Some(t) = tenant {
12259            resource = resource.with_tenant(t.to_string());
12260        }
12261        let ctx = runtime_iam_context(
12262            role,
12263            tenant,
12264            auth_store.principal_is_system_owned(principal),
12265        );
12266        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12267            Ok(())
12268        } else {
12269            Err(format!(
12270                "principal=`{}` action=`{}` resource=`graph:*` denied by IAM policy",
12271                principal, action
12272            ))
12273        }
12274    }
12275
12276    /// IAM privilege check for a granular vector operation (issue #756
12277    /// / PRD #735).
12278    ///
12279    /// Each vector operation maps to a stable verb in
12280    /// [`crate::auth::action_catalog`] (`vector:read`, `vector:search`,
12281    /// `vector:artifact:read`, `vector:artifact:rebuild`,
12282    /// `vector:admin`). The resource is `vector:<collection>` scoped to
12283    /// the current tenant. In legacy mode (no IAM authorization
12284    /// configured) the check is a no-op — the role gates and existing
12285    /// `select` / column-projection grants continue to govern access.
12286    /// In IAM-enabled mode a missing granular grant yields a
12287    /// structured, UI-safe error of the form `principal=…
12288    /// action=vector:… resource=vector:… denied by IAM policy` so Red
12289    /// UI can surface the failing toolbar action.
12290    fn check_vector_op_privilege(
12291        &self,
12292        auth_store: &Arc<crate::auth::store::AuthStore>,
12293        principal: &crate::auth::UserId,
12294        role: crate::auth::Role,
12295        tenant: Option<&str>,
12296        action: &str,
12297        collection: &str,
12298    ) -> Result<(), String> {
12299        if !auth_store.iam_authorization_enabled() {
12300            return Ok(());
12301        }
12302        let mut resource =
12303            crate::auth::policies::ResourceRef::new("vector".to_string(), collection.to_string());
12304        if let Some(t) = tenant {
12305            resource = resource.with_tenant(t.to_string());
12306        }
12307        let ctx = runtime_iam_context(
12308            role,
12309            tenant,
12310            auth_store.principal_is_system_owned(principal),
12311        );
12312        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12313            Ok(())
12314        } else {
12315            Err(format!(
12316                "principal=`{}` action=`{}` resource=`vector:{}` denied by IAM policy",
12317                principal, action, collection
12318            ))
12319        }
12320    }
12321
12322    /// IAM privilege check for DROP / TRUNCATE on a named collection.
12323    ///
12324    /// Delegates to [`check_ddl_object_privilege`] with `resource_kind =
12325    /// "collection"`. Kept as a thin wrapper so the existing DROP/TRUNCATE
12326    /// callsites stay readable.
12327    fn check_ddl_collection_privilege(
12328        &self,
12329        auth_store: &Arc<crate::auth::store::AuthStore>,
12330        principal: &crate::auth::UserId,
12331        role: crate::auth::Role,
12332        tenant: Option<&str>,
12333        username: &str,
12334        action: &str,
12335        collection: &str,
12336    ) -> Result<(), String> {
12337        self.check_ddl_object_privilege(
12338            auth_store,
12339            principal,
12340            role,
12341            tenant,
12342            username,
12343            action,
12344            "collection",
12345            collection,
12346            crate::auth::Role::Write,
12347        )
12348    }
12349
12350    /// Generalised IAM privilege check for DDL on a named object.
12351    ///
12352    /// `action` is the stable verb advertised through the action catalog
12353    /// (`create`, `alter`, `drop`, `truncate`, `schema:write`,
12354    /// `schema:admin`). `resource_kind` / `resource_name` form the policy
12355    /// resource (`collection:<name>`, `schema:<name>`). `min_role` is the
12356    /// legacy gate when IAM is not yet enabled.
12357    ///
12358    /// Behaviour:
12359    /// * Role below `min_role` → structured "principal=… role=… cannot
12360    ///   issue DDL" denial, audit recorded.
12361    /// * IAM disabled → audit-record success and allow (legacy path).
12362    /// * IAM enabled → call `check_policy_authz_with_role`. Explicit Deny
12363    ///   and DefaultDeny in PolicyOnly mode both produce a UI-safe
12364    ///   "principal=… action=… resource=<kind>:<name> denied by IAM
12365    ///   policy" string. Explicit Allow and the LegacyRbac fallback
12366    ///   allow the action.
12367    #[allow(clippy::too_many_arguments)]
12368    fn check_ddl_object_privilege(
12369        &self,
12370        auth_store: &Arc<crate::auth::store::AuthStore>,
12371        principal: &crate::auth::UserId,
12372        role: crate::auth::Role,
12373        tenant: Option<&str>,
12374        username: &str,
12375        action: &str,
12376        resource_kind: &str,
12377        resource_name: &str,
12378        min_role: crate::auth::Role,
12379    ) -> Result<(), String> {
12380        if role < min_role {
12381            let msg = format!(
12382                "principal=`{}` role=`{:?}` cannot issue DDL action=`{}` resource=`{}:{}`",
12383                username, role, action, resource_kind, resource_name
12384            );
12385            self.inner.audit_log.record(
12386                action,
12387                username,
12388                resource_name,
12389                "denied",
12390                crate::json::Value::Null,
12391            );
12392            return Err(msg);
12393        }
12394
12395        if !auth_store.iam_authorization_enabled() {
12396            self.inner.audit_log.record(
12397                action,
12398                username,
12399                resource_name,
12400                "ok",
12401                crate::json::Value::Null,
12402            );
12403            return Ok(());
12404        }
12405
12406        let mut resource = crate::auth::policies::ResourceRef::new(
12407            resource_kind.to_string(),
12408            resource_name.to_string(),
12409        );
12410        if let Some(t) = tenant {
12411            resource = resource.with_tenant(t.to_string());
12412        }
12413        let ctx = runtime_iam_context(
12414            role,
12415            tenant,
12416            auth_store.principal_is_system_owned(principal),
12417        );
12418        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12419            self.inner.audit_log.record(
12420                action,
12421                username,
12422                resource_name,
12423                "ok",
12424                crate::json::Value::Null,
12425            );
12426            Ok(())
12427        } else {
12428            self.inner.audit_log.record(
12429                action,
12430                username,
12431                resource_name,
12432                "denied",
12433                crate::json::Value::Null,
12434            );
12435            Err(format!(
12436                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
12437                username, action, resource_kind, resource_name
12438            ))
12439        }
12440    }
12441
12442    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
12443    fn execute_grant_statement(
12444        &self,
12445        query: &str,
12446        stmt: &crate::storage::query::ast::GrantStmt,
12447    ) -> RedDBResult<RuntimeQueryResult> {
12448        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
12449        use crate::auth::UserId;
12450        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
12451
12452        let auth_store = self
12453            .inner
12454            .auth_store
12455            .read()
12456            .clone()
12457            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12458
12459        // Granter identity + role.
12460        let (gname, grole) = current_auth_identity().ok_or_else(|| {
12461            RedDBError::Query("GRANT requires an authenticated principal".to_string())
12462        })?;
12463        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
12464        let granter_role = grole;
12465
12466        // Build the action set.
12467        let mut actions: Vec<Action> = Vec::new();
12468        if stmt.all {
12469            actions.push(Action::All);
12470        } else {
12471            for kw in &stmt.actions {
12472                let a = Action::from_keyword(kw).ok_or_else(|| {
12473                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
12474                })?;
12475                actions.push(a);
12476            }
12477        }
12478
12479        // Audit emit (printed; structured emission is Agent #4's lane).
12480        let mut applied = 0usize;
12481        for obj in &stmt.objects {
12482            let resource = match stmt.object_kind {
12483                GrantObjectKind::Table => Resource::Table {
12484                    schema: obj.schema.clone(),
12485                    table: obj.name.clone(),
12486                },
12487                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
12488                GrantObjectKind::Database => Resource::Database,
12489                GrantObjectKind::Function => Resource::Function {
12490                    schema: obj.schema.clone(),
12491                    name: obj.name.clone(),
12492                },
12493            };
12494            for principal in &stmt.principals {
12495                let p = match principal {
12496                    GrantPrincipalRef::Public => GrantPrincipal::Public,
12497                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
12498                    GrantPrincipalRef::User { tenant, name } => {
12499                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
12500                    }
12501                };
12502                // Tenant of the grant follows the granter's tenant
12503                // (cross-tenant guard inside `AuthStore::grant`).
12504                let tenant = granter.tenant.clone();
12505                auth_store
12506                    .grant(
12507                        &granter,
12508                        granter_role,
12509                        p.clone(),
12510                        resource.clone(),
12511                        actions.clone(),
12512                        stmt.with_grant_option,
12513                        tenant.clone(),
12514                    )
12515                    .map_err(|e| RedDBError::Query(e.to_string()))?;
12516
12517                // IAM policy translation: every GRANT also lands as a
12518                // synthetic `_grant_<id>` policy attached to the
12519                // principal so the new evaluator sees it.
12520                if let Some(policy) =
12521                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
12522                {
12523                    let pid = policy.id.clone();
12524                    auth_store
12525                        .put_policy_internal(policy)
12526                        .map_err(|e| RedDBError::Query(e.to_string()))?;
12527                    let attachment = match &p {
12528                        GrantPrincipal::User(uid) => {
12529                            crate::auth::store::PrincipalRef::User(uid.clone())
12530                        }
12531                        GrantPrincipal::Group(group) => {
12532                            crate::auth::store::PrincipalRef::Group(group.clone())
12533                        }
12534                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
12535                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
12536                        ),
12537                    };
12538                    auth_store
12539                        .attach_policy(attachment, &pid)
12540                        .map_err(|e| RedDBError::Query(e.to_string()))?;
12541                }
12542                applied += 1;
12543                tracing::info!(
12544                    target: "audit",
12545                    principal = %granter,
12546                    action = "grant",
12547                    "GRANT applied"
12548                );
12549            }
12550        }
12551
12552        self.invalidate_result_cache();
12553        Ok(RuntimeQueryResult::ok_message(
12554            query.to_string(),
12555            &format!("GRANT applied to {} target(s)", applied),
12556            "grant",
12557        ))
12558    }
12559
12560    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
12561    fn execute_revoke_statement(
12562        &self,
12563        query: &str,
12564        stmt: &crate::storage::query::ast::RevokeStmt,
12565    ) -> RedDBResult<RuntimeQueryResult> {
12566        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
12567        use crate::auth::UserId;
12568        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
12569
12570        let auth_store = self
12571            .inner
12572            .auth_store
12573            .read()
12574            .clone()
12575            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12576
12577        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
12578            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
12579        })?;
12580        let granter_role = grole;
12581
12582        let actions: Vec<Action> = if stmt.all {
12583            vec![Action::All]
12584        } else {
12585            stmt.actions
12586                .iter()
12587                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
12588                .collect()
12589        };
12590
12591        let mut total_removed = 0usize;
12592        for obj in &stmt.objects {
12593            let resource = match stmt.object_kind {
12594                GrantObjectKind::Table => Resource::Table {
12595                    schema: obj.schema.clone(),
12596                    table: obj.name.clone(),
12597                },
12598                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
12599                GrantObjectKind::Database => Resource::Database,
12600                GrantObjectKind::Function => Resource::Function {
12601                    schema: obj.schema.clone(),
12602                    name: obj.name.clone(),
12603                },
12604            };
12605            for principal in &stmt.principals {
12606                let p = match principal {
12607                    GrantPrincipalRef::Public => GrantPrincipal::Public,
12608                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
12609                    GrantPrincipalRef::User { tenant, name } => {
12610                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
12611                    }
12612                };
12613                let removed = auth_store
12614                    .revoke(granter_role, &p, &resource, &actions)
12615                    .map_err(|e| RedDBError::Query(e.to_string()))?;
12616                let _removed_policies =
12617                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
12618                total_removed += removed;
12619            }
12620        }
12621
12622        self.invalidate_result_cache();
12623        Ok(RuntimeQueryResult::ok_message(
12624            query.to_string(),
12625            &format!("REVOKE removed {} grant(s)", total_removed),
12626            "revoke",
12627        ))
12628    }
12629
12630    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
12631    fn execute_alter_user_statement(
12632        &self,
12633        query: &str,
12634        stmt: &crate::storage::query::ast::AlterUserStmt,
12635    ) -> RedDBResult<RuntimeQueryResult> {
12636        use crate::auth::privileges::UserAttributes;
12637        use crate::auth::UserId;
12638        use crate::storage::query::ast::AlterUserAttribute;
12639
12640        let auth_store = self
12641            .inner
12642            .auth_store
12643            .read()
12644            .clone()
12645            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12646
12647        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
12648            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
12649        })?;
12650        if grole != crate::auth::Role::Admin {
12651            return Err(RedDBError::Query(
12652                "ALTER USER requires Admin role".to_string(),
12653            ));
12654        }
12655
12656        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
12657
12658        // Apply attributes incrementally — each one reads the current
12659        // record, mutates the relevant field, writes back.
12660        let mut attrs = auth_store.user_attributes(&target);
12661        let mut enable_change: Option<bool> = None;
12662
12663        for a in &stmt.attributes {
12664            match a {
12665                AlterUserAttribute::ValidUntil(ts) => {
12666                    // Parse ISO-ish timestamp → ms since epoch. Fall
12667                    // back to integer-ms parsing for callers that pass
12668                    // `'1234567890123'`.
12669                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
12670                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
12671                    })?;
12672                    attrs.valid_until = Some(ms);
12673                }
12674                AlterUserAttribute::ConnectionLimit(n) => {
12675                    if *n < 0 {
12676                        return Err(RedDBError::Query(
12677                            "CONNECTION LIMIT must be non-negative".to_string(),
12678                        ));
12679                    }
12680                    attrs.connection_limit = Some(*n as u32);
12681                }
12682                AlterUserAttribute::SetSearchPath(p) => {
12683                    attrs.search_path = Some(p.clone());
12684                }
12685                AlterUserAttribute::AddGroup(g) => {
12686                    if !attrs.groups.iter().any(|existing| existing == g) {
12687                        attrs.groups.push(g.clone());
12688                        attrs.groups.sort();
12689                    }
12690                }
12691                AlterUserAttribute::DropGroup(g) => {
12692                    attrs.groups.retain(|existing| existing != g);
12693                }
12694                AlterUserAttribute::Enable => enable_change = Some(true),
12695                AlterUserAttribute::Disable => enable_change = Some(false),
12696                AlterUserAttribute::Password(_) => {
12697                    // Out of scope — accept the AST but no-op so the
12698                    // parser stays compatible with future password
12699                    // rotation work.
12700                }
12701            }
12702        }
12703
12704        auth_store
12705            .set_user_attributes(&target, attrs)
12706            .map_err(|e| RedDBError::Query(e.to_string()))?;
12707        if let Some(en) = enable_change {
12708            auth_store
12709                .set_user_enabled(&target, en)
12710                .map_err(|e| RedDBError::Query(e.to_string()))?;
12711        }
12712        self.invalidate_result_cache();
12713        tracing::info!(
12714            target: "audit",
12715            principal = %target,
12716            action = "alter_user",
12717            "ALTER USER applied"
12718        );
12719
12720        Ok(RuntimeQueryResult::ok_message(
12721            query.to_string(),
12722            &format!("ALTER USER {} applied", target),
12723            "alter_user",
12724        ))
12725    }
12726
12727    // -----------------------------------------------------------------
12728    // IAM policy executors
12729    // -----------------------------------------------------------------
12730
12731    fn execute_create_iam_policy(
12732        &self,
12733        query: &str,
12734        id: &str,
12735        json: &str,
12736    ) -> RedDBResult<RuntimeQueryResult> {
12737        use crate::auth::policies::Policy;
12738
12739        let auth_store = self
12740            .inner
12741            .auth_store
12742            .read()
12743            .clone()
12744            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12745
12746        // Parse + validate. The kernel rejects oversize / bad shape /
12747        // bad action keywords. If the supplied id differs from the JSON
12748        // id, override it with the SQL-provided id (the JSON id is
12749        // optional context — the SQL DDL form is authoritative).
12750        let mut policy = Policy::from_json_str(json)
12751            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
12752        if policy.id != id {
12753            policy.id = id.to_string();
12754        }
12755        let pid = policy.id.clone();
12756        let tenant = current_tenant();
12757        let (actor_name, actor_role) = current_auth_identity()
12758            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
12759        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
12760        let eval_ctx = runtime_iam_context(
12761            actor_role,
12762            tenant.as_deref(),
12763            auth_store.principal_is_system_owned(&actor),
12764        );
12765        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
12766        let ledger = self.inner.control_event_ledger.read();
12767        let control = crate::auth::store::PolicyMutationControl {
12768            ctx: &event_ctx,
12769            ledger: ledger.as_ref(),
12770            config: self.inner.control_event_config,
12771            registry: Some(self.inner.config_registry.as_ref()),
12772            actor: &actor,
12773            eval_ctx: &eval_ctx,
12774        };
12775        auth_store
12776            .put_policy_with_control_events(policy, &control)
12777            .map_err(|e| RedDBError::Query(e.to_string()))?;
12778
12779        let principal = actor_name;
12780        tracing::info!(
12781            target: "audit",
12782            principal = %principal,
12783            action = "iam:policy.put",
12784            matched_policy_id = %pid,
12785            "CREATE POLICY applied"
12786        );
12787        self.inner.audit_log.record(
12788            "iam/policy.put",
12789            &principal,
12790            &pid,
12791            "ok",
12792            crate::json::Value::Null,
12793        );
12794
12795        self.invalidate_result_cache();
12796        Ok(RuntimeQueryResult::ok_message(
12797            query.to_string(),
12798            &format!("policy `{pid}` stored"),
12799            "create_iam_policy",
12800        ))
12801    }
12802
12803    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
12804        let auth_store = self
12805            .inner
12806            .auth_store
12807            .read()
12808            .clone()
12809            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12810        let tenant = current_tenant();
12811        let (actor_name, actor_role) = current_auth_identity()
12812            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
12813        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
12814        let eval_ctx = runtime_iam_context(
12815            actor_role,
12816            tenant.as_deref(),
12817            auth_store.principal_is_system_owned(&actor),
12818        );
12819        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
12820        let ledger = self.inner.control_event_ledger.read();
12821        let control = crate::auth::store::PolicyMutationControl {
12822            ctx: &event_ctx,
12823            ledger: ledger.as_ref(),
12824            config: self.inner.control_event_config,
12825            registry: Some(self.inner.config_registry.as_ref()),
12826            actor: &actor,
12827            eval_ctx: &eval_ctx,
12828        };
12829        auth_store
12830            .delete_policy_with_control_events(id, &control)
12831            .map_err(|e| RedDBError::Query(e.to_string()))?;
12832
12833        let principal = actor_name;
12834        tracing::info!(
12835            target: "audit",
12836            principal = %principal,
12837            action = "iam:policy.drop",
12838            matched_policy_id = %id,
12839            "DROP POLICY applied"
12840        );
12841        self.inner.audit_log.record(
12842            "iam/policy.drop",
12843            &principal,
12844            id,
12845            "ok",
12846            crate::json::Value::Null,
12847        );
12848
12849        self.invalidate_result_cache();
12850        Ok(RuntimeQueryResult::ok_message(
12851            query.to_string(),
12852            &format!("policy `{id}` dropped"),
12853            "drop_iam_policy",
12854        ))
12855    }
12856
12857    fn execute_attach_policy(
12858        &self,
12859        query: &str,
12860        policy_id: &str,
12861        principal: &crate::storage::query::ast::PolicyPrincipalRef,
12862    ) -> RedDBResult<RuntimeQueryResult> {
12863        use crate::auth::store::PrincipalRef;
12864        use crate::auth::UserId;
12865        use crate::storage::query::ast::PolicyPrincipalRef;
12866
12867        let auth_store = self
12868            .inner
12869            .auth_store
12870            .read()
12871            .clone()
12872            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12873        let p = match principal {
12874            PolicyPrincipalRef::User(u) => {
12875                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
12876            }
12877            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
12878        };
12879        let pretty_target = principal_label(principal);
12880        let tenant = current_tenant();
12881        let (actor_name, actor_role) = current_auth_identity()
12882            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
12883        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
12884        let eval_ctx = runtime_iam_context(
12885            actor_role,
12886            tenant.as_deref(),
12887            auth_store.principal_is_system_owned(&actor),
12888        );
12889        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
12890        let ledger = self.inner.control_event_ledger.read();
12891        let control = crate::auth::store::PolicyMutationControl {
12892            ctx: &event_ctx,
12893            ledger: ledger.as_ref(),
12894            config: self.inner.control_event_config,
12895            registry: Some(self.inner.config_registry.as_ref()),
12896            actor: &actor,
12897            eval_ctx: &eval_ctx,
12898        };
12899        auth_store
12900            .attach_policy_with_control_events(p, policy_id, &control)
12901            .map_err(|e| RedDBError::Query(e.to_string()))?;
12902
12903        let principal_str = actor_name;
12904        tracing::info!(
12905            target: "audit",
12906            principal = %principal_str,
12907            action = "iam:policy.attach",
12908            matched_policy_id = %policy_id,
12909            target = %pretty_target,
12910            "ATTACH POLICY applied"
12911        );
12912        self.inner.audit_log.record(
12913            "iam/policy.attach",
12914            &principal_str,
12915            &pretty_target,
12916            "ok",
12917            crate::json::Value::Null,
12918        );
12919
12920        self.invalidate_result_cache();
12921        Ok(RuntimeQueryResult::ok_message(
12922            query.to_string(),
12923            &format!("policy `{policy_id}` attached to {pretty_target}"),
12924            "attach_policy",
12925        ))
12926    }
12927
12928    fn execute_detach_policy(
12929        &self,
12930        query: &str,
12931        policy_id: &str,
12932        principal: &crate::storage::query::ast::PolicyPrincipalRef,
12933    ) -> RedDBResult<RuntimeQueryResult> {
12934        use crate::auth::store::PrincipalRef;
12935        use crate::auth::UserId;
12936        use crate::storage::query::ast::PolicyPrincipalRef;
12937
12938        let auth_store = self
12939            .inner
12940            .auth_store
12941            .read()
12942            .clone()
12943            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12944        let p = match principal {
12945            PolicyPrincipalRef::User(u) => {
12946                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
12947            }
12948            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
12949        };
12950        let pretty_target = principal_label(principal);
12951        let tenant = current_tenant();
12952        let (actor_name, actor_role) = current_auth_identity()
12953            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
12954        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
12955        let eval_ctx = runtime_iam_context(
12956            actor_role,
12957            tenant.as_deref(),
12958            auth_store.principal_is_system_owned(&actor),
12959        );
12960        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
12961        let ledger = self.inner.control_event_ledger.read();
12962        let control = crate::auth::store::PolicyMutationControl {
12963            ctx: &event_ctx,
12964            ledger: ledger.as_ref(),
12965            config: self.inner.control_event_config,
12966            registry: Some(self.inner.config_registry.as_ref()),
12967            actor: &actor,
12968            eval_ctx: &eval_ctx,
12969        };
12970        auth_store
12971            .detach_policy_with_control_events(p, policy_id, &control)
12972            .map_err(|e| RedDBError::Query(e.to_string()))?;
12973
12974        let principal_str = actor_name;
12975        tracing::info!(
12976            target: "audit",
12977            principal = %principal_str,
12978            action = "iam:policy.detach",
12979            matched_policy_id = %policy_id,
12980            target = %pretty_target,
12981            "DETACH POLICY applied"
12982        );
12983        self.inner.audit_log.record(
12984            "iam/policy.detach",
12985            &principal_str,
12986            &pretty_target,
12987            "ok",
12988            crate::json::Value::Null,
12989        );
12990
12991        self.invalidate_result_cache();
12992        Ok(RuntimeQueryResult::ok_message(
12993            query.to_string(),
12994            &format!("policy `{policy_id}` detached from {pretty_target}"),
12995            "detach_policy",
12996        ))
12997    }
12998
12999    fn execute_show_policies(
13000        &self,
13001        query: &str,
13002        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
13003    ) -> RedDBResult<RuntimeQueryResult> {
13004        use crate::auth::UserId;
13005        use crate::storage::query::ast::PolicyPrincipalRef;
13006        use crate::storage::query::unified::UnifiedRecord;
13007        use crate::storage::schema::Value as SchemaValue;
13008        use std::sync::Arc;
13009
13010        let auth_store = self
13011            .inner
13012            .auth_store
13013            .read()
13014            .clone()
13015            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13016
13017        let pols = match filter {
13018            None => auth_store.list_policies(),
13019            Some(PolicyPrincipalRef::User(u)) => {
13020                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
13021                auth_store.effective_policies(&id)
13022            }
13023            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
13024        };
13025
13026        let mut records = Vec::with_capacity(pols.len() + 1);
13027
13028        // Header row (#712 / S5A): synthetic record at index 0 that
13029        // reports the active PolicyEnforcementMode and the hard-cutover
13030        // version, so an operator running SHOW POLICIES can see the
13031        // current posture without a separate command.
13032        let mode = auth_store.enforcement_mode();
13033        let mut header = UnifiedRecord::default();
13034        header.set_arc(
13035            Arc::from("id"),
13036            SchemaValue::text("<enforcement_mode>".to_string()),
13037        );
13038        header.set_arc(Arc::from("statements"), SchemaValue::Integer(0));
13039        header.set_arc(Arc::from("tenant"), SchemaValue::Null);
13040        let header_json = format!(
13041            r#"{{"enforcement_mode":"{}","policy_only_hard_version":"{}"}}"#,
13042            mode.as_str(),
13043            crate::auth::enforcement_mode::POLICY_ONLY_HARD_VERSION
13044        );
13045        header.set_arc(Arc::from("json"), SchemaValue::text(header_json));
13046        records.push(header);
13047
13048        for p in pols.iter() {
13049            let mut rec = UnifiedRecord::default();
13050            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
13051            rec.set_arc(
13052                Arc::from("statements"),
13053                SchemaValue::Integer(p.statements.len() as i64),
13054            );
13055            rec.set_arc(
13056                Arc::from("tenant"),
13057                p.tenant
13058                    .as_deref()
13059                    .map(|t| SchemaValue::text(t.to_string()))
13060                    .unwrap_or(SchemaValue::Null),
13061            );
13062            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
13063            records.push(rec);
13064        }
13065        let mut result = crate::storage::query::unified::UnifiedResult::empty();
13066        result.records = records;
13067        Ok(RuntimeQueryResult {
13068            query: query.to_string(),
13069            mode: crate::storage::query::modes::QueryMode::Sql,
13070            statement: "show_policies",
13071            engine: "iam-policies",
13072            result,
13073            affected_rows: 0,
13074            statement_type: "select",
13075            bookmark: None,
13076        })
13077    }
13078
13079    fn execute_show_effective_permissions(
13080        &self,
13081        query: &str,
13082        user: &crate::storage::query::ast::PolicyUserRef,
13083        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
13084    ) -> RedDBResult<RuntimeQueryResult> {
13085        use crate::auth::UserId;
13086        use crate::storage::query::unified::UnifiedRecord;
13087        use crate::storage::schema::Value as SchemaValue;
13088        use std::sync::Arc;
13089
13090        let auth_store = self
13091            .inner
13092            .auth_store
13093            .read()
13094            .clone()
13095            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13096        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
13097        let pols = auth_store.effective_policies(&id);
13098
13099        // Show one row per (policy, statement) tuple, plus any
13100        // resource-level filter passed by the caller.
13101        let mut records = Vec::new();
13102        for p in pols.iter() {
13103            for (idx, st) in p.statements.iter().enumerate() {
13104                if let Some(_r) = resource {
13105                    // Naive filter: render statement targets to strings
13106                    // and skip if no match. Conservative default = include
13107                    // (the simulator handles fine-grained matching).
13108                }
13109                let mut rec = UnifiedRecord::default();
13110                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
13111                rec.set_arc(
13112                    Arc::from("statement_index"),
13113                    SchemaValue::Integer(idx as i64),
13114                );
13115                rec.set_arc(
13116                    Arc::from("sid"),
13117                    st.sid
13118                        .as_deref()
13119                        .map(|s| SchemaValue::text(s.to_string()))
13120                        .unwrap_or(SchemaValue::Null),
13121                );
13122                rec.set_arc(
13123                    Arc::from("effect"),
13124                    SchemaValue::text(match st.effect {
13125                        crate::auth::policies::Effect::Allow => "allow",
13126                        crate::auth::policies::Effect::Deny => "deny",
13127                    }),
13128                );
13129                rec.set_arc(
13130                    Arc::from("actions"),
13131                    SchemaValue::Integer(st.actions.len() as i64),
13132                );
13133                rec.set_arc(
13134                    Arc::from("resources"),
13135                    SchemaValue::Integer(st.resources.len() as i64),
13136                );
13137                records.push(rec);
13138            }
13139        }
13140        let mut result = crate::storage::query::unified::UnifiedResult::empty();
13141        result.records = records;
13142        Ok(RuntimeQueryResult {
13143            query: query.to_string(),
13144            mode: crate::storage::query::modes::QueryMode::Sql,
13145            statement: "show_effective_permissions",
13146            engine: "iam-policies",
13147            result,
13148            affected_rows: 0,
13149            statement_type: "select",
13150            bookmark: None,
13151        })
13152    }
13153
13154    fn execute_lint_policy(
13155        &self,
13156        query: &str,
13157        source: &crate::storage::query::ast::LintPolicySource,
13158    ) -> RedDBResult<RuntimeQueryResult> {
13159        use crate::auth::policy_linter::lint;
13160        use crate::storage::query::ast::LintPolicySource;
13161        use crate::storage::query::unified::UnifiedRecord;
13162        use crate::storage::schema::Value as SchemaValue;
13163        use std::sync::Arc;
13164
13165        // Resolve the policy text. `JSON` source lints the literal
13166        // verbatim; `Id` source fetches the stored document so
13167        // operators can lint a policy by name without rebuilding the
13168        // JSON from `SHOW POLICY`.
13169        let policy_text = match source {
13170            LintPolicySource::Json(text) => text.clone(),
13171            LintPolicySource::Id(id) => {
13172                let auth_store =
13173                    self.inner.auth_store.read().clone().ok_or_else(|| {
13174                        RedDBError::Query("auth store not configured".to_string())
13175                    })?;
13176                let policy = auth_store
13177                    .get_policy(id)
13178                    .ok_or_else(|| RedDBError::Query(format!("policy `{id}` not found")))?;
13179                policy.to_json_string()
13180            }
13181        };
13182        let diagnostics = lint(&policy_text);
13183
13184        let principal_str = current_auth_identity()
13185            .map(|(u, _)| u)
13186            .unwrap_or_else(|| "anonymous".into());
13187        tracing::info!(
13188            target: "audit",
13189            principal = %principal_str,
13190            action = "iam:policy.lint",
13191            diagnostic_count = diagnostics.len(),
13192            "LINT POLICY issued"
13193        );
13194        self.inner.audit_log.record(
13195            "iam/policy.lint",
13196            &principal_str,
13197            match source {
13198                LintPolicySource::Id(id) => id.as_str(),
13199                LintPolicySource::Json(_) => "<json>",
13200            },
13201            "ok",
13202            crate::json::Value::Null,
13203        );
13204
13205        // One row per diagnostic. Column order matches the HTTP
13206        // surface's JSON keys so the two contracts line up.
13207        const COLUMNS: [&str; 5] = ["severity", "code", "message", "suggested_fix", "location"];
13208        let schema = Arc::new(
13209            COLUMNS
13210                .iter()
13211                .map(|name| Arc::<str>::from(*name))
13212                .collect::<Vec<_>>(),
13213        );
13214        let records: Vec<UnifiedRecord> = diagnostics
13215            .iter()
13216            .map(|d| {
13217                UnifiedRecord::with_schema(
13218                    Arc::clone(&schema),
13219                    vec![
13220                        SchemaValue::text(d.severity.as_str()),
13221                        SchemaValue::text(d.code.as_str()),
13222                        SchemaValue::text(d.message.clone()),
13223                        d.suggested_fix
13224                            .as_deref()
13225                            .map(SchemaValue::text)
13226                            .unwrap_or(SchemaValue::Null),
13227                        d.location
13228                            .as_deref()
13229                            .map(SchemaValue::text)
13230                            .unwrap_or(SchemaValue::Null),
13231                    ],
13232                )
13233            })
13234            .collect();
13235        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
13236            COLUMNS.iter().map(|c| c.to_string()).collect(),
13237        );
13238        result.records = records;
13239        Ok(RuntimeQueryResult {
13240            query: query.to_string(),
13241            mode: crate::storage::query::modes::QueryMode::Sql,
13242            statement: "lint_policy",
13243            engine: "iam-policies",
13244            result,
13245            affected_rows: 0,
13246            statement_type: "select",
13247            bookmark: None,
13248        })
13249    }
13250
13251    /// `MIGRATE POLICY MODE TO '<target>' [DRY RUN]` — flip the install
13252    /// from `legacy_rbac` to `policy_only` after the pre-flight delta
13253    /// simulator confirms no non-admin principal would lose access.
13254    /// Issue #714.
13255    fn execute_migrate_policy_mode(
13256        &self,
13257        query: &str,
13258        target: &str,
13259        dry_run: bool,
13260    ) -> RedDBResult<RuntimeQueryResult> {
13261        use crate::auth::enforcement_mode::PolicyEnforcementMode;
13262        use crate::auth::migrate_policy_mode::{
13263            principal_label, simulate_migration_delta, MigratePolicyDelta,
13264        };
13265        use crate::auth::policies::ResourceRef;
13266        use crate::storage::query::unified::UnifiedRecord;
13267        use crate::storage::schema::Value as SchemaValue;
13268        use std::sync::Arc;
13269
13270        // Only `policy_only` is a meaningful destination for this
13271        // command — flipping back to `legacy_rbac` is supported via
13272        // direct config writes (it doesn't need a pre-flight). We
13273        // reject everything else with the same allowlist `parse` uses.
13274        let parsed = PolicyEnforcementMode::parse(target).ok_or_else(|| {
13275            RedDBError::Query(format!(
13276                "MIGRATE POLICY MODE: invalid target `{target}` (expected `policy_only`)"
13277            ))
13278        })?;
13279        if parsed != PolicyEnforcementMode::PolicyOnly {
13280            return Err(RedDBError::Query(format!(
13281                "MIGRATE POLICY MODE: target `{target}` is not supported — only `policy_only` may be migrated to via this command"
13282            )));
13283        }
13284
13285        let auth_store = self
13286            .inner
13287            .auth_store
13288            .read()
13289            .clone()
13290            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13291
13292        // Resource enumeration: every existing collection probed as
13293        // `table:<name>`. This is the realistic resource surface for
13294        // the legacy_rbac fallback (the role floors gate per-table
13295        // actions). Wildcard / column-scoped resources are still
13296        // covered by the policy evaluator because evaluate() resolves
13297        // resource patterns relative to the concrete resources we
13298        // probe here.
13299        let snapshot = self.inner.db.catalog_model_snapshot();
13300        let resources: Vec<ResourceRef> = snapshot
13301            .collections
13302            .iter()
13303            .map(|c| ResourceRef::new("table", c.name.clone()))
13304            .collect();
13305
13306        let now_ms = crate::utils::now_unix_millis() as u128;
13307        let deltas: Vec<MigratePolicyDelta> =
13308            simulate_migration_delta(auth_store.as_ref(), &resources, now_ms);
13309
13310        let principal_str = current_auth_identity()
13311            .map(|(u, _)| u)
13312            .unwrap_or_else(|| "anonymous".into());
13313
13314        // Audit every issuance. The outcome line differentiates
13315        // dry-run, refused, and applied — operators can grep for these
13316        // strings in the audit log.
13317        let outcome_str = if dry_run {
13318            "dry_run"
13319        } else if deltas.is_empty() {
13320            "applied"
13321        } else {
13322            "refused"
13323        };
13324        tracing::info!(
13325            target: "audit",
13326            principal = %principal_str,
13327            action = "iam:policy.migrate_mode",
13328            target = %target,
13329            dry_run,
13330            delta_count = deltas.len(),
13331            outcome = outcome_str,
13332            "MIGRATE POLICY MODE issued"
13333        );
13334        self.inner.audit_log.record(
13335            "iam/policy.migrate_mode",
13336            &principal_str,
13337            target,
13338            outcome_str,
13339            crate::json::Value::Null,
13340        );
13341
13342        // Refuse the non-dry-run path when any principal would lose
13343        // access. The error string carries a compact summary plus the
13344        // delta count so operators can re-run with DRY RUN to inspect.
13345        if !dry_run && !deltas.is_empty() {
13346            let summary = deltas
13347                .iter()
13348                .take(5)
13349                .map(|d| {
13350                    format!(
13351                        "{}:{}/{}:{}",
13352                        principal_label(&d.principal),
13353                        d.action,
13354                        d.resource_kind,
13355                        d.resource_name
13356                    )
13357                })
13358                .collect::<Vec<_>>()
13359                .join(", ");
13360            let more = if deltas.len() > 5 {
13361                format!(" (and {} more)", deltas.len() - 5)
13362            } else {
13363                String::new()
13364            };
13365            return Err(RedDBError::Query(format!(
13366                "MIGRATE POLICY MODE refused: {n} principal/action/resource pair(s) would lose access under `policy_only`. Run `MIGRATE POLICY MODE TO '{target}' DRY RUN` to inspect. Sample: {summary}{more}",
13367                n = deltas.len(),
13368            )));
13369        }
13370
13371        // Mutate the live enforcement mode only on the non-dry-run
13372        // path with an empty delta. `set_enforcement_mode` also
13373        // persists to vault_kv so the new mode survives restart.
13374        if !dry_run {
13375            auth_store.set_enforcement_mode(parsed);
13376        }
13377
13378        const COLUMNS: [&str; 5] = [
13379            "principal",
13380            "role",
13381            "action",
13382            "resource_kind",
13383            "resource_name",
13384        ];
13385        let schema = Arc::new(
13386            COLUMNS
13387                .iter()
13388                .map(|name| Arc::<str>::from(*name))
13389                .collect::<Vec<_>>(),
13390        );
13391        let records: Vec<UnifiedRecord> = deltas
13392            .iter()
13393            .map(|d| {
13394                UnifiedRecord::with_schema(
13395                    Arc::clone(&schema),
13396                    vec![
13397                        SchemaValue::text(principal_label(&d.principal)),
13398                        SchemaValue::text(d.role.as_str()),
13399                        SchemaValue::text(d.action.clone()),
13400                        SchemaValue::text(d.resource_kind.clone()),
13401                        SchemaValue::text(d.resource_name.clone()),
13402                    ],
13403                )
13404            })
13405            .collect();
13406        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
13407            COLUMNS.iter().map(|c| c.to_string()).collect(),
13408        );
13409        result.records = records;
13410        Ok(RuntimeQueryResult {
13411            query: query.to_string(),
13412            mode: crate::storage::query::modes::QueryMode::Sql,
13413            statement: "migrate_policy_mode",
13414            engine: "iam-policies",
13415            result,
13416            affected_rows: 0,
13417            statement_type: "select",
13418            bookmark: None,
13419        })
13420    }
13421
13422    fn execute_simulate_policy(
13423        &self,
13424        query: &str,
13425        user: &crate::storage::query::ast::PolicyUserRef,
13426        action: &str,
13427        resource: &crate::storage::query::ast::PolicyResourceRef,
13428    ) -> RedDBResult<RuntimeQueryResult> {
13429        use crate::auth::policies::ResourceRef;
13430        use crate::auth::store::SimCtx;
13431        use crate::auth::UserId;
13432        use crate::storage::query::unified::UnifiedRecord;
13433        use crate::storage::schema::Value as SchemaValue;
13434        use std::sync::Arc;
13435
13436        let auth_store = self
13437            .inner
13438            .auth_store
13439            .read()
13440            .clone()
13441            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13442        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
13443        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
13444        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
13445
13446        let principal_str = current_auth_identity()
13447            .map(|(u, _)| u)
13448            .unwrap_or_else(|| "anonymous".into());
13449        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
13450        tracing::info!(
13451            target: "audit",
13452            principal = %principal_str,
13453            action = "iam:policy.simulate",
13454            decision = %decision_str,
13455            matched_policy_id = ?matched_pid,
13456            matched_sid = ?matched_sid,
13457            "SIMULATE issued"
13458        );
13459        self.inner.audit_log.record(
13460            "iam/policy.simulate",
13461            &principal_str,
13462            &id.to_string(),
13463            "ok",
13464            crate::json::Value::Null,
13465        );
13466
13467        let mut rec = UnifiedRecord::default();
13468        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
13469        rec.set_arc(
13470            Arc::from("matched_policy_id"),
13471            matched_pid
13472                .map(SchemaValue::text)
13473                .unwrap_or(SchemaValue::Null),
13474        );
13475        rec.set_arc(
13476            Arc::from("matched_sid"),
13477            matched_sid
13478                .map(SchemaValue::text)
13479                .unwrap_or(SchemaValue::Null),
13480        );
13481        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
13482        rec.set_arc(
13483            Arc::from("trail_len"),
13484            SchemaValue::Integer(outcome.trail.len() as i64),
13485        );
13486        let mut result = crate::storage::query::unified::UnifiedResult::empty();
13487        result.records = vec![rec];
13488        Ok(RuntimeQueryResult {
13489            query: query.to_string(),
13490            mode: crate::storage::query::modes::QueryMode::Sql,
13491            statement: "simulate_policy",
13492            engine: "iam-policies",
13493            result,
13494            affected_rows: 0,
13495            statement_type: "select",
13496            bookmark: None,
13497        })
13498    }
13499}
13500
13501/// Translate a parsed GRANT into a synthetic IAM policy whose id
13502/// starts with `_grant_<unique>`. PUBLIC is represented as an
13503/// implicit IAM group; legacy GROUP grants are still rejected by the
13504/// grant store and are not translated here.
13505fn grant_to_iam_policy(
13506    principal: &crate::auth::privileges::GrantPrincipal,
13507    resource: &crate::auth::privileges::Resource,
13508    actions: &[crate::auth::privileges::Action],
13509    tenant: Option<&str>,
13510) -> Option<crate::auth::policies::Policy> {
13511    use crate::auth::policies::{
13512        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
13513    };
13514    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
13515
13516    if matches!(principal, GrantPrincipal::Group(_)) {
13517        return None;
13518    }
13519
13520    let now = crate::auth::now_ms();
13521    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
13522
13523    let resource_str = match resource {
13524        Resource::Database => "table:*".to_string(),
13525        Resource::Schema(s) => format!("table:{s}.*"),
13526        Resource::Table { schema, table } => match schema {
13527            Some(s) => format!("table:{s}.{table}"),
13528            None => format!("table:{table}"),
13529        },
13530        Resource::Function { schema, name } => match schema {
13531            Some(s) => format!("function:{s}.{name}"),
13532            None => format!("function:{name}"),
13533        },
13534    };
13535
13536    // Compile actions — fall back to `*` only when the grant included
13537    // `Action::All`. Map every other action keyword to its lowercase
13538    // form so it lines up with the kernel's allowlist.
13539    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
13540        vec![ActionPattern::Wildcard]
13541    } else {
13542        actions
13543            .iter()
13544            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
13545            .collect()
13546    };
13547    if action_patterns.is_empty() {
13548        return None;
13549    }
13550
13551    // Inline resource compilation matching the kernel's `compile_resource`:
13552    //   * `*` → wildcard
13553    //   * contains `*` → glob
13554    //   * `kind:name` → exact
13555    let resource_patterns = if resource_str == "*" {
13556        vec![ResourcePattern::Wildcard]
13557    } else if resource_str.contains('*') {
13558        vec![ResourcePattern::Glob(resource_str.clone())]
13559    } else if let Some((kind, name)) = resource_str.split_once(':') {
13560        vec![ResourcePattern::Exact {
13561            kind: kind.to_string(),
13562            name: name.to_string(),
13563        }]
13564    } else {
13565        vec![ResourcePattern::Wildcard]
13566    };
13567
13568    let policy = Policy {
13569        id,
13570        version: 1,
13571        tenant: tenant.map(|t| t.to_string()),
13572        created_at: now,
13573        updated_at: now,
13574        statements: vec![Statement {
13575            sid: None,
13576            effect: Effect::Allow,
13577            actions: action_patterns,
13578            resources: resource_patterns,
13579            condition: None,
13580        }],
13581    };
13582    if policy.validate().is_err() {
13583        return None;
13584    }
13585    Some(policy)
13586}
13587
13588/// Coerce a `key => <number>` table-function named argument into a positive
13589/// iteration count for the centrality TVFs (issue #797). The parser lexes all
13590/// named values as `f64`, so an integral, finite, strictly-positive value is
13591/// required here; anything else (fractional, zero, negative, NaN/inf) is a
13592/// clear query error. `func` names the function for the message.
13593fn parse_positive_iterations(func: &str, value: &f64) -> RedDBResult<usize> {
13594    if !value.is_finite() || *value < 1.0 || value.fract() != 0.0 {
13595        return Err(RedDBError::Query(format!(
13596            "table function '{func}' max_iterations must be a positive integer, got {value}"
13597        )));
13598    }
13599    Ok(*value as usize)
13600}
13601
13602fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
13603    use crate::auth::privileges::Action;
13604    match action {
13605        Action::Select => "select",
13606        Action::Insert => "insert",
13607        Action::Update => "update",
13608        Action::Delete => "delete",
13609        Action::Truncate => "truncate",
13610        Action::References => "references",
13611        Action::Execute => "execute",
13612        Action::Usage => "usage",
13613        Action::All => "*",
13614    }
13615}
13616
13617fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
13618    let mut columns = Vec::new();
13619    for (column, _) in &query.assignment_exprs {
13620        if !columns.iter().any(|seen| seen == column) {
13621            columns.push(column.clone());
13622        }
13623    }
13624    columns
13625}
13626
13627fn column_access_request_for_table_update(
13628    table_name: &str,
13629    columns: Vec<String>,
13630) -> crate::auth::ColumnAccessRequest {
13631    match table_name.split_once('.') {
13632        Some((schema, table)) => {
13633            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
13634                .with_schema(schema.to_string())
13635        }
13636        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
13637    }
13638}
13639
13640fn column_access_request_for_table_select(
13641    table_name: &str,
13642    columns: Vec<String>,
13643) -> crate::auth::ColumnAccessRequest {
13644    match table_name.split_once('.') {
13645        Some((schema, table)) => {
13646            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
13647                .with_schema(schema.to_string())
13648        }
13649        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
13650    }
13651}
13652
13653fn update_returning_columns_for_policy(
13654    runtime: &RedDBRuntime,
13655    query: &crate::storage::query::ast::UpdateQuery,
13656) -> Option<Vec<String>> {
13657    let items = query.returning.as_ref()?;
13658    let mut columns = Vec::new();
13659    let project_all = items
13660        .iter()
13661        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
13662    if project_all {
13663        collect_returning_star_columns(runtime, query, &mut columns);
13664    } else {
13665        for item in items {
13666            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
13667                continue;
13668            };
13669            push_returning_policy_column(&mut columns, column);
13670        }
13671    }
13672    (!columns.is_empty()).then_some(columns)
13673}
13674
13675fn collect_returning_star_columns(
13676    runtime: &RedDBRuntime,
13677    query: &crate::storage::query::ast::UpdateQuery,
13678    columns: &mut Vec<String>,
13679) {
13680    let store = runtime.db().store();
13681    let Some(manager) = store.get_collection(&query.table) else {
13682        return;
13683    };
13684    if let Some(schema) = manager.column_schema() {
13685        for column in schema.iter() {
13686            push_returning_policy_column(columns, column);
13687        }
13688    }
13689    for entity in manager.query_all(|_| true) {
13690        if !returning_entity_matches_update_target(&entity, query.target) {
13691            continue;
13692        }
13693        match &entity.data {
13694            crate::storage::EntityData::Row(row) => {
13695                for (column, _) in row.iter_fields() {
13696                    push_returning_policy_column(columns, column);
13697                }
13698            }
13699            crate::storage::EntityData::Node(node) => {
13700                push_returning_policy_column(columns, "label");
13701                push_returning_policy_column(columns, "node_type");
13702                for column in node.properties.keys() {
13703                    push_returning_policy_column(columns, column);
13704                }
13705            }
13706            crate::storage::EntityData::Edge(edge) => {
13707                push_returning_policy_column(columns, "label");
13708                push_returning_policy_column(columns, "from_rid");
13709                push_returning_policy_column(columns, "to_rid");
13710                push_returning_policy_column(columns, "weight");
13711                for column in edge.properties.keys() {
13712                    push_returning_policy_column(columns, column);
13713                }
13714            }
13715            _ => {}
13716        }
13717    }
13718}
13719
13720fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
13721    if returning_public_envelope_column(column) {
13722        return;
13723    }
13724    if !columns.iter().any(|seen| seen == column) {
13725        columns.push(column.to_string());
13726    }
13727}
13728
/// Reports whether `column` is one of the public envelope columns that
/// RETURNING policy collection skips: `rid`, `collection`, `kind`,
/// `tenant`, `created_at`, `updated_at` or `red_entity_id`.
///
/// The comparison ignores ASCII case and is done in place, so no
/// lowercased copy of `column` is allocated per call.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 7] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
        "red_entity_id",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|name| column.eq_ignore_ascii_case(name))
}
13735
13736fn returning_entity_matches_update_target(
13737    entity: &crate::storage::UnifiedEntity,
13738    target: crate::storage::query::ast::UpdateTarget,
13739) -> bool {
13740    use crate::storage::query::ast::UpdateTarget;
13741    match target {
13742        UpdateTarget::Rows => {
13743            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
13744        }
13745        UpdateTarget::Documents => {
13746            matches!(
13747                returning_row_item_kind(entity),
13748                Some(ReturningRowKind::Document)
13749            )
13750        }
13751        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
13752        UpdateTarget::Nodes => matches!(
13753            (&entity.kind, &entity.data),
13754            (
13755                crate::storage::EntityKind::GraphNode(_),
13756                crate::storage::EntityData::Node(_)
13757            )
13758        ),
13759        UpdateTarget::Edges => matches!(
13760            (&entity.kind, &entity.data),
13761            (
13762                crate::storage::EntityKind::GraphEdge(_),
13763                crate::storage::EntityData::Edge(_)
13764            )
13765        ),
13766    }
13767}
13768
/// Classification of a row-shaped entity, as computed by
/// `returning_row_item_kind`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ReturningRowKind {
    /// A row that is neither key/value-shaped nor carries a JSON value.
    Row,
    /// A row with at least one JSON-valued field.
    Document,
    /// A row whose fields are all named `key` or `value` (ASCII
    /// case-insensitive).
    Kv,
}
13775
13776fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
13777    let row = entity.data.as_row()?;
13778    let is_kv = row.iter_fields().all(|(column, _)| {
13779        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
13780    });
13781    if is_kv {
13782        return Some(ReturningRowKind::Kv);
13783    }
13784    let is_document = row
13785        .iter_fields()
13786        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
13787    if is_document {
13788        Some(ReturningRowKind::Document)
13789    } else {
13790        Some(ReturningRowKind::Row)
13791    }
13792}
13793
13794fn requested_table_columns_for_policy(
13795    table: &crate::storage::query::ast::TableQuery,
13796) -> Vec<String> {
13797    use crate::storage::query::sql_lowering::{
13798        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
13799        effective_table_projections,
13800    };
13801
13802    let table_name = table.table.as_str();
13803    let table_alias = table.alias.as_deref();
13804    let mut columns = std::collections::BTreeSet::new();
13805
13806    for projection in effective_table_projections(table) {
13807        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
13808    }
13809    if let Some(filter) = effective_table_filter(table) {
13810        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
13811    }
13812    for expr in effective_table_group_by_exprs(table) {
13813        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
13814    }
13815    if let Some(filter) = effective_table_having_filter(table) {
13816        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
13817    }
13818    for order in &table.order_by {
13819        if let Some(expr) = order.expr.as_ref() {
13820            collect_expr_columns(expr, table_name, table_alias, &mut columns);
13821        } else {
13822            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
13823        }
13824    }
13825
13826    columns.into_iter().collect()
13827}
13828
13829fn collect_projection_columns(
13830    projection: &crate::storage::query::ast::Projection,
13831    table_name: &str,
13832    table_alias: Option<&str>,
13833    columns: &mut std::collections::BTreeSet<String>,
13834) {
13835    use crate::storage::query::ast::Projection;
13836    match projection {
13837        Projection::All => {
13838            columns.insert("*".to_string());
13839        }
13840        Projection::Column(column) | Projection::Alias(column, _) => {
13841            if column != "*" {
13842                columns.insert(column.clone());
13843            }
13844        }
13845        Projection::Function(_, args) => {
13846            for arg in args {
13847                collect_projection_columns(arg, table_name, table_alias, columns);
13848            }
13849        }
13850        Projection::Expression(filter, _) => {
13851            collect_filter_columns(filter, table_name, table_alias, columns);
13852        }
13853        Projection::Field(field, _) => {
13854            collect_field_ref_column(field, table_name, table_alias, columns);
13855        }
13856        // Slice 7a (#589): no runtime support yet; recurse into args so
13857        // any column references are still tracked in case a future
13858        // executor needs the column set.
13859        Projection::Window { args, .. } => {
13860            for arg in args {
13861                collect_projection_columns(arg, table_name, table_alias, columns);
13862            }
13863        }
13864    }
13865}
13866
13867fn collect_filter_columns(
13868    filter: &crate::storage::query::ast::Filter,
13869    table_name: &str,
13870    table_alias: Option<&str>,
13871    columns: &mut std::collections::BTreeSet<String>,
13872) {
13873    use crate::storage::query::ast::Filter;
13874    match filter {
13875        Filter::Compare { field, .. }
13876        | Filter::IsNull(field)
13877        | Filter::IsNotNull(field)
13878        | Filter::In { field, .. }
13879        | Filter::Between { field, .. }
13880        | Filter::Like { field, .. }
13881        | Filter::StartsWith { field, .. }
13882        | Filter::EndsWith { field, .. }
13883        | Filter::Contains { field, .. } => {
13884            collect_field_ref_column(field, table_name, table_alias, columns);
13885        }
13886        Filter::CompareFields { left, right, .. } => {
13887            collect_field_ref_column(left, table_name, table_alias, columns);
13888            collect_field_ref_column(right, table_name, table_alias, columns);
13889        }
13890        Filter::CompareExpr { lhs, rhs, .. } => {
13891            collect_expr_columns(lhs, table_name, table_alias, columns);
13892            collect_expr_columns(rhs, table_name, table_alias, columns);
13893        }
13894        Filter::And(left, right) | Filter::Or(left, right) => {
13895            collect_filter_columns(left, table_name, table_alias, columns);
13896            collect_filter_columns(right, table_name, table_alias, columns);
13897        }
13898        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
13899    }
13900}
13901
13902fn collect_expr_columns(
13903    expr: &crate::storage::query::ast::Expr,
13904    table_name: &str,
13905    table_alias: Option<&str>,
13906    columns: &mut std::collections::BTreeSet<String>,
13907) {
13908    use crate::storage::query::ast::Expr;
13909    match expr {
13910        Expr::Column { field, .. } => {
13911            collect_field_ref_column(field, table_name, table_alias, columns);
13912        }
13913        Expr::Literal { .. } | Expr::Parameter { .. } => {}
13914        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
13915            collect_expr_columns(operand, table_name, table_alias, columns);
13916        }
13917        Expr::BinaryOp { lhs, rhs, .. } => {
13918            collect_expr_columns(lhs, table_name, table_alias, columns);
13919            collect_expr_columns(rhs, table_name, table_alias, columns);
13920        }
13921        Expr::FunctionCall { args, .. } => {
13922            for arg in args {
13923                collect_expr_columns(arg, table_name, table_alias, columns);
13924            }
13925        }
13926        Expr::Case {
13927            branches, else_, ..
13928        } => {
13929            for (condition, value) in branches {
13930                collect_expr_columns(condition, table_name, table_alias, columns);
13931                collect_expr_columns(value, table_name, table_alias, columns);
13932            }
13933            if let Some(value) = else_ {
13934                collect_expr_columns(value, table_name, table_alias, columns);
13935            }
13936        }
13937        Expr::IsNull { operand, .. } => {
13938            collect_expr_columns(operand, table_name, table_alias, columns);
13939        }
13940        Expr::InList { target, values, .. } => {
13941            collect_expr_columns(target, table_name, table_alias, columns);
13942            for value in values {
13943                collect_expr_columns(value, table_name, table_alias, columns);
13944            }
13945        }
13946        Expr::Between {
13947            target, low, high, ..
13948        } => {
13949            collect_expr_columns(target, table_name, table_alias, columns);
13950            collect_expr_columns(low, table_name, table_alias, columns);
13951            collect_expr_columns(high, table_name, table_alias, columns);
13952        }
13953        Expr::Subquery { .. } => {}
13954        Expr::WindowFunctionCall { args, window, .. } => {
13955            for arg in args {
13956                collect_expr_columns(arg, table_name, table_alias, columns);
13957            }
13958            for e in &window.partition_by {
13959                collect_expr_columns(e, table_name, table_alias, columns);
13960            }
13961            for o in &window.order_by {
13962                collect_expr_columns(&o.expr, table_name, table_alias, columns);
13963            }
13964        }
13965    }
13966}
13967
13968fn collect_field_ref_column(
13969    field: &crate::storage::query::ast::FieldRef,
13970    table_name: &str,
13971    table_alias: Option<&str>,
13972    columns: &mut std::collections::BTreeSet<String>,
13973) {
13974    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
13975        if column != "*" {
13976            columns.insert(column);
13977        }
13978    }
13979}
13980
13981fn policy_column_name_from_field_ref(
13982    field: &crate::storage::query::ast::FieldRef,
13983    table_name: &str,
13984    table_alias: Option<&str>,
13985) -> Option<String> {
13986    match field {
13987        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
13988            if column == "*" {
13989                return Some("*".to_string());
13990            }
13991            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
13992                Some(column.clone())
13993            } else {
13994                Some(format!("{table}.{column}"))
13995            }
13996        }
13997        _ => None,
13998    }
13999}
14000
14001fn legacy_resource_to_iam(
14002    resource: &crate::auth::privileges::Resource,
14003    tenant: Option<&str>,
14004) -> crate::auth::policies::ResourceRef {
14005    use crate::auth::privileges::Resource;
14006
14007    let (kind, name) = match resource {
14008        Resource::Database => ("database".to_string(), "*".to_string()),
14009        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
14010        Resource::Table { schema, table } => (
14011            "table".to_string(),
14012            match schema {
14013                Some(s) => format!("{s}.{table}"),
14014                None => table.clone(),
14015            },
14016        ),
14017        Resource::Function { schema, name } => (
14018            "function".to_string(),
14019            match schema {
14020                Some(s) => format!("{s}.{name}"),
14021                None => name.clone(),
14022            },
14023        ),
14024    };
14025
14026    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
14027    if let Some(t) = tenant {
14028        out = out.with_tenant(t.to_string());
14029    }
14030    out
14031}
14032
/// One table-backed side of a join, as produced by `table_side_context`.
#[derive(Debug)]
struct JoinTableSide {
    /// Name of the table this side reads from.
    table: String,
    /// Alias the side is referred to by; `table_side_context` falls back
    /// to the table name when the query declares no alias.
    alias: String,
}
14038
14039fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
14040    match expr {
14041        QueryExpr::Table(table) => Some(JoinTableSide {
14042            table: table.table.clone(),
14043            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
14044        }),
14045        _ => None,
14046    }
14047}
14048
14049fn collect_projection_columns_for_table(
14050    projection: &Projection,
14051    table: &str,
14052    alias: Option<&str>,
14053    out: &mut BTreeSet<String>,
14054) {
14055    match projection {
14056        Projection::Column(column) | Projection::Alias(column, _) => {
14057            match split_qualified_column(column) {
14058                Some((qualifier, column))
14059                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
14060                {
14061                    push_policy_column(column, out);
14062                }
14063                Some(_) => {}
14064                None => push_policy_column(column, out),
14065            }
14066        }
14067        Projection::Field(
14068            FieldRef::TableColumn {
14069                table: qualifier,
14070                column,
14071            },
14072            _,
14073        ) => {
14074            if qualifier.is_empty()
14075                || qualifier == table
14076                || alias.is_some_and(|alias| qualifier == alias)
14077            {
14078                push_policy_column(column, out);
14079            }
14080        }
14081        Projection::Field(
14082            FieldRef::NodeProperty {
14083                alias: qualifier,
14084                property,
14085            },
14086            _,
14087        )
14088        | Projection::Field(
14089            FieldRef::EdgeProperty {
14090                alias: qualifier,
14091                property,
14092            },
14093            _,
14094        ) => {
14095            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
14096                push_policy_column(property, out);
14097            }
14098        }
14099        Projection::Function(_, args) => {
14100            for arg in args {
14101                collect_projection_columns_for_table(arg, table, alias, out);
14102            }
14103        }
14104        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
14105        Projection::Window { args, .. } => {
14106            for arg in args {
14107                collect_projection_columns_for_table(arg, table, alias, out);
14108            }
14109        }
14110    }
14111}
14112
14113fn collect_projection_columns_for_join_side(
14114    projection: &Projection,
14115    left: Option<&JoinTableSide>,
14116    right: Option<&JoinTableSide>,
14117    out: &mut HashMap<String, BTreeSet<String>>,
14118) -> RedDBResult<()> {
14119    match projection {
14120        Projection::Column(column) | Projection::Alias(column, _) => {
14121            if let Some((qualifier, column)) = split_qualified_column(column) {
14122                push_qualified_join_column(qualifier, column, left, right, out);
14123            } else {
14124                push_unqualified_join_column(column, left, right, out);
14125            }
14126        }
14127        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
14128            if table.is_empty() {
14129                push_unqualified_join_column(column, left, right, out);
14130            } else if let Some(side) = [left, right]
14131                .into_iter()
14132                .flatten()
14133                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
14134            {
14135                push_join_column(&side.table, column, out);
14136            }
14137        }
14138        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
14139        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
14140            push_qualified_join_column(alias, property, left, right, out);
14141        }
14142        Projection::Function(_, args) => {
14143            for arg in args {
14144                collect_projection_columns_for_join_side(arg, left, right, out)?;
14145            }
14146        }
14147        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
14148        Projection::Window { args, .. } => {
14149            for arg in args {
14150                collect_projection_columns_for_join_side(arg, left, right, out)?;
14151            }
14152        }
14153    }
14154    Ok(())
14155}
14156
/// Splits a `qualifier.column` string into its two parts.
///
/// Returns `None` unless the input contains exactly one `.` with a
/// non-empty segment on each side.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    let mut segments = column.split('.');
    match (segments.next(), segments.next(), segments.next()) {
        (Some(qualifier), Some(name), None) if !qualifier.is_empty() && !name.is_empty() => {
            Some((qualifier, name))
        }
        _ => None,
    }
}
14164
14165fn push_qualified_join_column(
14166    qualifier: &str,
14167    column: &str,
14168    left: Option<&JoinTableSide>,
14169    right: Option<&JoinTableSide>,
14170    out: &mut HashMap<String, BTreeSet<String>>,
14171) {
14172    if let Some(side) = [left, right]
14173        .into_iter()
14174        .flatten()
14175        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
14176    {
14177        push_join_column(&side.table, column, out);
14178    }
14179}
14180
14181fn push_unqualified_join_column(
14182    column: &str,
14183    left: Option<&JoinTableSide>,
14184    right: Option<&JoinTableSide>,
14185    out: &mut HashMap<String, BTreeSet<String>>,
14186) {
14187    for side in [left, right].into_iter().flatten() {
14188        push_join_column(&side.table, column, out);
14189    }
14190}
14191
14192fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
14193    if is_policy_column_name(column) {
14194        out.entry(table.to_string())
14195            .or_default()
14196            .insert(column.to_string());
14197    }
14198}
14199
14200fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
14201    if is_policy_column_name(column) {
14202        out.insert(column.to_string());
14203    }
14204}
14205
/// Reports whether `column` is a name that column policies apply to.
///
/// The empty string, the `*` wildcard, and names carrying the `LIT:` or
/// `TYPE:` prefix are rejected.
fn is_policy_column_name(column: &str) -> bool {
    match column {
        "" | "*" => false,
        name => !(name.starts_with("LIT:") || name.starts_with("TYPE:")),
    }
}
14212
14213fn runtime_iam_context(
14214    role: crate::auth::Role,
14215    tenant: Option<&str>,
14216    principal_is_system_owned: bool,
14217) -> crate::auth::policies::EvalContext {
14218    crate::auth::policies::EvalContext {
14219        principal_tenant: tenant.map(|t| t.to_string()),
14220        current_tenant: tenant.map(|t| t.to_string()),
14221        peer_ip: None,
14222        mfa_present: false,
14223        now_ms: crate::auth::now_ms(),
14224        principal_is_admin_role: role == crate::auth::Role::Admin,
14225        principal_is_system_owned,
14226        principal_is_platform_scoped: tenant.is_none(),
14227    }
14228}
14229
14230fn explicit_table_projection_columns(
14231    query: &crate::storage::query::ast::TableQuery,
14232) -> Vec<String> {
14233    use crate::storage::query::ast::{FieldRef, Projection};
14234
14235    let mut columns = Vec::new();
14236    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
14237        match projection {
14238            Projection::Column(column) | Projection::Alias(column, _) => {
14239                push_unique(&mut columns, column)
14240            }
14241            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
14242                push_unique(&mut columns, column)
14243            }
14244            // SELECT * and expression/function projections need the
14245            // executor-wide column-policy context mapped in
14246            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
14247            _ => {}
14248        }
14249    }
14250    columns
14251}
14252
14253fn explicit_graph_projection_properties(
14254    query: &crate::storage::query::ast::GraphQuery,
14255) -> Vec<String> {
14256    use crate::storage::query::ast::{FieldRef, Projection};
14257
14258    let mut columns = Vec::new();
14259    for projection in &query.return_ {
14260        match projection {
14261            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
14262            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
14263                push_unique(&mut columns, property.clone())
14264            }
14265            _ => {}
14266        }
14267    }
14268    columns
14269}
14270
/// Appends `column` to `columns` unless an equal entry is already
/// present, preserving insertion order.
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
14276
14277fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
14278    use crate::storage::query::ast::PolicyPrincipalRef;
14279    match p {
14280        PolicyPrincipalRef::User(u) => match &u.tenant {
14281            Some(t) => format!("user:{t}/{}", u.username),
14282            None => format!("user:{}", u.username),
14283        },
14284        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
14285    }
14286}
14287
14288/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
14289/// shape used by every audit emit + the simulator response.
14290pub(crate) fn decision_to_strings(
14291    d: &crate::auth::policies::Decision,
14292) -> (String, Option<String>, Option<String>) {
14293    use crate::auth::policies::Decision;
14294    match d {
14295        Decision::Allow {
14296            matched_policy_id,
14297            matched_sid,
14298        } => (
14299            "allow".into(),
14300            Some(matched_policy_id.clone()),
14301            matched_sid.clone(),
14302        ),
14303        Decision::Deny {
14304            matched_policy_id,
14305            matched_sid,
14306        } => (
14307            "deny".into(),
14308            Some(matched_policy_id.clone()),
14309            matched_sid.clone(),
14310        ),
14311        Decision::DefaultDeny => ("default_deny".into(), None, None),
14312        Decision::AdminBypass => ("admin_bypass".into(), None, None),
14313    }
14314}
14315
14316fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
14317    let mut scopes = Vec::new();
14318    collect_relation_scopes(query, &mut scopes);
14319    scopes.sort();
14320    scopes.dedup();
14321    scopes
14322}
14323
14324fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
14325    match query {
14326        QueryExpr::Table(table) => {
14327            if !table.table.is_empty() {
14328                scopes.push(table.table.clone());
14329            }
14330            if let Some(alias) = &table.alias {
14331                scopes.push(alias.clone());
14332            }
14333        }
14334        QueryExpr::Join(join) => {
14335            collect_relation_scopes(&join.left, scopes);
14336            collect_relation_scopes(&join.right, scopes);
14337        }
14338        _ => {}
14339    }
14340}
14341
14342fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
14343    let inner_scopes = relation_scopes_for_query(query);
14344    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
14345}
14346
14347fn query_expr_references_outer_scope(
14348    query: &QueryExpr,
14349    outer_scopes: &[String],
14350    inner_scopes: &[String],
14351) -> bool {
14352    match query {
14353        QueryExpr::Table(table) => {
14354            table.select_items.iter().any(|item| match item {
14355                crate::storage::query::ast::SelectItem::Wildcard => false,
14356                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
14357                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
14358                }
14359            }) || table
14360                .where_expr
14361                .as_ref()
14362                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
14363                || table.filter.as_ref().is_some_and(|filter| {
14364                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
14365                })
14366                || table.having_expr.as_ref().is_some_and(|expr| {
14367                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
14368                })
14369                || table.having.as_ref().is_some_and(|filter| {
14370                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
14371                })
14372                || table
14373                    .group_by_exprs
14374                    .iter()
14375                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
14376                || table.order_by.iter().any(|clause| {
14377                    clause.expr.as_ref().is_some_and(|expr| {
14378                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
14379                    })
14380                })
14381        }
14382        QueryExpr::Join(join) => {
14383            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
14384                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
14385                || join.filter.as_ref().is_some_and(|filter| {
14386                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
14387                })
14388                || join.return_items.iter().any(|item| match item {
14389                    crate::storage::query::ast::SelectItem::Wildcard => false,
14390                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
14391                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
14392                    }
14393                })
14394        }
14395        _ => false,
14396    }
14397}
14398
14399fn filter_references_outer_scope(
14400    filter: &crate::storage::query::ast::Filter,
14401    outer_scopes: &[String],
14402    inner_scopes: &[String],
14403) -> bool {
14404    use crate::storage::query::ast::Filter;
14405    match filter {
14406        Filter::Compare { field, .. }
14407        | Filter::IsNull(field)
14408        | Filter::IsNotNull(field)
14409        | Filter::In { field, .. }
14410        | Filter::Between { field, .. }
14411        | Filter::Like { field, .. }
14412        | Filter::StartsWith { field, .. }
14413        | Filter::EndsWith { field, .. }
14414        | Filter::Contains { field, .. } => {
14415            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
14416        }
14417        Filter::CompareFields { left, right, .. } => {
14418            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
14419                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
14420        }
14421        Filter::CompareExpr { lhs, rhs, .. } => {
14422            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
14423                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
14424        }
14425        Filter::And(left, right) | Filter::Or(left, right) => {
14426            filter_references_outer_scope(left, outer_scopes, inner_scopes)
14427                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
14428        }
14429        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
14430    }
14431}
14432
14433fn expr_references_outer_scope(
14434    expr: &crate::storage::query::ast::Expr,
14435    outer_scopes: &[String],
14436    inner_scopes: &[String],
14437) -> bool {
14438    use crate::storage::query::ast::Expr;
14439    match expr {
14440        Expr::Column { field, .. } => {
14441            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
14442        }
14443        Expr::BinaryOp { lhs, rhs, .. } => {
14444            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
14445                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
14446        }
14447        Expr::UnaryOp { operand, .. }
14448        | Expr::Cast { inner: operand, .. }
14449        | Expr::IsNull { operand, .. } => {
14450            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
14451        }
14452        Expr::FunctionCall { args, .. } => args
14453            .iter()
14454            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
14455        Expr::Case {
14456            branches, else_, ..
14457        } => {
14458            branches.iter().any(|(cond, value)| {
14459                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
14460                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
14461            }) || else_
14462                .as_ref()
14463                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
14464        }
14465        Expr::InList { target, values, .. } => {
14466            expr_references_outer_scope(target, outer_scopes, inner_scopes)
14467                || values
14468                    .iter()
14469                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
14470        }
14471        Expr::Between {
14472            target, low, high, ..
14473        } => {
14474            expr_references_outer_scope(target, outer_scopes, inner_scopes)
14475                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
14476                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
14477        }
14478        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
14479        Expr::Literal { .. } | Expr::Parameter { .. } => false,
14480        Expr::WindowFunctionCall { args, window, .. } => {
14481            args.iter()
14482                .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes))
14483                || window
14484                    .partition_by
14485                    .iter()
14486                    .any(|e| expr_references_outer_scope(e, outer_scopes, inner_scopes))
14487                || window
14488                    .order_by
14489                    .iter()
14490                    .any(|o| expr_references_outer_scope(&o.expr, outer_scopes, inner_scopes))
14491        }
14492    }
14493}
14494
14495fn field_ref_references_outer_scope(
14496    field: &crate::storage::query::ast::FieldRef,
14497    outer_scopes: &[String],
14498    inner_scopes: &[String],
14499) -> bool {
14500    match field {
14501        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
14502            outer_scopes.iter().any(|scope| scope == table)
14503                && !inner_scopes.iter().any(|scope| scope == table)
14504        }
14505        _ => false,
14506    }
14507}
14508
14509fn first_column_values(
14510    result: crate::storage::query::unified::UnifiedResult,
14511) -> RedDBResult<Vec<Value>> {
14512    if result.columns.len() > 1 {
14513        return Err(RedDBError::Query(
14514            "expression subquery must return exactly one column".to_string(),
14515        ));
14516    }
14517    let fallback_column = result
14518        .records
14519        .first()
14520        .and_then(|record| record.column_names().into_iter().next())
14521        .map(|name| name.to_string());
14522    let column = result.columns.first().cloned().or(fallback_column);
14523    let Some(column) = column else {
14524        return Ok(Vec::new());
14525    };
14526    Ok(result
14527        .records
14528        .iter()
14529        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
14530        .collect())
14531}
14532
/// Parses a timestamp literal into milliseconds since the Unix epoch.
///
/// Accepted forms:
/// * a bare non-negative integer, returned verbatim as milliseconds;
/// * a `YYYY-MM-DD` date, optionally followed by whitespace and a time
///   component (`2030-01-02 03:04:05`). Only the date portion is used,
///   so the result is midnight UTC of that day. Full RFC 3339 parsing is
///   not implemented.
///
/// Returns `None` for anything else. That includes dates whose month or
/// day does not exist in the calendar (`2030-13-01`, `2030-02-30`),
/// dates before 1970-01-01 (the unsigned result cannot represent them),
/// and years above 999 999 (rejected so the day arithmetic cannot
/// overflow).
fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
    const MS_PER_DAY: u128 = 86_400_000;
    const MIN_YEAR: i64 = 1970;
    const MAX_YEAR: i64 = 999_999;

    // Bare integer ms.
    if let Ok(ms) = s.parse::<u128>() {
        return Some(ms);
    }

    // Date portion only: everything before the first whitespace.
    let date = s.split_whitespace().next()?;
    let mut parts = date.split('-');
    let (year, month, day) = match (parts.next(), parts.next(), parts.next(), parts.next()) {
        (Some(y), Some(m), Some(d), None) => (y, m, d),
        _ => return None,
    };
    let year = year.parse::<i64>().ok()?;
    let month = month.parse::<u32>().ok()?;
    let day = day.parse::<u32>().ok()?;

    if year < MIN_YEAR || year > MAX_YEAR {
        return None;
    }
    let is_leap = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    let days_in_month = match month {
        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
        4 | 6 | 9 | 11 => 30,
        2 if is_leap => 29,
        2 => 28,
        _ => return None,
    };
    if day == 0 || day > days_in_month {
        return None;
    }

    let days = days_from_civil(year, month, day);
    // Unreachable after the year check above, but never let a negative
    // day count wrap into a huge unsigned value.
    if days < 0 {
        return None;
    }
    Some(u128::from(days.unsigned_abs()) * MS_PER_DAY)
}

/// Days since the Unix epoch (1970-01-01) for a proleptic Gregorian
/// civil date, using H. Hinnant's `days_from_civil` algorithm.
///
/// The result is negative for dates before the epoch. `m` is expected in
/// `1..=12` and `d` in `1..=31`; values outside those ranges do not
/// panic but yield a meaningless day count. Used by
/// `parse_timestamp_to_ms`, which validates its input first.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    // The algorithm counts years from March so the leap day falls at the
    // end of the shifted year.
    let y = if m <= 2 { y - 1 } else { y };
    let era = y.div_euclid(400);
    let yoe = y.rem_euclid(400); // [0, 399]
    let shifted_month = i64::from(if m > 2 { m - 3 } else { m + 9 }); // Mar = 0 .. Feb = 11
    // Signed arithmetic throughout: `d == 0` must not underflow.
    let doy = (153 * shifted_month + 2) / 5 + i64::from(d) - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    era * 146_097 + doe - 719_468
}
14567
14568fn walk_plan_node(
14569    node: &crate::storage::query::planner::CanonicalLogicalNode,
14570    depth: usize,
14571    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
14572) {
14573    use std::sync::Arc;
14574    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
14575    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
14576    rec.set_arc(
14577        Arc::from("source"),
14578        node.source.clone().map(Value::text).unwrap_or(Value::Null),
14579    );
14580    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
14581    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
14582    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
14583    out.push(rec);
14584    for child in &node.children {
14585        walk_plan_node(child, depth + 1, out);
14586    }
14587}
14588
#[cfg(test)]
mod inline_graph_tvf_tests {
    use super::*;

    /// Parses `sql` and returns the result-cache scopes derived from it.
    fn scopes_for(sql: &str) -> HashSet<String> {
        let parsed = crate::storage::query::parser::parse(sql).expect("parse");
        query_expr_result_cache_scopes(&parsed.query)
    }

    #[test]
    fn inline_tvf_cache_scopes_include_source_collections() {
        // Issue #799: the inline form's result-cache key must derive from
        // the `nodes`/`edges` source collections, so that a write to
        // either one invalidates the cached result.
        let sql = "SELECT * FROM components(nodes => (SELECT id FROM hosts), edges => (SELECT src, dst FROM links))";
        let scopes = scopes_for(sql);
        assert!(scopes.contains("hosts"), "nodes source scoped: {scopes:?}");
        assert!(scopes.contains("links"), "edges source scoped: {scopes:?}");
    }

    #[test]
    fn graph_collection_tvf_has_no_cache_scope() {
        // Issue #795 behaviour preserved: the graph-collection form reads
        // the whole graph store and is not scoped to a named collection.
        let scopes = scopes_for("SELECT * FROM components(g)");
        assert!(scopes.is_empty(), "collection form unscoped: {scopes:?}");
    }

    #[test]
    fn abstract_degree_centrality_counts_undirected_endpoints() {
        // Path graph a - b - c: the middle node touches both edges.
        let nodes: Vec<String> = ["a", "b", "c"].iter().map(|id| id.to_string()).collect();
        let edges = vec![
            ("a".to_string(), "b".to_string(), 1.0_f32),
            ("b".to_string(), "c".to_string(), 1.0_f32),
        ];
        let expected = vec![
            ("a".to_string(), 1),
            ("b".to_string(), 2),
            ("c".to_string(), 1),
        ];
        let degrees = abstract_degree_centrality(&nodes, &edges);
        assert_eq!(degrees, expected);
    }
}