Skip to main content

reddb_server/runtime/
impl_core.rs

1use super::*;
2use crate::application::entity::metadata_to_json;
3use crate::auth::column_policy_gate::ColumnAccessRequest;
4use crate::auth::UserId;
5use crate::replication::cdc::ChangeRecord;
6use crate::replication::logical::{ApplyMode, LogicalChangeApplier};
7use crate::storage::query::ast::TableSource;
8
9thread_local! {
10    /// Current connection id for the executing statement. Set by the
11    /// per-connection wrapper (stdio/gRPC handlers) before dispatching
12    /// into `execute_query`; falls back to `0` for embedded callers.
13    static CURRENT_CONN_ID: std::cell::Cell<u64> = const { std::cell::Cell::new(0) };
14
15    /// Authenticated user + role for the executing statement (Phase 2.5.2
16    /// RLS enforcement). Set by the transport middleware after validating
17    /// credentials (password / cert / oauth); unset means "anonymous" /
18    /// "embedded" — RLS policies degrade to the role-agnostic subset.
19    ///
20    /// `None` skips RLS injection entirely; `Some((username, role))`
21    /// passes `role` to `matching_rls_policies(table, Some(role), action)`.
22    static CURRENT_AUTH_IDENTITY: std::cell::RefCell<Option<(String, crate::auth::Role)>> =
23        const { std::cell::RefCell::new(None) };
24
25    /// MVCC snapshot scoped to the currently-executing statement (Phase
26    /// 2.3.2d PG parity). `execute_query` captures it on entry and drops
27    /// it on exit; every scan consults it via
28    /// `entity_visible_under_current_snapshot` to hide tuples whose xmin
29    /// hasn't committed or whose xmax already has.
30    ///
31    /// `None` means "pre-MVCC semantics" — the read path returns every
32    /// tuple regardless of xmin/xmax. All embedded callers that bypass
33    /// `execute_query` see this default.
34    static CURRENT_SNAPSHOT: std::cell::RefCell<Option<SnapshotContext>> =
35        const { std::cell::RefCell::new(None) };
36
37    /// Cheap presence flag for `CURRENT_SNAPSHOT`. Scan hot paths
38    /// poll this instead of `borrow()`-ing the RefCell on every
39    /// row — the common case (autocommit / no MVCC session) reads
40    /// one plain (non-atomic) `Cell<bool>` and short-circuits, saving ~10ns × N
41    /// rows on aggregate_group / select_range scans.
42    static HAS_SNAPSHOT: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };
43
44    /// Session-scoped tenant id for the current connection (Phase 2.5.3
45    /// multi-tenancy). Populated by `SET TENANT 'id'` or by transport
46    /// middleware after resolving tenant from auth claims. Read by the
47    /// `CURRENT_TENANT()` scalar function — RLS policies typically
48    /// combine it as `USING (tenant_id = CURRENT_TENANT())` to scope
49    /// every query to one tenant.
50    ///
51    /// `None` means "no tenant bound" — `CURRENT_TENANT()` returns
52    /// NULL, and RLS policies that gate on it hide every row.
53    static CURRENT_TENANT_ID: std::cell::RefCell<Option<String>> =
54        const { std::cell::RefCell::new(None) };
55
56    /// Statement-local config resolver. SQL expressions materialize the
57    /// `red_config` snapshot lazily on the first `$config.*`/`CONFIG()`
58    /// access, keeping ordinary statements on the zero-scan path.
59    static CURRENT_CONFIG_RESOLVER: std::cell::RefCell<Option<ConfigResolver>> =
60        const { std::cell::RefCell::new(None) };
61
62    /// Statement-local secret resolver. SQL expressions materialize the
63    /// vault KV snapshot lazily on first `$secret.*` access, then use
64    /// lock-free map reads for the rest of the statement.
65    static CURRENT_SECRET_RESOLVER: std::cell::RefCell<Option<SecretResolver>> =
66        const { std::cell::RefCell::new(None) };
67}
68
69/// Read a numeric score column out of a result record as `f64`, matching
70/// the column name case-insensitively. Used by the leaderboard-rank head
71/// walk (#918) to compare scores; non-numeric / missing columns yield
72/// `None` so a row with no comparable score never shifts a rank.
73fn record_column_f64(
74    rec: &crate::storage::query::unified::UnifiedRecord,
75    column: &str,
76) -> Option<f64> {
77    let value = rec
78        .get(column)
79        .or_else(|| rec.get(&column.to_lowercase()))?;
80    match value {
81        Value::Integer(n) => Some(*n as f64),
82        Value::UnsignedInteger(n) => Some(*n as f64),
83        Value::Float(n) => Some(*n),
84        Value::Timestamp(n) | Value::Duration(n) => Some(*n as f64),
85        _ => None,
86    }
87}
88
89fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
90    match value {
91        Value::Text(s) => Ok(s.to_string()),
92        Value::Integer(n) => Ok(n.to_string()),
93        Value::UnsignedInteger(n) => Ok(n.to_string()),
94        Value::Float(n) => Ok(n.to_string()),
95        Value::Boolean(b) => Ok(b.to_string()),
96        Value::Null => Err(RedDBError::Query(
97            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
98                .to_string(),
99        )),
100        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
101            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
102                .to_string(),
103        )),
104        _ => Err(RedDBError::Query(format!(
105            "SET SECRET does not support value type {:?} yet",
106            value.data_type()
107        ))),
108    }
109}
110
/// Description of one control-plane event derived from a statement.
///
/// Produced by `query_control_event_specs`, which maps DDL, RLS, tenancy,
/// config and user statements to zero or more of these.
111#[derive(Clone)]
112struct QueryControlEventSpec {
    /// Event category (e.g. `SchemaDdl`, `RlsGovernance`, `TenantGovernance`).
113    kind: crate::runtime::control_events::EventKind,
    /// Static action label, such as `"create_table"` or `"config:write"`.
114    action: &'static str,
    /// Affected resource as a `kind:name` string (e.g. `table:users`), if any.
115    resource: Option<String>,
    /// Extra named fields; each value is wrapped in a `Sensitivity`.
116    fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
117}
118
/// Audit summary of a data statement, as built by `query_audit_plan`.
119#[derive(Clone)]
120struct QueryAuditPlan {
    /// One of `"select"`, `"insert"`, `"update"` or `"delete"`.
121    statement_kind: &'static str,
    /// Collections the statement touches, de-duplicated, with system
    /// names (`red`, `red.*`, `__red_schema_*`) filtered out.
122    collections: Vec<String>,
123}
124
125fn query_audit_plan(expr: &QueryExpr) -> Option<QueryAuditPlan> {
126    let mut collections = Vec::new();
127    let statement_kind = match expr {
128        QueryExpr::Table(table) => {
129            push_query_audit_collection(&mut collections, &table.table);
130            "select"
131        }
132        QueryExpr::Join(join) => {
133            collect_query_audit_collections(&join.left, &mut collections);
134            collect_query_audit_collections(&join.right, &mut collections);
135            "select"
136        }
137        QueryExpr::Insert(insert) => {
138            push_query_audit_collection(&mut collections, &insert.table);
139            "insert"
140        }
141        QueryExpr::Update(update) => {
142            push_query_audit_collection(&mut collections, &update.table);
143            "update"
144        }
145        QueryExpr::Delete(delete) => {
146            push_query_audit_collection(&mut collections, &delete.table);
147            "delete"
148        }
149        _ => return None,
150    };
151    if collections.is_empty() {
152        None
153    } else {
154        Some(QueryAuditPlan {
155            statement_kind,
156            collections,
157        })
158    }
159}
160
161fn collect_query_audit_collections(expr: &QueryExpr, collections: &mut Vec<String>) {
162    match expr {
163        QueryExpr::Table(table) => push_query_audit_collection(collections, &table.table),
164        QueryExpr::Join(join) => {
165            collect_query_audit_collections(&join.left, collections);
166            collect_query_audit_collections(&join.right, collections);
167        }
168        _ => {}
169    }
170}
171
/// Record `name` as an audited collection unless it is a system
/// collection (`red`, `red.*`, `__red_schema_*`) or is already listed.
fn push_query_audit_collection(collections: &mut Vec<String>, name: &str) {
    let is_system =
        name == "red" || name.starts_with("red.") || name.starts_with("__red_schema_");
    let already_listed = collections.iter().any(|seen| seen == name);
    if !is_system && !already_listed {
        collections.push(name.to_owned());
    }
}
180
181impl RedDBRuntime {
182    fn execute_create_metric(
183        &self,
184        raw_query: &str,
185        query: &crate::storage::query::ast::CreateMetricQuery,
186    ) -> RedDBResult<RuntimeQueryResult> {
187        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
188        let store = self.inner.db.store();
189        super::metric_descriptor_catalog::create(
190            store.as_ref(),
191            &query.path,
192            &query.kind,
193            &query.role,
194            super::metric_descriptor_catalog::DerivedSpec {
195                source: query.source.clone(),
196                query: query.query.clone(),
197                window_ms: query.window_ms,
198                time_field: query.time_field.clone(),
199            },
200        )?;
201        self.invalidate_result_cache();
202        Ok(RuntimeQueryResult::ok_message(
203            raw_query.to_string(),
204            &format!("metric descriptor '{}' created", query.path),
205            "create",
206        ))
207    }
208
209    /// `CREATE RANKING <name> ON <table> (<column> [ASC|DESC]) [TOP <k>]`
210    /// — declare a Ranking capability over an ordinary table's score
211    /// column (issue #918 / ADR 0035). Persists a WAL-backed catalog
212    /// record; no new Collection model is introduced. Authorized through
213    /// the same DDL write gate as `CREATE METRIC`/`CREATE INDEX`.
214    fn execute_create_ranking(
215        &self,
216        raw_query: &str,
217        req: super::ranking_descriptor_catalog::CreateRankingRequest,
218    ) -> RedDBResult<RuntimeQueryResult> {
219        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
220        let store = self.inner.db.store();
221        let descriptor = super::ranking_descriptor_catalog::create(store.as_ref(), &req)?;
222        self.invalidate_result_cache();
223        Ok(RuntimeQueryResult::ok_message(
224            raw_query.to_string(),
225            &format!(
226                "ranking '{}' created on {}({})",
227                descriptor.name, descriptor.table, descriptor.column
228            ),
229            "create",
230        ))
231    }
232
233    /// `SHOW RANKINGS` — project the declared Ranking capabilities back as
234    /// rows, so a declared capability is observable (the Analytics
235    /// "prefer SELECT over admin verbs" rule).
236    fn execute_show_rankings(&self, raw_query: &str) -> RedDBResult<RuntimeQueryResult> {
237        let store = self.inner.db.store();
238        let entries = super::ranking_descriptor_catalog::list(store.as_ref());
239        let columns = vec![
240            "name".to_string(),
241            "table".to_string(),
242            "column".to_string(),
243            "direction".to_string(),
244            "top_k".to_string(),
245        ];
246        let rows = entries
247            .into_iter()
248            .map(|e| {
249                vec![
250                    ("name".to_string(), Value::text(e.name)),
251                    ("table".to_string(), Value::text(e.table)),
252                    ("column".to_string(), Value::text(e.column)),
253                    (
254                        "direction".to_string(),
255                        Value::text(if e.descending { "DESC" } else { "ASC" }.to_string()),
256                    ),
257                    ("top_k".to_string(), Value::UnsignedInteger(e.top_k)),
258                ]
259            })
260            .collect();
261        Ok(RuntimeQueryResult::ok_records(
262            raw_query.to_string(),
263            columns,
264            rows,
265            "select",
266        ))
267    }
268
269    /// `RANK OF <id> IN <name>` — exact, MVCC-correct rank of a specific
270    /// row within the capability's bounded top-K head (issue #918).
271    ///
272    /// Returns a single `rank` row when the row is visible *and* falls
273    /// inside the exact head; an empty result otherwise (not visible, or
274    /// in the approximate tail — a separate slice). The computation runs
275    /// entirely over the regular read pipeline so it inherits MVCC
276    /// visibility, RLS/policy, and tenant scope from ordinary reads.
277    fn execute_rank_of(
278        &self,
279        raw_query: &str,
280        req: super::ranking_descriptor_catalog::RankOfRequest,
281    ) -> RedDBResult<RuntimeQueryResult> {
282        let store = self.inner.db.store();
283        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
284            .ok_or_else(|| {
285                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
286            })?;
287        let rank = self.compute_exact_head_rank(&descriptor, req.entity_id)?;
288        let columns = vec!["rank".to_string()];
289        let rows = match rank {
290            Some(rank) => vec![vec![("rank".to_string(), Value::UnsignedInteger(rank))]],
291            None => Vec::new(),
292        };
293        Ok(RuntimeQueryResult::ok_records(
294            raw_query.to_string(),
295            columns,
296            rows,
297            "select",
298        ))
299    }
300
301    /// Compute the exact rank of `target_id` within the descriptor's
302    /// bounded top-K head, or `None` if the row is invisible to the
303    /// querying snapshot or beyond the exact head.
304    ///
305    /// Faithful to ADR 0035: it walks the sorted index head
306    /// (`ORDER BY <col> {DESC|ASC} LIMIT k`, served by
307    /// `try_sorted_index_lookup` + the per-row MVCC visibility re-check)
308    /// and counts only rows visible to the current snapshot. Running the
309    /// head scan through `execute_query_inner` keeps it on the same
310    /// snapshot/tenant/policy frame as ordinary reads, so the rank agrees
311    /// with `ORDER BY <col> {DESC|ASC} LIMIT` under that snapshot by
312    /// construction. RANK semantics: tied scores share a rank, so the
313    /// rank is `1 + (number of strictly-better visible rows)`.
314    fn compute_exact_head_rank(
315        &self,
316        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
317        target_id: u64,
318    ) -> RedDBResult<Option<u64>> {
319        let table = &descriptor.table;
320        let column = &descriptor.column;
321
322        // The exact head: top-K rows in rank order. Each row here already
323        // passed MVCC visibility *and* RLS/tenant filtering during the
324        // scan, so identifying the target *within* this result (rather
325        // than via a separate `red_entity_id` lookup, which takes the
326        // direct entity-fetch path that bypasses the RLS gate) is what
327        // makes the rank honor policy/tenant scope (criterion 5).
328        let dir = if descriptor.descending { "DESC" } else { "ASC" };
329        let head_sql = format!(
330            "SELECT * FROM {table} ORDER BY {column} {dir} LIMIT {}",
331            descriptor.top_k
332        );
333        let head_result = self.execute_query_inner(&head_sql)?;
334        let head = &head_result.result.records;
335
336        // Locate the target inside the head. Not present ⇒ either invisible
337        // to this snapshot/tenant, or beyond the exact head — both correctly
338        // yield "no exact rank" (the approximate tail is a separate slice).
339        let target_score = head.iter().find_map(|rec| {
340            let rid = match rec.get("rid") {
341                Some(Value::UnsignedInteger(n)) => *n,
342                Some(Value::Integer(n)) if *n >= 0 => *n as u64,
343                _ => return None,
344            };
345            (rid == target_id).then(|| record_column_f64(rec, column))?
346        });
347        let Some(target_score) = target_score else {
348            return Ok(None);
349        };
350
351        // RANK semantics: tied scores share a rank, so the rank is
352        // 1 + (number of strictly-better visible rows in the head).
353        let mut strictly_better = 0u64;
354        for rec in head {
355            let Some(score) = record_column_f64(rec, column) else {
356                continue;
357            };
358            let better = if descriptor.descending {
359                score > target_score
360            } else {
361                score < target_score
362            };
363            if better {
364                strictly_better += 1;
365            }
366        }
367        Ok(Some(strictly_better + 1))
368    }
369
370    fn execute_alter_metric(
371        &self,
372        raw_query: &str,
373        query: &crate::storage::query::ast::AlterMetricQuery,
374    ) -> RedDBResult<RuntimeQueryResult> {
375        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
376        let store = self.inner.db.store();
377        super::metric_descriptor_catalog::update(
378            store.as_ref(),
379            &query.path,
380            query.set_role.as_deref(),
381            query.attempted_kind.as_deref(),
382            query.attempted_path.as_deref(),
383        )?;
384        self.invalidate_result_cache();
385        Ok(RuntimeQueryResult::ok_message(
386            raw_query.to_string(),
387            &format!("metric descriptor '{}' updated", query.path),
388            "alter",
389        ))
390    }
391
392    fn execute_create_slo(
393        &self,
394        raw_query: &str,
395        query: &crate::storage::query::ast::CreateSloQuery,
396    ) -> RedDBResult<RuntimeQueryResult> {
397        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
398        let store = self.inner.db.store();
399        super::slo_descriptor_catalog::create(
400            store.as_ref(),
401            &query.path,
402            &query.metric_path,
403            query.target,
404            query.window_ms,
405        )?;
406        self.invalidate_result_cache();
407        Ok(RuntimeQueryResult::ok_message(
408            raw_query.to_string(),
409            &format!("SLO descriptor '{}' created", query.path),
410            "create",
411        ))
412    }
413
414    fn execute_create_analytics_source(
415        &self,
416        raw_query: &str,
417        query: super::analytics_source_catalog::CreateAnalyticsSourceProfile,
418    ) -> RedDBResult<RuntimeQueryResult> {
419        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
420        let store = self.inner.db.store();
421        let profile = super::analytics_source_catalog::create(
422            store.as_ref(),
423            &self.inner.db.collection_contracts(),
424            query,
425        )?;
426        self.invalidate_result_cache();
427        Ok(RuntimeQueryResult::ok_message(
428            raw_query.to_string(),
429            &format!("analytics source '{}' created", profile.name),
430            "create",
431        ))
432    }
433}
434
435fn query_control_event_specs(expr: &QueryExpr) -> Vec<QueryControlEventSpec> {
436    use crate::runtime::control_events::{EventKind, Sensitivity};
437
438    let mut specs = Vec::new();
439    let mut schema = |action: &'static str, resource: Option<String>| {
440        specs.push(QueryControlEventSpec {
441            kind: EventKind::SchemaDdl,
442            action,
443            resource,
444            fields: Vec::new(),
445        });
446    };
447    match expr {
448        QueryExpr::CreateTable(q) => {
449            schema("create_table", Some(format!("table:{}", q.name)));
450            if let Some(column) = &q.tenant_by {
451                specs.push(QueryControlEventSpec {
452                    kind: EventKind::TenantGovernance,
453                    action: "create_table_tenant_by",
454                    resource: Some(format!("table:{}", q.name)),
455                    fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
456                });
457            }
458        }
459        QueryExpr::CreateCollection(q) => {
460            schema("create_collection", Some(format!("collection:{}", q.name)));
461        }
462        QueryExpr::CreateVector(q) => schema("create_vector", Some(format!("vector:{}", q.name))),
463        QueryExpr::DropTable(q) => schema("drop_table", Some(format!("table:{}", q.name))),
464        QueryExpr::DropGraph(q) => schema("drop_graph", Some(format!("graph:{}", q.name))),
465        QueryExpr::DropVector(q) => schema("drop_vector", Some(format!("vector:{}", q.name))),
466        QueryExpr::DropDocument(q) => {
467            schema("drop_document", Some(format!("document:{}", q.name)));
468        }
469        QueryExpr::DropKv(q) => schema("drop_kv", Some(format!("kv:{}", q.name))),
470        QueryExpr::DropCollection(q) => {
471            schema("drop_collection", Some(format!("collection:{}", q.name)));
472        }
473        QueryExpr::Truncate(q) => schema("truncate", Some(format!("collection:{}", q.name))),
474        QueryExpr::AlterTable(q) => {
475            schema("alter_table", Some(format!("table:{}", q.name)));
476            for op in &q.operations {
477                match op {
478                    crate::storage::query::ast::AlterOperation::EnableRowLevelSecurity => {
479                        specs.push(QueryControlEventSpec {
480                            kind: EventKind::RlsGovernance,
481                            action: "enable_rls",
482                            resource: Some(format!("table:{}", q.name)),
483                            fields: Vec::new(),
484                        });
485                    }
486                    crate::storage::query::ast::AlterOperation::DisableRowLevelSecurity => {
487                        specs.push(QueryControlEventSpec {
488                            kind: EventKind::RlsGovernance,
489                            action: "disable_rls",
490                            resource: Some(format!("table:{}", q.name)),
491                            fields: Vec::new(),
492                        });
493                    }
494                    crate::storage::query::ast::AlterOperation::EnableTenancy { column } => {
495                        specs.push(QueryControlEventSpec {
496                            kind: EventKind::TenantGovernance,
497                            action: "enable_tenancy",
498                            resource: Some(format!("table:{}", q.name)),
499                            fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
500                        });
501                    }
502                    crate::storage::query::ast::AlterOperation::DisableTenancy => {
503                        specs.push(QueryControlEventSpec {
504                            kind: EventKind::TenantGovernance,
505                            action: "disable_tenancy",
506                            resource: Some(format!("table:{}", q.name)),
507                            fields: Vec::new(),
508                        });
509                    }
510                    _ => {}
511                }
512            }
513        }
514        QueryExpr::CreateIndex(q) => {
515            schema(
516                "create_index",
517                Some(format!("index:{}:{}", q.table, q.name)),
518            );
519        }
520        QueryExpr::DropIndex(q) => {
521            schema("drop_index", Some(format!("index:{}:{}", q.table, q.name)));
522        }
523        QueryExpr::CreateTimeSeries(q) => {
524            schema("create_timeseries", Some(format!("timeseries:{}", q.name)));
525        }
526        QueryExpr::CreateMetric(q) => {
527            schema("create_metric", Some(format!("metric:{}", q.path)));
528        }
529        QueryExpr::AlterMetric(q) => {
530            schema("alter_metric", Some(format!("metric:{}", q.path)));
531        }
532        QueryExpr::CreateSlo(q) => {
533            schema("create_slo", Some(format!("slo:{}", q.path)));
534        }
535        QueryExpr::DropTimeSeries(q) => {
536            schema("drop_timeseries", Some(format!("timeseries:{}", q.name)));
537        }
538        QueryExpr::CreateQueue(q) => schema("create_queue", Some(format!("queue:{}", q.name))),
539        QueryExpr::AlterQueue(q) => schema("alter_queue", Some(format!("queue:{}", q.name))),
540        QueryExpr::DropQueue(q) => schema("drop_queue", Some(format!("queue:{}", q.name))),
541        QueryExpr::CreateTree(q) => {
542            schema(
543                "create_tree",
544                Some(format!("tree:{}:{}", q.collection, q.name)),
545            );
546        }
547        QueryExpr::DropTree(q) => {
548            schema(
549                "drop_tree",
550                Some(format!("tree:{}:{}", q.collection, q.name)),
551            );
552        }
553        QueryExpr::CreateSchema(q) => schema("create_schema", Some(format!("schema:{}", q.name))),
554        QueryExpr::DropSchema(q) => schema("drop_schema", Some(format!("schema:{}", q.name))),
555        QueryExpr::CreateSequence(q) => {
556            schema("create_sequence", Some(format!("sequence:{}", q.name)));
557        }
558        QueryExpr::DropSequence(q) => schema("drop_sequence", Some(format!("sequence:{}", q.name))),
559        QueryExpr::CreateView(q) => schema("create_view", Some(format!("view:{}", q.name))),
560        QueryExpr::DropView(q) => schema("drop_view", Some(format!("view:{}", q.name))),
561        QueryExpr::RefreshMaterializedView(q) => {
562            schema(
563                "refresh_materialized_view",
564                Some(format!("view:{}", q.name)),
565            );
566        }
567        QueryExpr::CreatePolicy(q) => {
568            specs.push(QueryControlEventSpec {
569                kind: EventKind::RlsGovernance,
570                action: "create_policy",
571                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
572                fields: vec![(
573                    "target_kind".to_string(),
574                    Sensitivity::raw(q.target_kind.as_ident()),
575                )],
576            });
577        }
578        QueryExpr::DropPolicy(q) => {
579            specs.push(QueryControlEventSpec {
580                kind: EventKind::RlsGovernance,
581                action: "drop_policy",
582                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
583                fields: Vec::new(),
584            });
585        }
586        QueryExpr::SetTenant(value) => {
587            let mut fields = Vec::new();
588            if let Some(value) = value {
589                fields.push(("tenant".to_string(), Sensitivity::raw(value)));
590            }
591            specs.push(QueryControlEventSpec {
592                kind: EventKind::TenantGovernance,
593                action: "set_tenant",
594                resource: Some("tenant:session".to_string()),
595                fields,
596            });
597        }
598        QueryExpr::SetConfig { key, .. } => {
599            specs.push(QueryControlEventSpec {
600                kind: EventKind::ConfigWrite,
601                action: "config:write",
602                resource: Some(format!("config:{key}")),
603                fields: vec![("key".to_string(), Sensitivity::raw(key))],
604            });
605        }
606        QueryExpr::ConfigCommand(cmd) => match cmd {
607            crate::storage::query::ast::ConfigCommand::Put {
608                collection, key, ..
609            }
610            | crate::storage::query::ast::ConfigCommand::Rotate {
611                collection, key, ..
612            } => {
613                let target = format!("{collection}/{key}");
614                specs.push(QueryControlEventSpec {
615                    kind: EventKind::ConfigWrite,
616                    action: "config:write",
617                    resource: Some(format!("config:{target}")),
618                    fields: vec![
619                        ("collection".to_string(), Sensitivity::raw(collection)),
620                        ("key".to_string(), Sensitivity::raw(key)),
621                    ],
622                });
623            }
624            crate::storage::query::ast::ConfigCommand::Delete { collection, key } => {
625                let target = format!("{collection}/{key}");
626                specs.push(QueryControlEventSpec {
627                    kind: EventKind::ConfigDelete,
628                    action: "config:write",
629                    resource: Some(format!("config:{target}")),
630                    fields: vec![
631                        ("collection".to_string(), Sensitivity::raw(collection)),
632                        ("key".to_string(), Sensitivity::raw(key)),
633                    ],
634                });
635            }
636            _ => {}
637        },
638        QueryExpr::AlterUser(stmt) => {
639            let disables = stmt.attributes.iter().any(|attr| {
640                matches!(
641                    attr,
642                    crate::storage::query::ast::AlterUserAttribute::Disable
643                )
644            });
645            specs.push(QueryControlEventSpec {
646                kind: if disables {
647                    EventKind::UserDisable
648                } else {
649                    EventKind::UserUpdate
650                },
651                action: "alter_user",
652                resource: Some(format!("user:{}", stmt.username)),
653                fields: Vec::new(),
654            });
655        }
656        _ => {}
657    }
658    specs
659}
660
661fn control_event_outcome_for_error(err: &RedDBError) -> crate::runtime::control_events::Outcome {
662    match err {
663        RedDBError::ReadOnly(_) => crate::runtime::control_events::Outcome::Denied,
664        RedDBError::Query(msg)
665            if msg.contains("permission denied")
666                || msg.contains("cannot issue")
667                || msg.contains("lacks") =>
668        {
669            crate::runtime::control_events::Outcome::Denied
670        }
671        _ => crate::runtime::control_events::Outcome::Error,
672    }
673}
674
675/// Convert the rows produced by a materialized-view body into
676/// `UnifiedEntity` table rows targeting the backing collection.
677/// Issue #595 slice 9c — feeds `UnifiedStore::refresh_collection`.
678///
679/// Graph fragments and vector hits are ignored: a materialized view
680/// is a relational result set (SELECT-shaped); slices 11+ may extend
681/// this once we have a richer view body shape. Each row materialises
682/// the union of its schema-bound columns + overflow.
683fn view_records_to_entities(
684    table: &str,
685    records: &[crate::storage::query::unified::UnifiedRecord],
686) -> Vec<crate::storage::UnifiedEntity> {
687    use std::collections::HashMap;
688    let table_arc: std::sync::Arc<str> = std::sync::Arc::from(table);
689    let mut out = Vec::with_capacity(records.len());
690    for record in records {
691        let mut named: HashMap<String, crate::storage::schema::Value> = HashMap::new();
692        for (name, value) in record.iter_fields() {
693            named.insert(name.to_string(), value.clone());
694        }
695        let entity = crate::storage::UnifiedEntity::new(
696            crate::storage::EntityId::new(0),
697            crate::storage::EntityKind::TableRow {
698                table: std::sync::Arc::clone(&table_arc),
699                row_id: 0,
700            },
701            crate::storage::EntityData::Row(crate::storage::RowData {
702                columns: Vec::new(),
703                named: Some(named),
704                schema: None,
705            }),
706        );
707        out.push(entity);
708    }
709    out
710}
711
712fn system_keyed_collection_contract(
713    name: &str,
714    model: crate::catalog::CollectionModel,
715) -> crate::physical::CollectionContract {
716    let now = crate::utils::now_unix_millis() as u128;
717    crate::physical::CollectionContract {
718        name: name.to_string(),
719        declared_model: model,
720        schema_mode: crate::catalog::SchemaMode::Dynamic,
721        origin: crate::physical::ContractOrigin::Implicit,
722        version: 1,
723        created_at_unix_ms: now,
724        updated_at_unix_ms: now,
725        default_ttl_ms: None,
726        vector_dimension: None,
727        vector_metric: None,
728        context_index_fields: Vec::new(),
729        declared_columns: Vec::new(),
730        table_def: None,
731        timestamps_enabled: false,
732        context_index_enabled: false,
733        metrics_raw_retention_ms: None,
734        metrics_rollup_policies: Vec::new(),
735        metrics_tenant_identity: None,
736        metrics_namespace: None,
737        append_only: false,
738        subscriptions: Vec::new(),
739        analytics_config: Vec::new(),
740        session_key: None,
741        session_gap_ms: None,
742        retention_duration_ms: None,
743        analytical_storage: None,
744    }
745}
746
747/// Snapshot + manager pair used for read-path visibility checks.
748///
749/// The manager is needed in addition to the snapshot because `aborted`
750/// state mutates after the snapshot is captured — a ROLLBACK by a
751/// committed-at-capture-time writer must still hide its tuples. Keeping
752/// the Arc around is O(pointer) and the RwLock reads on `is_aborted`
753/// are cheap (HashSet lookup under a parking_lot read guard).
754///
755/// `own_xids` (Phase 2.3.2e) lists the xids belonging to the current
756/// connection's transaction — the parent xid plus open and released
757/// savepoint sub-xids. The visibility rule promotes rows stamped with
758/// these xids to "always visible (unless aborted)" so the writer sees
759/// its own nested-savepoint writes even though their xids exceed
760/// `snapshot.xid`.
761#[derive(Clone)]
762pub struct SnapshotContext {
763    pub snapshot: crate::storage::transaction::snapshot::Snapshot,
764    pub manager: Arc<crate::storage::transaction::snapshot::SnapshotManager>,
765    pub own_xids: std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
766    pub requires_index_fallback: bool,
767}
768
769/// Install a connection id on the current thread for the duration of a
770/// statement. Transaction state (`RuntimeInner::tx_contexts`) is keyed
771/// by this id so different connections can hold independent BEGINs.
772///
773/// Pub so transports (PG wire, gRPC, HTTP per-request spawners) and
774/// tests can emulate per-connection isolation. Call it once when
775/// binding the connection's worker thread; pair with
776/// `clear_current_connection_id` on teardown.
777pub fn set_current_connection_id(id: u64) {
778    CURRENT_CONN_ID.with(|c| c.set(id));
779}
780
781/// Reset the thread's connection id back to `0` (autocommit).
782pub fn clear_current_connection_id() {
783    CURRENT_CONN_ID.with(|c| c.set(0));
784}
785
786/// Read the connection id set by `set_current_connection_id`. Returns
787/// `0` when no wrapper installed one — auto-commit path.
788pub fn current_connection_id() -> u64 {
789    CURRENT_CONN_ID.with(|c| c.get())
790}
791
792/// Install the authenticated identity for the current thread (Phase 2.5.2
793/// RLS enforcement). Transport layers call this right after resolving
794/// auth so the query dispatch can fold RLS policies into the filter.
795pub fn set_current_auth_identity(username: String, role: crate::auth::Role) {
796    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = Some((username, role)));
797}
798
799/// Clear the thread-local auth identity. Transports call this after the
800/// statement completes so pooled threads don't leak identities across
801/// requests.
802pub fn clear_current_auth_identity() {
803    CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = None);
804}
805
806/// Read the current-thread auth identity. `None` when no transport
807/// installed one (embedded mode / anonymous access).
808pub(crate) fn current_auth_identity() -> Option<(String, crate::auth::Role)> {
809    CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone())
810}
811
812/// Public probe of the thread-local auth identity for callers outside
813/// the `runtime` module (e.g. the AI credential resolver, which audits
814/// who triggered a secret read on behalf of a query).
815pub fn current_auth_identity_for_audit() -> Option<(String, crate::auth::Role)> {
816    current_auth_identity()
817}
818
819/// Install the session tenant id for the current thread (Phase 2.5.3
820/// multi-tenancy). Called by `SET TENANT 'id'` dispatch and by
821/// transport middleware that resolves tenant from auth claims (e.g.
822/// JWT `tenant` claim, HTTP header, subdomain).
823pub fn set_current_tenant(tenant_id: String) {
824    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = Some(tenant_id));
825}
826
827/// Clear the current-thread tenant — `CURRENT_TENANT()` will then
828/// return NULL and any RLS policy gated on it will hide every row.
829pub fn clear_current_tenant() {
830    CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = None);
831}
832
833/// Read the current-thread tenant id, applying overrides in priority order:
834///   1. `WITHIN TENANT '<id>' …` per-statement override (highest)
835///   2. `SET LOCAL TENANT '<id>'` transaction-local override (consulted
836///      only when the current connection has an open transaction)
837///   3. `SET TENANT '<id>'` session-level thread-local
838///   4. `None` (deny-default for RLS).
839///
840/// The transaction-local layer is read through the runtime; an embedded
841/// helper crate that has no `RedDBRuntime` access still gets correct
842/// behaviour for layers 1, 3, and 4.
843pub fn current_tenant() -> Option<String> {
844    let inherited = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
845    if let Some(over) = current_scope_override() {
846        if over.tenant.is_active() {
847            return over.tenant.resolve(inherited);
848        }
849    }
850    if let Some(tx_local) = current_tx_local_tenant() {
851        return tx_local;
852    }
853    inherited
854}
855
thread_local! {
    /// Per-call copy of the active connection's `tx_local_tenants`
    /// entry. The outer `Option` says whether a transaction-local
    /// tenant override applies to the current `execute_query` call; the
    /// inner one is the override itself (`Some(s)` forces tenant `s`,
    /// `None` forces NULL/cleared). It is refreshed at the top of every
    /// `execute_query` invocation and reset by `TxLocalTenantGuard`
    /// when the call returns, so pooled connections cannot leak the
    /// override past the statement that owns it.
    static TX_LOCAL_TENANT: std::cell::RefCell<Option<Option<String>>> =
        const { std::cell::RefCell::new(None) };
}
868
869fn current_tx_local_tenant() -> Option<Option<String>> {
870    TX_LOCAL_TENANT.with(|cell| cell.borrow().clone())
871}
872
873/// Recognise `SET LOCAL TENANT '<id>'` / `SET LOCAL TENANT NULL` —
874/// returns `Ok(Some(Some(id)))` for an explicit value, `Ok(Some(None))`
875/// for an explicit NULL clear, `Ok(None)` when the input is not a
876/// `SET LOCAL TENANT` statement at all, and `Err` when the prefix
877/// matches but the value is malformed.
878fn parse_set_local_tenant(query: &str) -> RedDBResult<Option<Option<String>>> {
879    let mut tokens = query.split_ascii_whitespace();
880    let Some(w1) = tokens.next() else {
881        return Ok(None);
882    };
883    if !w1.eq_ignore_ascii_case("SET") {
884        return Ok(None);
885    }
886    let Some(w2) = tokens.next() else {
887        return Ok(None);
888    };
889    if !w2.eq_ignore_ascii_case("LOCAL") {
890        return Ok(None);
891    }
892    let Some(w3) = tokens.next() else {
893        return Ok(None);
894    };
895    if !w3.eq_ignore_ascii_case("TENANT") {
896        return Ok(None);
897    }
898    let rest: String = tokens.collect::<Vec<_>>().join(" ");
899    let rest = rest.trim().trim_end_matches(';').trim();
900    let value_str = rest.strip_prefix('=').map(|s| s.trim()).unwrap_or(rest);
901    if value_str.is_empty() {
902        return Err(RedDBError::Query(
903            "SET LOCAL TENANT expects a string literal or NULL".to_string(),
904        ));
905    }
906    if value_str.eq_ignore_ascii_case("NULL") {
907        return Ok(Some(None));
908    }
909    if value_str.starts_with('\'') && value_str.ends_with('\'') && value_str.len() >= 2 {
910        let inner = &value_str[1..value_str.len() - 1];
911        return Ok(Some(Some(inner.to_string())));
912    }
913    Err(RedDBError::Query(format!(
914        "SET LOCAL TENANT expects a string literal or NULL, got `{value_str}`"
915    )))
916}
917
918pub(crate) struct TxLocalTenantGuard;
919
920impl TxLocalTenantGuard {
921    pub fn install(value: Option<Option<String>>) -> Self {
922        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = value);
923        Self
924    }
925}
926
927impl Drop for TxLocalTenantGuard {
928    fn drop(&mut self) {
929        TX_LOCAL_TENANT.with(|cell| *cell.borrow_mut() = None);
930    }
931}
932
933thread_local! {
934    /// Stack of `WITHIN ... <stmt>` overrides active on the current
935    /// thread. Every entry corresponds to one in-flight `execute_query`
936    /// call that started with a `WITHIN` prefix; the entry is pushed
937    /// before dispatch and popped before the call returns. The stack
938    /// shape supports nested invocations (e.g. a view body that itself
939    /// re-enters execute_query).
940    static SCOPE_OVERRIDES: std::cell::RefCell<Vec<crate::runtime::within_clause::ScopeOverride>> =
941        const { std::cell::RefCell::new(Vec::new()) };
942}
943
944pub(crate) fn push_scope_override(over: crate::runtime::within_clause::ScopeOverride) {
945    SCOPE_OVERRIDES.with(|cell| cell.borrow_mut().push(over));
946}
947
948pub(crate) fn pop_scope_override() {
949    SCOPE_OVERRIDES.with(|cell| {
950        cell.borrow_mut().pop();
951    });
952}
953
954pub(crate) fn current_scope_override() -> Option<crate::runtime::within_clause::ScopeOverride> {
955    SCOPE_OVERRIDES.with(|cell| cell.borrow().last().cloned())
956}
957
958/// Cheap probe: is any `WITHIN …` scope override active on this
959/// thread? The fast-path needs to know without paying for the full
960/// `.last().cloned()` allocation — just peek at stack length.
961pub(crate) fn has_scope_override_active() -> bool {
962    SCOPE_OVERRIDES.with(|cell| !cell.borrow().is_empty())
963}
964
965/// RAII guard pairing `push_scope_override` with the matching pop, so
966/// the stack stays balanced even when the inner `execute_query` returns
967/// early via `?`.
968pub(crate) struct ScopeOverrideGuard;
969
970impl ScopeOverrideGuard {
971    pub fn install(over: crate::runtime::within_clause::ScopeOverride) -> Self {
972        push_scope_override(over);
973        Self
974    }
975}
976
977impl Drop for ScopeOverrideGuard {
978    fn drop(&mut self) {
979        pop_scope_override();
980    }
981}
982
983/// Read the current-thread auth identity, honouring per-statement
984/// `WITHIN ... USER '<u>' AS ROLE '<r>'` overrides. The override only
985/// supplies projected strings — it never grants additional privilege —
986/// so callers that need to make authorisation decisions must read from
987/// the underlying `current_auth_identity()` directly.
988pub(crate) fn current_user_projected() -> Option<String> {
989    let inherited = current_auth_identity().map(|(u, _)| u);
990    if let Some(over) = current_scope_override() {
991        if over.user.is_active() {
992            return over.user.resolve(inherited);
993        }
994    }
995    inherited
996}
997
998pub(crate) fn current_role_projected() -> Option<String> {
999    let inherited = current_auth_identity().map(|(_, r)| format!("{r:?}").to_lowercase());
1000    if let Some(over) = current_scope_override() {
1001        if over.role.is_active() {
1002            return over.role.resolve(inherited);
1003        }
1004    }
1005    inherited
1006}
1007
1008pub(crate) fn current_secret_value(path: &str) -> Option<String> {
1009    let key = path.to_ascii_lowercase();
1010    CURRENT_SECRET_RESOLVER.with(|cell| {
1011        let mut resolver = cell.borrow_mut();
1012        let resolver = resolver.as_mut()?;
1013        if resolver.values.is_none() {
1014            resolver.values = resolver
1015                .store
1016                .as_ref()
1017                .map(|store| store.vault_kv_snapshot());
1018        }
1019        let values = resolver.values.as_ref()?;
1020        values.get(&key).cloned().or_else(|| {
1021            key.strip_prefix("red.vault/").and_then(|rest| {
1022                values
1023                    .get(rest)
1024                    .cloned()
1025                    .or_else(|| values.get(&format!("red.secret.{rest}")).cloned())
1026            })
1027        })
1028    })
1029}
1030
1031struct SecretResolver {
1032    store: Option<Arc<crate::auth::store::AuthStore>>,
1033    values: Option<HashMap<String, String>>,
1034}
1035
1036pub(super) struct SecretStoreGuard {
1037    previous: Option<SecretResolver>,
1038}
1039
1040impl SecretStoreGuard {
1041    pub(super) fn install(store: Option<Arc<crate::auth::store::AuthStore>>) -> Self {
1042        let previous = CURRENT_SECRET_RESOLVER.with(|cell| {
1043            cell.replace(Some(SecretResolver {
1044                store,
1045                values: None,
1046            }))
1047        });
1048        Self { previous }
1049    }
1050}
1051
1052impl Drop for SecretStoreGuard {
1053    fn drop(&mut self) {
1054        let previous = self.previous.take();
1055        CURRENT_SECRET_RESOLVER.with(|cell| {
1056            cell.replace(previous);
1057        });
1058    }
1059}
1060
1061pub(crate) fn current_config_value(path: &str) -> Option<Value> {
1062    let key = path.to_ascii_lowercase();
1063    CURRENT_CONFIG_RESOLVER.with(|cell| {
1064        let mut resolver = cell.borrow_mut();
1065        let resolver = resolver.as_mut()?;
1066        if resolver.values.is_none() {
1067            resolver.values = Some(latest_config_snapshot(&resolver.db));
1068        }
1069        let values = resolver.values.as_ref()?;
1070        values.get(&key).cloned().or_else(|| {
1071            key.strip_prefix("red.config/")
1072                .and_then(|rest| values.get(&format!("red.config.{rest}")).cloned())
1073        })
1074    })
1075}
1076
1077fn update_current_config_value(path: &str, value: Value) {
1078    let key = path.to_ascii_lowercase();
1079    CURRENT_CONFIG_RESOLVER.with(|cell| {
1080        if let Some(resolver) = cell.borrow_mut().as_mut() {
1081            if let Some(values) = resolver.values.as_mut() {
1082                values.insert(key, value);
1083            }
1084        }
1085    });
1086}
1087
1088fn update_current_secret_value(path: &str, value: Option<String>) {
1089    let key = path.to_ascii_lowercase();
1090    CURRENT_SECRET_RESOLVER.with(|cell| {
1091        if let Some(resolver) = cell.borrow_mut().as_mut() {
1092            let Some(values) = resolver.values.as_mut() else {
1093                return;
1094            };
1095            match value {
1096                Some(value) => {
1097                    values.insert(key, value);
1098                }
1099                None => {
1100                    values.remove(&key);
1101                }
1102            }
1103        }
1104    });
1105}
1106
1107fn latest_config_snapshot(db: &RedDB) -> HashMap<String, Value> {
1108    let mut latest: HashMap<String, (u64, Value)> = HashMap::new();
1109
1110    if let Some(manager) = db.store().get_collection("red_config") {
1111        manager.for_each_entity(|entity| {
1112            let Some(row) = entity.data.as_row() else {
1113                return true;
1114            };
1115            let Some(Value::Text(key)) = row.get_field("key") else {
1116                return true;
1117            };
1118            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
1119            let id = entity.id.raw();
1120            let key = key.to_ascii_lowercase();
1121            insert_latest_config_value(&mut latest, key.clone(), id, value.clone());
1122            if let Some(rest) = key.strip_prefix("red.config.") {
1123                insert_latest_config_value(&mut latest, format!("red.config/{rest}"), id, value);
1124            }
1125            true
1126        });
1127    }
1128
1129    if let Some(manager) = db.store().get_collection("red.config") {
1130        manager.for_each_entity(|entity| {
1131            let Some(row) = entity.data.as_row() else {
1132                return true;
1133            };
1134            if matches!(row.get_field("tombstone"), Some(Value::Boolean(true))) {
1135                return true;
1136            }
1137            let Some(Value::Text(key)) = row.get_field("key") else {
1138                return true;
1139            };
1140            let value = row.get_field("value").cloned().unwrap_or(Value::Null);
1141            insert_latest_config_value(
1142                &mut latest,
1143                format!("red.config/{}", key.to_ascii_lowercase()),
1144                entity.id.raw(),
1145                value,
1146            );
1147            true
1148        });
1149    }
1150
1151    latest
1152        .into_iter()
1153        .map(|(key, (_, value))| (key, value))
1154        .collect()
1155}
1156
1157fn insert_latest_config_value(
1158    latest: &mut HashMap<String, (u64, Value)>,
1159    key: String,
1160    id: u64,
1161    value: Value,
1162) {
1163    match latest.get(&key) {
1164        Some((prev_id, _)) if *prev_id > id => {}
1165        _ => {
1166            latest.insert(key, (id, value));
1167        }
1168    }
1169}
1170
1171struct ConfigResolver {
1172    db: Arc<RedDB>,
1173    values: Option<HashMap<String, Value>>,
1174}
1175
1176pub(super) struct ConfigSnapshotGuard {
1177    previous: Option<ConfigResolver>,
1178}
1179
1180impl ConfigSnapshotGuard {
1181    pub(super) fn install(db: Arc<RedDB>) -> Self {
1182        let previous = CURRENT_CONFIG_RESOLVER
1183            .with(|cell| cell.replace(Some(ConfigResolver { db, values: None })));
1184        Self { previous }
1185    }
1186}
1187
1188impl Drop for ConfigSnapshotGuard {
1189    fn drop(&mut self) {
1190        let previous = self.previous.take();
1191        CURRENT_CONFIG_RESOLVER.with(|cell| {
1192            cell.replace(previous);
1193        });
1194    }
1195}
1196
1197/// Install the MVCC snapshot used by the current thread for the duration
1198/// of one statement. Paired with `clear_current_snapshot()` — callers
1199/// should prefer the `CurrentSnapshotGuard` RAII wrapper so early returns
1200/// still clean up.
1201pub fn set_current_snapshot(ctx: SnapshotContext) {
1202    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = Some(ctx));
1203    HAS_SNAPSHOT.with(|c| c.set(true));
1204}
1205
1206pub fn clear_current_snapshot() {
1207    CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = None);
1208    HAS_SNAPSHOT.with(|c| c.set(false));
1209}
1210
1211/// Drop-guard that restores the previous snapshot on scope exit. Safe to
1212/// nest — each statement saves the caller's snapshot and puts it back
1213/// instead of blindly clearing, so a top-level `execute_query` called
1214/// from inside another statement dispatch (e.g. vector source subqueries)
1215/// doesn't strip visibility from the outer scan.
1216pub(crate) struct CurrentSnapshotGuard {
1217    previous: Option<SnapshotContext>,
1218}
1219
1220impl CurrentSnapshotGuard {
1221    pub(crate) fn install(ctx: SnapshotContext) -> Self {
1222        let previous = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
1223        set_current_snapshot(ctx);
1224        Self { previous }
1225    }
1226}
1227
1228impl Drop for CurrentSnapshotGuard {
1229    fn drop(&mut self) {
1230        let prev = self.previous.take();
1231        let has = prev.is_some();
1232        CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = prev);
1233        HAS_SNAPSHOT.with(|c| c.set(has));
1234    }
1235}
1236
1237/// Is this entity visible under the current thread's MVCC snapshot?
1238///
1239/// Returns `true` (no filtering) when no snapshot is installed — that
1240/// path is used by embedded callers and by operations that intentionally
1241/// bypass MVCC (VACUUM, snapshot export, admin introspection).
1242///
1243/// When a snapshot is installed the result is
1244///   `snapshot.sees(xmin, xmax) && !mgr.is_aborted(xmin) && !xmax_half_abort`
1245/// where `xmax_half_abort` re-grants visibility for tuples whose
1246/// deleting transaction rolled back.
1247#[inline]
1248pub fn entity_visible_under_current_snapshot(
1249    entity: &crate::storage::unified::entity::UnifiedEntity,
1250) -> bool {
1251    // Fast path — one `Cell<bool>` read, no RefCell borrow. Autocommit
1252    // reads (no active MVCC transaction) still hide superseded physical
1253    // versions while avoiding a full snapshot-context lookup.
1254    // This runs on every row of every scan; the slow path only fires
1255    // inside an explicit transaction.
1256    if !HAS_SNAPSHOT.with(|c| c.get()) {
1257        return entity.xmax == 0;
1258    }
1259    CURRENT_SNAPSHOT.with(|cell| {
1260        let guard = cell.borrow();
1261        let Some(ctx) = guard.as_ref() else {
1262            return true;
1263        };
1264        visibility_check(ctx, entity.xmin, entity.xmax)
1265    })
1266}
1267
1268/// Direct visibility check from raw `(xmin, xmax)` — bypasses the
1269/// entity borrow for callers that already decomposed the tuple (e.g.
1270/// pre-materialized scan caches). Same semantics as
1271/// `entity_visible_under_current_snapshot`.
1272#[inline]
1273pub(crate) fn xids_visible_under_current_snapshot(xmin: u64, xmax: u64) -> bool {
1274    if !HAS_SNAPSHOT.with(|c| c.get()) {
1275        return true;
1276    }
1277    CURRENT_SNAPSHOT.with(|cell| {
1278        let guard = cell.borrow();
1279        let Some(ctx) = guard.as_ref() else {
1280            return true;
1281        };
1282        visibility_check(ctx, xmin, xmax)
1283    })
1284}
1285
1286/// Clone the current thread's snapshot context. Parallel scan paths
1287/// (`query_all_zoned` with `std::thread::scope`) call this on the main
1288/// thread *before* spawning workers so the captured `SnapshotContext`
1289/// can be moved into every worker closure. Worker threads do not
1290/// inherit thread-locals, so calling `entity_visible_under_current_snapshot`
1291/// from inside a spawned closure would silently skip the filter.
1292pub fn capture_current_snapshot() -> Option<SnapshotContext> {
1293    CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone())
1294}
1295
1296/// Whether the active read snapshot may need historical tuple versions
1297/// that the current secondary indexes cannot prove. Index paths can still
1298/// recheck visible candidates, but only a heap scan can discover versions
1299/// whose indexed value was changed or deleted after this snapshot.
1300pub(crate) fn current_snapshot_requires_index_fallback() -> bool {
1301    if !HAS_SNAPSHOT.with(|c| c.get()) {
1302        return false;
1303    }
1304    CURRENT_SNAPSHOT.with(|cell| {
1305        cell.borrow()
1306            .as_ref()
1307            .is_some_and(|ctx| ctx.requires_index_fallback)
1308    })
1309}
1310
1311/// Frozen MVCC + identity context for callers that need to reinstall
1312/// the same view across thread-local boundaries — long-lived cursors,
1313/// background batchers, anything that detaches from the dispatch path
1314/// and re-enters later.
1315///
1316/// The bundle bakes in the three thread-locals every read path
1317/// consults: `SnapshotContext` (MVCC visibility), the auth identity
1318/// (RLS policy gate), and the tenant id (RLS scope). A FETCH that
1319/// reinstalls the bundle sees exactly the same rows as the DECLARE
1320/// would have, regardless of writes that landed in between.
1321///
1322/// Cheap to clone — `SnapshotContext` is a clone of three
1323/// `Arc`-backed fields, identity is a `(String, Role)`, tenant is a
1324/// `String`. None of these contend with the read path.
1325#[derive(Clone, Default)]
1326pub struct SnapshotBundle {
1327    pub snapshot: Option<SnapshotContext>,
1328    pub auth: Option<(String, crate::auth::Role)>,
1329    pub tenant: Option<String>,
1330}
1331
1332/// Capture the three read-path thread-locals into a `SnapshotBundle`.
1333/// Pairs with `with_snapshot_bundle` for re-entry.
1334pub fn snapshot_bundle() -> SnapshotBundle {
1335    SnapshotBundle {
1336        snapshot: capture_current_snapshot(),
1337        auth: current_auth_identity(),
1338        tenant: CURRENT_TENANT_ID.with(|cell| cell.borrow().clone()),
1339    }
1340}
1341
1342/// Reinstall a captured `SnapshotBundle` for the duration of `f`.
1343/// Restores the caller's previous thread-locals on exit (panic-safe via
1344/// the explicit guard struct so a panic in `f` cannot leak the
1345/// installed identity into the worker's next request).
1346pub fn with_snapshot_bundle<R>(bundle: &SnapshotBundle, f: impl FnOnce() -> R) -> R {
1347    struct Guard {
1348        prev_snapshot: Option<SnapshotContext>,
1349        prev_auth: Option<(String, crate::auth::Role)>,
1350        prev_tenant: Option<String>,
1351    }
1352    impl Drop for Guard {
1353        fn drop(&mut self) {
1354            let snap = self.prev_snapshot.take();
1355            let has = snap.is_some();
1356            CURRENT_SNAPSHOT.with(|cell| *cell.borrow_mut() = snap);
1357            HAS_SNAPSHOT.with(|c| c.set(has));
1358            CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = self.prev_auth.take());
1359            CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = self.prev_tenant.take());
1360        }
1361    }
1362
1363    let _guard = {
1364        let prev_snapshot = CURRENT_SNAPSHOT.with(|cell| cell.borrow().clone());
1365        let prev_auth = CURRENT_AUTH_IDENTITY.with(|cell| cell.borrow().clone());
1366        let prev_tenant = CURRENT_TENANT_ID.with(|cell| cell.borrow().clone());
1367
1368        match bundle.snapshot.clone() {
1369            Some(ctx) => set_current_snapshot(ctx),
1370            None => clear_current_snapshot(),
1371        }
1372        CURRENT_AUTH_IDENTITY.with(|cell| *cell.borrow_mut() = bundle.auth.clone());
1373        CURRENT_TENANT_ID.with(|cell| *cell.borrow_mut() = bundle.tenant.clone());
1374
1375        Guard {
1376            prev_snapshot,
1377            prev_auth,
1378            prev_tenant,
1379        }
1380    };
1381    f()
1382}
1383
1384/// Apply the same visibility rules used by the thread-local helpers
1385/// against a caller-provided context. Intended for parallel workers
1386/// that captured the snapshot with `capture_current_snapshot()`.
1387#[inline]
1388pub fn entity_visible_with_context(
1389    ctx: Option<&SnapshotContext>,
1390    entity: &crate::storage::unified::entity::UnifiedEntity,
1391) -> bool {
1392    match ctx {
1393        Some(ctx) => visibility_check(ctx, entity.xmin, entity.xmax),
1394        None => true,
1395    }
1396}
1397
1398fn table_row_index_fields(
1399    entity: &crate::storage::unified::entity::UnifiedEntity,
1400) -> Vec<(String, crate::storage::schema::Value)> {
1401    let crate::storage::EntityData::Row(row) = &entity.data else {
1402        return Vec::new();
1403    };
1404    if let Some(named) = &row.named {
1405        return named
1406            .iter()
1407            .map(|(name, value)| (name.clone(), value.clone()))
1408            .collect();
1409    }
1410    if let Some(schema) = &row.schema {
1411        return schema
1412            .iter()
1413            .zip(row.columns.iter())
1414            .map(|(name, value)| (name.clone(), value.clone()))
1415            .collect();
1416    }
1417    Vec::new()
1418}
1419
1420#[inline]
1421fn visibility_check(ctx: &SnapshotContext, xmin: u64, xmax: u64) -> bool {
1422    // Writer aborted → tuple never existed from any future reader's view.
1423    // Checked *before* the own-xids fast path so an aborted own-sub-xid
1424    // (rolled-back savepoint) stays hidden from the parent.
1425    if xmin != 0 && ctx.manager.is_aborted(xmin) {
1426        return false;
1427    }
1428    // Deleter aborted → treat xmax as unset; fall back to xmin-only check.
1429    let effective_xmax = if xmax != 0 && ctx.manager.is_aborted(xmax) {
1430        0
1431    } else {
1432        xmax
1433    };
1434    // Phase 2.3.2e: own-tx writes are always visible to the connection
1435    // that stamped them, even when xmin/xmax exceed `snapshot.xid` (as
1436    // happens for sub-xids allocated by SAVEPOINT after BEGIN).
1437    let own_xmin = xmin != 0 && ctx.own_xids.contains(&xmin);
1438    let own_xmax = effective_xmax != 0 && ctx.own_xids.contains(&effective_xmax);
1439    if own_xmax {
1440        // This connection deleted the row via this xid — hide it from self.
1441        return false;
1442    }
1443    if own_xmin {
1444        return true;
1445    }
1446    ctx.snapshot.sees(xmin, effective_xmax)
1447}
1448
1449fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
1450    runtime
1451        .inner
1452        .pool
1453        .lock()
1454        .unwrap_or_else(|poisoned| poisoned.into_inner())
1455}
1456
/// Is `name` one of the graph-analytics table-valued functions
/// recognized in FROM position? Matching is ASCII case-insensitive.
/// Both the graph-collection form and the inline `nodes => / edges =>`
/// form (issue #799) accept these names.
fn is_graph_tvf_name(name: &str) -> bool {
    const GRAPH_TVF_NAMES: [&str; 7] = [
        "components",
        "louvain",
        "degree_centrality",
        "shortest_path",
        "betweenness",
        "eigenvector",
        "pagerank",
    ];
    GRAPH_TVF_NAMES
        .iter()
        .any(|known| name.eq_ignore_ascii_case(known))
}
1469
1470/// Map a declared `WITH ANALYTICS` view to the concrete graph algorithm name
1471/// and named-argument list that [`RedDBRuntime::dispatch_graph_algorithm`]
1472/// consumes (issue #800). The `using` option selects the algorithm inside the
1473/// output family; unsupported algorithms and the options that do not apply to
1474/// the chosen algorithm are rejected so a view never silently ignores a
1475/// declared parameter.
1476fn analytics_view_algorithm(
1477    graph: &str,
1478    view: &crate::catalog::AnalyticsViewDescriptor,
1479) -> RedDBResult<(String, Vec<(String, f64)>)> {
1480    use crate::catalog::AnalyticsOutput;
1481
1482    let mut named_args: Vec<(String, f64)> = Vec::new();
1483    let algorithm = match view.output {
1484        AnalyticsOutput::Communities => {
1485            let algo = view.algorithm.as_deref().unwrap_or("louvain");
1486            if !algo.eq_ignore_ascii_case("louvain") {
1487                return Err(RedDBError::Query(format!(
1488                    "analytics output 'communities' on graph '{graph}' has unsupported algorithm '{algo}' (expected louvain)"
1489                )));
1490            }
1491            if let Some(resolution) = view.resolution {
1492                named_args.push(("resolution".to_string(), resolution));
1493            }
1494            "louvain".to_string()
1495        }
1496        AnalyticsOutput::Components => {
1497            if let Some(algo) = view.algorithm.as_deref() {
1498                if !algo.eq_ignore_ascii_case("components")
1499                    && !algo.eq_ignore_ascii_case("connected_components")
1500                {
1501                    return Err(RedDBError::Query(format!(
1502                        "analytics output 'components' on graph '{graph}' has unsupported algorithm '{algo}' (expected connected_components)"
1503                    )));
1504                }
1505            }
1506            "components".to_string()
1507        }
1508        AnalyticsOutput::Centrality => {
1509            let algo = view
1510                .algorithm
1511                .as_deref()
1512                .unwrap_or("pagerank")
1513                .to_ascii_lowercase();
1514            match algo.as_str() {
1515                "pagerank" => {
1516                    if let Some(max_iterations) = view.max_iterations {
1517                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1518                    }
1519                }
1520                "eigenvector" => {
1521                    if let Some(max_iterations) = view.max_iterations {
1522                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1523                    }
1524                    if let Some(tolerance) = view.tolerance {
1525                        named_args.push(("tolerance".to_string(), tolerance));
1526                    }
1527                }
1528                "betweenness" => {}
1529                other => {
1530                    return Err(RedDBError::Query(format!(
1531                        "analytics output 'centrality' on graph '{graph}' has unsupported algorithm '{other}' (expected pagerank, betweenness, or eigenvector)"
1532                    )));
1533                }
1534            }
1535            algo
1536        }
1537    };
1538    Ok((algorithm, named_args))
1539}
1540
1541/// Reject any named arguments for a TVF that accepts none.
1542fn reject_named_args(name: &str, named_args: &[(String, f64)]) -> RedDBResult<()> {
1543    if let Some((key, _)) = named_args.first() {
1544        return Err(RedDBError::Query(format!(
1545            "table function '{name}' has no named argument '{key}'"
1546        )));
1547    }
1548    Ok(())
1549}
1550
1551/// Resolve louvain's optional `resolution` named arg (γ, default 1.0). Any
1552/// other named key, or a non-finite / non-positive resolution, is rejected.
1553fn louvain_resolution(named_args: &[(String, f64)]) -> RedDBResult<f64> {
1554    let mut resolution = 1.0_f64;
1555    for (key, value) in named_args {
1556        if key.eq_ignore_ascii_case("resolution") {
1557            if !value.is_finite() || *value <= 0.0 {
1558                return Err(RedDBError::Query(format!(
1559                    "table function 'louvain' resolution must be > 0, got {value}"
1560                )));
1561            }
1562            resolution = *value;
1563        } else {
1564            return Err(RedDBError::Query(format!(
1565                "table function 'louvain' has no named argument '{key}' (expected 'resolution')"
1566            )));
1567        }
1568    }
1569    Ok(resolution)
1570}
1571
1572/// Undirected degree centrality over abstract inputs: each edge contributes
1573/// 1 to both of its endpoints. Returns `(node_id, degree)` deterministically
1574/// in ascending node-id order, so identical input always yields identical
1575/// rows.
1576fn abstract_degree_centrality(
1577    nodes: &[String],
1578    edges: &[(
1579        String,
1580        String,
1581        crate::storage::engine::graph_algorithms::Weight,
1582    )],
1583) -> Vec<(String, usize)> {
1584    let mut degree: std::collections::BTreeMap<String, usize> = std::collections::BTreeMap::new();
1585    for n in nodes {
1586        degree.entry(n.clone()).or_insert(0);
1587    }
1588    for (a, b, _w) in edges {
1589        *degree.entry(a.clone()).or_insert(0) += 1;
1590        *degree.entry(b.clone()).or_insert(0) += 1;
1591    }
1592    degree.into_iter().collect()
1593}
1594
1595/// Ordered column names for a materialized subquery result: the projection
1596/// columns when present, else the first record's field order.
1597fn ordered_result_columns(result: &crate::storage::query::unified::UnifiedResult) -> Vec<String> {
1598    if !result.columns.is_empty() {
1599        return result.columns.clone();
1600    }
1601    result
1602        .records
1603        .first()
1604        .map(|record| {
1605            record
1606                .column_names()
1607                .iter()
1608                .map(|column| column.to_string())
1609                .collect()
1610        })
1611        .unwrap_or_default()
1612}
1613
1614/// Canonical node-id string for a cell value, so the node universe (from the
1615/// `nodes` subquery) and the edge endpoints (from the `edges` subquery)
1616/// compare equal regardless of integer-vs-text typing. `Null` is not a node.
1617fn value_to_node_id(value: &crate::storage::schema::Value) -> Option<String> {
1618    use crate::storage::schema::Value;
1619    match value {
1620        Value::Null => None,
1621        Value::Text(s) => Some(s.to_string()),
1622        Value::Integer(n) => Some(n.to_string()),
1623        Value::UnsignedInteger(n) => Some(n.to_string()),
1624        Value::NodeRef(s) => Some(s.clone()),
1625        other => Some(other.to_string()),
1626    }
1627}
1628
1629/// Numeric edge weight from a cell value (the optional third `edges` column).
1630fn value_to_weight(value: &crate::storage::schema::Value) -> Option<f32> {
1631    use crate::storage::schema::Value;
1632    match value {
1633        Value::Float(f) => Some(*f as f32),
1634        Value::Integer(n) => Some(*n as f32),
1635        Value::UnsignedInteger(n) => Some(*n as f32),
1636        _ => None,
1637    }
1638}
1639
1640/// Build the node universe from a materialized `nodes` subquery result: the
1641/// first projected column of each row is the node id (issue #799). Zero rows
1642/// is a valid empty node set; a row set with no columns is a shape error.
1643fn inline_node_ids(
1644    name: &str,
1645    result: &crate::storage::query::unified::UnifiedResult,
1646) -> RedDBResult<Vec<String>> {
1647    if result.records.is_empty() {
1648        return Ok(Vec::new());
1649    }
1650    let columns = ordered_result_columns(result);
1651    let Some(first_col) = columns.first() else {
1652        return Err(RedDBError::Query(format!(
1653            "table function '{name}' inline form: `nodes` subquery must project at least one column (the node id)"
1654        )));
1655    };
1656    let mut ids = Vec::with_capacity(result.records.len());
1657    for record in &result.records {
1658        if let Some(id) = record.get(first_col).and_then(value_to_node_id) {
1659            ids.push(id);
1660        }
1661    }
1662    Ok(ids)
1663}
1664
1665/// Build the edge list from a materialized `edges` subquery result: the first
1666/// two projected columns are `(source, target)` and an optional third column
1667/// is the numeric weight (defaulting to 1.0). Fewer than two columns is a
1668/// shape error (issue #799).
1669fn inline_edges(
1670    name: &str,
1671    result: &crate::storage::query::unified::UnifiedResult,
1672) -> RedDBResult<
1673    Vec<(
1674        String,
1675        String,
1676        crate::storage::engine::graph_algorithms::Weight,
1677    )>,
1678> {
1679    if result.records.is_empty() {
1680        return Ok(Vec::new());
1681    }
1682    let columns = ordered_result_columns(result);
1683    if columns.len() < 2 {
1684        return Err(RedDBError::Query(format!(
1685            "table function '{name}' inline form: `edges` subquery must project at least two columns (source, target), got {}",
1686            columns.len()
1687        )));
1688    }
1689    let src_col = &columns[0];
1690    let dst_col = &columns[1];
1691    let weight_col = columns.get(2);
1692    let mut edges = Vec::with_capacity(result.records.len());
1693    for record in &result.records {
1694        let (Some(src), Some(dst)) = (
1695            record.get(src_col).and_then(value_to_node_id),
1696            record.get(dst_col).and_then(value_to_node_id),
1697        ) else {
1698            // A null/absent endpoint is not a valid edge; skip it.
1699            continue;
1700        };
1701        let weight = match weight_col {
1702            Some(col) => match record.get(col) {
1703                None | Some(crate::storage::schema::Value::Null) => 1.0,
1704                Some(value) => value_to_weight(value).ok_or_else(|| {
1705                    RedDBError::Query(format!(
1706                        "table function '{name}' inline form: `edges` weight column must be numeric"
1707                    ))
1708                })?,
1709            },
1710            None => 1.0,
1711        };
1712        edges.push((src, dst, weight));
1713    }
1714    Ok(edges)
1715}
1716
1717fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
1718    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
1719        return;
1720    }
1721    scopes.insert(name.to_string());
1722}
1723
1724fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
1725    match query.source.as_ref() {
1726        Some(crate::storage::query::ast::TableSource::Name(name)) => {
1727            cache_scope_insert(scopes, name)
1728        }
1729        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
1730            collect_query_expr_result_cache_scopes(scopes, subquery);
1731        }
1732        // Graph-collection TVFs (e.g. `louvain(g)`) read the graph store
1733        // read-only. The result is now cached (issue #802) and scoped to the
1734        // graph collection named in the first argument, so any mutation on
1735        // that collection (`INSERT INTO g NODE/EDGE …`) invalidates the
1736        // entry via `invalidate_result_cache_for_table`. Non-graph or
1737        // zero-arg functions contribute no scope.
1738        Some(crate::storage::query::ast::TableSource::Function { name, args, .. }) => {
1739            if is_graph_tvf_name(name) {
1740                if let Some(graph) = args.first() {
1741                    cache_scope_insert(scopes, graph);
1742                }
1743            }
1744        }
1745        // The inline-graph form reads ordinary tables/docs through its
1746        // `nodes`/`edges` subqueries, so its result cache must be scoped to
1747        // those source collections — mutating any of them invalidates the
1748        // cached result (issue #799).
1749        Some(crate::storage::query::ast::TableSource::InlineGraphFunction {
1750            nodes, edges, ..
1751        }) => {
1752            collect_query_expr_result_cache_scopes(scopes, nodes);
1753            collect_query_expr_result_cache_scopes(scopes, edges);
1754        }
1755        None => cache_scope_insert(scopes, &query.table),
1756    }
1757}
1758
1759fn collect_vector_source_scopes(
1760    scopes: &mut HashSet<String>,
1761    source: &crate::storage::query::ast::VectorSource,
1762) {
1763    match source {
1764        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
1765            cache_scope_insert(scopes, collection);
1766        }
1767        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
1768            collect_query_expr_result_cache_scopes(scopes, subquery);
1769        }
1770        crate::storage::query::ast::VectorSource::Literal(_)
1771        | crate::storage::query::ast::VectorSource::Text(_) => {}
1772    }
1773}
1774
1775fn collect_path_selector_scopes(
1776    scopes: &mut HashSet<String>,
1777    selector: &crate::storage::query::ast::NodeSelector,
1778) {
1779    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
1780        cache_scope_insert(scopes, table);
1781    }
1782}
1783
1784fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
1785    match expr {
1786        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
1787        QueryExpr::Join(query) => {
1788            collect_query_expr_result_cache_scopes(scopes, &query.left);
1789            collect_query_expr_result_cache_scopes(scopes, &query.right);
1790        }
1791        QueryExpr::Path(query) => {
1792            collect_path_selector_scopes(scopes, &query.from);
1793            collect_path_selector_scopes(scopes, &query.to);
1794        }
1795        QueryExpr::Vector(query) => {
1796            cache_scope_insert(scopes, &query.collection);
1797            collect_vector_source_scopes(scopes, &query.query_vector);
1798        }
1799        QueryExpr::Hybrid(query) => {
1800            collect_query_expr_result_cache_scopes(scopes, &query.structured);
1801            cache_scope_insert(scopes, &query.vector.collection);
1802            collect_vector_source_scopes(scopes, &query.vector.query_vector);
1803        }
1804        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
1805        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
1806        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
1807        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
1808        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
1809        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
1810        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
1811        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
1812        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
1813        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
1814        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
1815        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
1816        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
1817        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
1818        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
1819        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
1820        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1821        QueryExpr::CreateMetric(query) => cache_scope_insert(scopes, &query.path),
1822        QueryExpr::AlterMetric(query) => cache_scope_insert(scopes, &query.path),
1823        QueryExpr::CreateSlo(query) => cache_scope_insert(scopes, &query.path),
1824        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1825        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
1826        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
1827        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
1828        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
1829        QueryExpr::QueueCommand(query) => match query {
1830            QueueCommand::Push { queue, .. }
1831            | QueueCommand::Pop { queue, .. }
1832            | QueueCommand::Peek { queue, .. }
1833            | QueueCommand::Len { queue }
1834            | QueueCommand::Purge { queue }
1835            | QueueCommand::GroupCreate { queue, .. }
1836            | QueueCommand::GroupRead { queue, .. }
1837            | QueueCommand::Pending { queue, .. }
1838            | QueueCommand::Claim { queue, .. }
1839            | QueueCommand::Ack { queue, .. }
1840            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
1841            QueueCommand::Move {
1842                source,
1843                destination,
1844                ..
1845            } => {
1846                cache_scope_insert(scopes, source);
1847                cache_scope_insert(scopes, destination);
1848            }
1849        },
1850        QueryExpr::EventsBackfill(query) => {
1851            cache_scope_insert(scopes, &query.collection);
1852            cache_scope_insert(scopes, &query.target_queue);
1853        }
1854        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
1855        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
1856        QueryExpr::TreeCommand(query) => match query {
1857            TreeCommand::Insert { collection, .. }
1858            | TreeCommand::Move { collection, .. }
1859            | TreeCommand::Delete { collection, .. }
1860            | TreeCommand::Validate { collection, .. }
1861            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
1862        },
1863        QueryExpr::SearchCommand(query) => match query {
1864            SearchCommand::Similar { collection, .. }
1865            | SearchCommand::Hybrid { collection, .. }
1866            | SearchCommand::SpatialRadius { collection, .. }
1867            | SearchCommand::SpatialBbox { collection, .. }
1868            | SearchCommand::SpatialNearest { collection, .. } => {
1869                cache_scope_insert(scopes, collection);
1870            }
1871            SearchCommand::Text { collection, .. }
1872            | SearchCommand::Multimodal { collection, .. }
1873            | SearchCommand::Index { collection, .. }
1874            | SearchCommand::Context { collection, .. } => {
1875                if let Some(collection) = collection.as_deref() {
1876                    cache_scope_insert(scopes, collection);
1877                }
1878            }
1879        },
1880        QueryExpr::Ask(query) => {
1881            if let Some(collection) = query.collection.as_deref() {
1882                cache_scope_insert(scopes, collection);
1883            }
1884        }
1885        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
1886        QueryExpr::MaintenanceCommand(cmd) => match cmd {
1887            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
1888            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
1889                if let Some(t) = target {
1890                    cache_scope_insert(scopes, t);
1891                }
1892            }
1893        },
1894        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
1895        QueryExpr::CreateView(cmd) => {
1896            cache_scope_insert(scopes, &cmd.name);
1897            // Invalidating the view should also invalidate its dependencies.
1898            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
1899        }
1900        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
1901        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
1902        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1903        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1904        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
1905        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1906        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1907        QueryExpr::Graph(_)
1908        | QueryExpr::GraphCommand(_)
1909        | QueryExpr::ProbabilisticCommand(_)
1910        | QueryExpr::SetConfig { .. }
1911        | QueryExpr::ShowConfig { .. }
1912        | QueryExpr::SetSecret { .. }
1913        | QueryExpr::DeleteSecret { .. }
1914        | QueryExpr::ShowSecrets { .. }
1915        | QueryExpr::SetTenant(_)
1916        | QueryExpr::ShowTenant
1917        | QueryExpr::TransactionControl(_)
1918        | QueryExpr::CreateSchema(_)
1919        | QueryExpr::DropSchema(_)
1920        | QueryExpr::CreateSequence(_)
1921        | QueryExpr::DropSequence(_)
1922        | QueryExpr::Grant(_)
1923        | QueryExpr::Revoke(_)
1924        | QueryExpr::AlterUser(_)
1925        | QueryExpr::CreateIamPolicy { .. }
1926        | QueryExpr::DropIamPolicy { .. }
1927        | QueryExpr::AttachPolicy { .. }
1928        | QueryExpr::DetachPolicy { .. }
1929        | QueryExpr::ShowPolicies { .. }
1930        | QueryExpr::ShowEffectivePermissions { .. }
1931        | QueryExpr::SimulatePolicy { .. }
1932        | QueryExpr::LintPolicy { .. }
1933        | QueryExpr::MigratePolicyMode { .. }
1934        | QueryExpr::CreateMigration(_)
1935        | QueryExpr::ApplyMigration(_)
1936        | QueryExpr::RollbackMigration(_)
1937        | QueryExpr::ExplainMigration(_)
1938        | QueryExpr::EventsBackfillStatus { .. } => {}
1939        QueryExpr::KvCommand(cmd) => {
1940            use crate::storage::query::ast::KvCommand;
1941            match cmd {
1942                KvCommand::Put { collection, .. }
1943                | KvCommand::InvalidateTags { collection, .. }
1944                | KvCommand::Get { collection, .. }
1945                | KvCommand::Unseal { collection, .. }
1946                | KvCommand::Rotate { collection, .. }
1947                | KvCommand::History { collection, .. }
1948                | KvCommand::List { collection, .. }
1949                | KvCommand::Purge { collection, .. }
1950                | KvCommand::Watch { collection, .. }
1951                | KvCommand::Delete { collection, .. }
1952                | KvCommand::Incr { collection, .. }
1953                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
1954            }
1955        }
1956        QueryExpr::ConfigCommand(cmd) => {
1957            use crate::storage::query::ast::ConfigCommand;
1958            match cmd {
1959                ConfigCommand::Put { collection, .. }
1960                | ConfigCommand::Get { collection, .. }
1961                | ConfigCommand::Resolve { collection, .. }
1962                | ConfigCommand::Rotate { collection, .. }
1963                | ConfigCommand::Delete { collection, .. }
1964                | ConfigCommand::History { collection, .. }
1965                | ConfigCommand::List { collection, .. }
1966                | ConfigCommand::Watch { collection, .. }
1967                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
1968                    cache_scope_insert(scopes, collection)
1969                }
1970            }
1971        }
1972    }
1973}
1974
1975/// Combine matching RLS policies for a table + action into a single
1976/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1977///
1978/// Returns `None` when RLS is disabled or no policy admits the caller's
1979/// role — callers use that to short-circuit the mutation (for DELETE /
1980/// UPDATE we simply skip the operation, which PG expresses as "no rows
1981/// match the policy + predicate combination").
1982pub(crate) fn rls_policy_filter(
1983    runtime: &RedDBRuntime,
1984    table: &str,
1985    action: crate::storage::query::ast::PolicyAction,
1986) -> Option<crate::storage::query::ast::Filter> {
1987    rls_policy_filter_for_kind(
1988        runtime,
1989        table,
1990        action,
1991        crate::storage::query::ast::PolicyTargetKind::Table,
1992    )
1993}
1994
1995/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1996/// Graph / vector / queue / timeseries scans pass the concrete kind;
1997/// policies targeting other kinds are ignored. Legacy Table-scoped
1998/// policies still apply cross-kind — callers register auto-tenancy
1999/// policies as Table today.
2000pub(crate) fn rls_policy_filter_for_kind(
2001    runtime: &RedDBRuntime,
2002    table: &str,
2003    action: crate::storage::query::ast::PolicyAction,
2004    kind: crate::storage::query::ast::PolicyTargetKind,
2005) -> Option<crate::storage::query::ast::Filter> {
2006    use crate::storage::query::ast::Filter;
2007
2008    if !runtime.inner.rls_enabled_tables.read().contains(table) {
2009        return None;
2010    }
2011    let role = current_auth_identity().map(|(_, role)| role);
2012    let role_str = role.map(|r| r.as_str().to_string());
2013    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
2014    if policies.is_empty() {
2015        return None;
2016    }
2017    policies
2018        .into_iter()
2019        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
2020}
2021
2022/// Returns true when the table has RLS enforcement enabled. Convenience
2023/// shortcut so DML paths can gate the AND-combine work without reaching
2024/// into `runtime.inner.rls_enabled_tables` directly.
2025pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
2026    runtime.inner.rls_enabled_tables.read().contains(table)
2027}
2028
2029/// Per-entity gate used by the graph materialiser for `GraphNode`
2030/// entities. RLS is checked against the source collection with
2031/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
2032/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
2033/// (for back-compat with auto-tenancy declarations). Cached per
2034/// collection so big graphs only resolve the policy chain once.
2035fn node_passes_rls(
2036    runtime: &RedDBRuntime,
2037    collection: &str,
2038    role: Option<&str>,
2039    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
2040    entity: &crate::storage::unified::entity::UnifiedEntity,
2041) -> bool {
2042    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
2043
2044    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
2045        return true;
2046    }
2047    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
2048        let policies = runtime.matching_rls_policies_for_kind(
2049            collection,
2050            role,
2051            PolicyAction::Select,
2052            PolicyTargetKind::Nodes,
2053        );
2054        if policies.is_empty() {
2055            None
2056        } else {
2057            policies
2058                .into_iter()
2059                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
2060        }
2061    });
2062    let Some(filter) = filter else {
2063        return false;
2064    };
2065    crate::runtime::query_exec::evaluate_entity_filter_with_db(
2066        Some(&runtime.inner.db),
2067        entity,
2068        filter,
2069        collection,
2070        collection,
2071    )
2072}
2073
2074/// Edge counterpart of `node_passes_rls`. Same caching strategy with
2075/// `kind = Edges`.
2076fn edge_passes_rls(
2077    runtime: &RedDBRuntime,
2078    collection: &str,
2079    role: Option<&str>,
2080    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
2081    entity: &crate::storage::unified::entity::UnifiedEntity,
2082) -> bool {
2083    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
2084
2085    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
2086        return true;
2087    }
2088    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
2089        let policies = runtime.matching_rls_policies_for_kind(
2090            collection,
2091            role,
2092            PolicyAction::Select,
2093            PolicyTargetKind::Edges,
2094        );
2095        if policies.is_empty() {
2096            None
2097        } else {
2098            policies
2099                .into_iter()
2100                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
2101        }
2102    });
2103    let Some(filter) = filter else {
2104        return false;
2105    };
2106    crate::runtime::query_exec::evaluate_entity_filter_with_db(
2107        Some(&runtime.inner.db),
2108        entity,
2109        filter,
2110        collection,
2111        collection,
2112    )
2113}
2114
2115/// RLS policy injection (Phase 2.5.2 PG parity).
2116///
2117/// Fetch every matching policy for the current thread-local role and
2118/// fold them into the query's filter. Semantics mirror PostgreSQL:
2119///
2120/// * Multiple policies on the same table combine with **OR** — a row is
2121///   visible if *any* policy admits it.
2122/// * The combined policy predicate is **AND**-ed into the caller's
2123///   existing `WHERE` clause so explicit predicates continue to trim
2124///   the policy-allowed set.
2125/// * No matching policies + RLS enabled = zero rows (PG's
2126///   restrictive-default). Callers get `None` and return an empty
2127///   `UnifiedResult` without ever dispatching the scan.
2128///
2129/// This runs only when `RuntimeInner::rls_enabled_tables` already
2130/// contains the table name — callers gate the hot path upfront to
2131/// avoid the lock acquisition on tables without RLS.
2132///
2133/// Returns `None` when no policy admits the current role; returns
2134/// `Some(mutated_table)` with policy filters folded in otherwise.
2135fn inject_rls_filters(
2136    runtime: &RedDBRuntime,
2137    frame: &dyn super::statement_frame::ReadFrame,
2138    mut table: crate::storage::query::ast::TableQuery,
2139) -> Option<crate::storage::query::ast::TableQuery> {
2140    use crate::storage::query::ast::{Filter, PolicyAction};
2141
2142    // `None` role falls through to policies with no `TO role` clause.
2143    let role = frame.identity().map(|(_, role)| role);
2144    let role_str = role.map(|r| r.as_str().to_string());
2145    let policies =
2146        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
2147
2148    if policies.is_empty() {
2149        // RLS enabled + no policy match = deny everything. Signal the
2150        // caller to short-circuit with an empty result set.
2151        return None;
2152    }
2153
2154    // Combine policy predicates with OR (PG's permissive default).
2155    let combined = policies
2156        .into_iter()
2157        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
2158        .expect("policies non-empty");
2159
2160    // AND into the caller's existing predicate. The predicate may live
2161    // in `where_expr` rather than `filter`: `resolve_table_expr_subqueries`
2162    // nulls `filter` whenever `where_expr` is present (the case for a
2163    // view body rewritten into `SELECT … WHERE …`). Folding only into
2164    // `filter` here would silently drop that `where_expr` predicate at
2165    // eval time because `effective_table_filter` prefers `filter` —
2166    // e.g. `WITHIN TENANT … SELECT * FROM <view>` would apply the
2167    // tenant policy but lose the view's own WHERE (#635).
2168    use crate::storage::query::sql_lowering::{expr_to_filter, filter_to_expr};
2169    let had_where_expr = table.where_expr.is_some();
2170    let existing = table
2171        .filter
2172        .take()
2173        .or_else(|| table.where_expr.as_ref().map(expr_to_filter));
2174    let new_filter = match existing {
2175        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
2176        None => combined,
2177    };
2178    // Keep `where_expr` in lock-step with the merged `filter` so
2179    // whichever the executor consults sees the full predicate.
2180    if had_where_expr {
2181        table.where_expr = Some(filter_to_expr(&new_filter));
2182    }
2183    table.filter = Some(new_filter);
2184    Some(table)
2185}
2186
2187/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
2188/// predicate into the join's outer filter. Walking the merged record
2189/// at the join layer (rather than mutating the per-side scan filter)
2190/// keeps the planner's strategy choice and per-side index selection
2191/// undisturbed — the policy predicate uses the qualified `t.col` form
2192/// that resolves cleanly against the merged record's keys.
2193///
2194/// Returns `None` when any leaf has RLS enabled and no policy admits
2195/// the caller — the join short-circuits to an empty result.
2196fn inject_rls_into_join(
2197    runtime: &RedDBRuntime,
2198    frame: &dyn super::statement_frame::ReadFrame,
2199    mut join: crate::storage::query::ast::JoinQuery,
2200) -> Option<crate::storage::query::ast::JoinQuery> {
2201    use crate::storage::query::ast::Filter;
2202
2203    let mut policy_filters: Vec<Filter> = Vec::new();
2204    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
2205        return None;
2206    }
2207    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
2208        return None;
2209    }
2210
2211    if policy_filters.is_empty() {
2212        return Some(join);
2213    }
2214
2215    let combined = policy_filters
2216        .into_iter()
2217        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
2218        .expect("policy_filters non-empty");
2219
2220    join.filter = Some(match join.filter.take() {
2221        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
2222        None => combined,
2223    });
2224
2225    Some(join)
2226}
2227
2228/// For each `Table` leaf reachable through nested joins, append the
2229/// RLS-policy filter (combined with OR across that side's matching
2230/// policies) into `out`. Returns `false` when a side has RLS enabled
2231/// but no policy admits the caller — the join must short-circuit.
2232fn collect_join_side_policy(
2233    runtime: &RedDBRuntime,
2234    frame: &dyn super::statement_frame::ReadFrame,
2235    expr: &crate::storage::query::ast::QueryExpr,
2236    out: &mut Vec<crate::storage::query::ast::Filter>,
2237) -> bool {
2238    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
2239    match expr {
2240        QueryExpr::Table(t) => {
2241            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
2242                return true;
2243            }
2244            let role = frame.identity().map(|(_, role)| role);
2245            let role_str = role.map(|r| r.as_str().to_string());
2246            let policies =
2247                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
2248            if policies.is_empty() {
2249                return false;
2250            }
2251            let combined = policies
2252                .into_iter()
2253                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
2254                .expect("policies non-empty");
2255            out.push(combined);
2256            true
2257        }
2258        QueryExpr::Join(inner) => {
2259            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
2260                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
2261        }
2262        _ => true,
2263    }
2264}
2265
2266/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
2267///
2268/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
2269/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
2270/// materialises all rows. Projections are best-effort — when the query
2271/// lists explicit columns we keep only those; a `SELECT *` keeps every
2272/// wrapper-emitted field verbatim.
2273///
2274/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
2275/// the runtime will pass the compiled filter down instead of post-filtering.
2276fn apply_foreign_table_filters(
2277    records: Vec<crate::storage::query::unified::UnifiedRecord>,
2278    query: &crate::storage::query::ast::TableQuery,
2279) -> crate::storage::query::unified::UnifiedResult {
2280    use crate::storage::query::sql_lowering::{
2281        effective_table_filter, effective_table_projections,
2282    };
2283    use crate::storage::query::unified::UnifiedResult;
2284
2285    let filter = effective_table_filter(query);
2286    let projections = effective_table_projections(query);
2287
2288    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
2289    // match native-collection queries (same operators, same NULL handling).
2290    let mut filtered: Vec<_> = records
2291        .into_iter()
2292        .filter(|record| match &filter {
2293            Some(f) => {
2294                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
2295            }
2296            None => true,
2297        })
2298        .collect();
2299
2300    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
2301    if let Some(offset) = query.offset {
2302        let offset = offset as usize;
2303        if offset >= filtered.len() {
2304            filtered.clear();
2305        } else {
2306            filtered.drain(0..offset);
2307        }
2308    }
2309    if let Some(limit) = query.limit {
2310        filtered.truncate(limit as usize);
2311    }
2312
2313    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
2314    // the wrapper's column set; an explicit list trims to those names.
2315    let columns: Vec<String> = if projections.is_empty() {
2316        filtered
2317            .first()
2318            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
2319            .unwrap_or_default()
2320    } else {
2321        projections
2322            .iter()
2323            .map(super::join_filter::projection_name)
2324            .collect()
2325    };
2326
2327    let mut result = UnifiedResult::empty();
2328    result.columns = columns;
2329    result.records = filtered;
2330    result
2331}
2332
2333/// Collect every concrete table reference inside a `QueryExpr`.
2334///
2335/// Used by view bookkeeping (dependency tracking for materialised
2336/// invalidation) and any other rewriter that needs to know the base
2337/// tables a query pulls from. Does not descend into projections/filters;
2338/// only the `FROM` side.
2339pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
2340    let mut scopes: HashSet<String> = HashSet::new();
2341    collect_query_expr_result_cache_scopes(&mut scopes, expr);
2342    scopes.into_iter().collect()
2343}
2344
/// Return the set of result-cache scope names for `expr`, as gathered
/// by `collect_query_expr_result_cache_scopes` into a fresh set.
fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
    let mut scopes = HashSet::new();
    collect_query_expr_result_cache_scopes(&mut scopes, expr);
    scopes
}
2350
/// Config key that names the result-cache backend.
const RESULT_CACHE_BACKEND_KEY: &str = "runtime.result_cache.backend";
/// Default backend name — presumably used when the key above is unset;
/// confirm at the site that reads it.
const RESULT_CACHE_DEFAULT_BACKEND: &str = "legacy";
/// Blob-cache namespace string for result-cache entries.
const RESULT_CACHE_BLOB_NAMESPACE: &str = "runtime.result_cache";
// Issue #802: TTL / capacity are now read from config at call time; these
// constants are the defaults the config falls back to (and match the
// `runtime.result_cache.*` matrix entries).
/// Fallback time-to-live for a cached result, in seconds.
const RESULT_CACHE_TTL_SECS: u64 = 30;
/// Fallback maximum number of cached entries.
const RESULT_CACHE_MAX_ENTRIES: usize = 1000;
/// Config key for the result cache's enabled flag.
const RESULT_CACHE_ENABLED_KEY: &str = "runtime.result_cache.enabled";
/// Config key for the TTL, in seconds.
const RESULT_CACHE_TTL_KEY: &str = "runtime.result_cache.ttl_seconds";
/// Config key for the capacity, in entries.
const RESULT_CACHE_CAPACITY_KEY: &str = "runtime.result_cache.capacity_entries";
/// Eight-byte header that `encode_result_cache_payload` writes first and
/// `decode_result_cache_payload` requires before reading anything else.
const RESULT_CACHE_PAYLOAD_MAGIC: &[u8; 8] = b"RDRC0001";
2363
/// Backend choices for the runtime result cache.
///
/// NOTE(review): the per-variant behaviour is not visible from this
/// definition. `Legacy` presumably corresponds to the `"legacy"`
/// default backend name and `Shadow` to running both backends side by
/// side — confirm against the code that parses
/// `runtime.result_cache.backend`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RuntimeResultCacheBackend {
    Legacy,
    BlobCache,
    Shadow,
}
2370
2371/// Evict oldest entries until `map` fits in `max_entries`. Returns the
2372/// number of entries evicted so callers can bump the eviction metric
2373/// (issue #802).
2374fn trim_result_cache(
2375    map: &mut HashMap<String, RuntimeResultCacheEntry>,
2376    order: &mut std::collections::VecDeque<String>,
2377    max_entries: usize,
2378) -> u64 {
2379    let mut evicted = 0u64;
2380    while map.len() > max_entries {
2381        if let Some(oldest) = order.pop_front() {
2382            if map.remove(&oldest).is_some() {
2383                evicted += 1;
2384            }
2385        } else {
2386            break;
2387        }
2388    }
2389    evicted
2390}
2391
2392fn result_cache_fingerprint(result: &RuntimeQueryResult) -> String {
2393    format!(
2394        "{:?}|{}|{}|{}|{}|{:?}",
2395        result.result,
2396        result.query,
2397        result.statement,
2398        result.engine,
2399        result.affected_rows,
2400        result.statement_type
2401    )
2402}
2403
2404fn mode_to_byte(mode: crate::storage::query::modes::QueryMode) -> u8 {
2405    match mode {
2406        crate::storage::query::modes::QueryMode::Sql => 0,
2407        crate::storage::query::modes::QueryMode::Gremlin => 1,
2408        crate::storage::query::modes::QueryMode::Cypher => 2,
2409        crate::storage::query::modes::QueryMode::Sparql => 3,
2410        crate::storage::query::modes::QueryMode::Path => 4,
2411        crate::storage::query::modes::QueryMode::Natural => 5,
2412        crate::storage::query::modes::QueryMode::Unknown => 255,
2413    }
2414}
2415
2416fn mode_from_byte(byte: u8) -> Option<crate::storage::query::modes::QueryMode> {
2417    match byte {
2418        0 => Some(crate::storage::query::modes::QueryMode::Sql),
2419        1 => Some(crate::storage::query::modes::QueryMode::Gremlin),
2420        2 => Some(crate::storage::query::modes::QueryMode::Cypher),
2421        3 => Some(crate::storage::query::modes::QueryMode::Sparql),
2422        4 => Some(crate::storage::query::modes::QueryMode::Path),
2423        5 => Some(crate::storage::query::modes::QueryMode::Natural),
2424        255 => Some(crate::storage::query::modes::QueryMode::Unknown),
2425        _ => None,
2426    }
2427}
2428
/// Re-intern a statement / engine / statement-type label as its
/// `&'static str` counterpart.
///
/// Returns `None` for any label outside the fixed list below. The
/// payload encoder uses this to refuse results it could not restore,
/// and the decoder uses it to rebuild the `'static` fields.
fn result_cache_static_str(value: &str) -> Option<&'static str> {
    const KNOWN_LABELS: &[&str] = &[
        "select",
        "materialized-graph",
        "runtime-red-schema",
        "runtime-fdw",
        "runtime-table-rls",
        "runtime-table",
        "runtime-join-rls",
        "runtime-join",
        "runtime-vector",
        "runtime-hybrid",
        "runtime-secret",
        "runtime-config",
        "runtime-tenant",
        "runtime-explain",
        "runtime-tree",
        "runtime-kv",
        "runtime-queue",
    ];
    KNOWN_LABELS.iter().copied().find(|label| *label == value)
}
2451
/// Append `value` to `out` as a little-endian `u32`.
///
/// Returns `None`, leaving `out` untouched, when `value` does not fit
/// in 32 bits.
fn write_u32(out: &mut Vec<u8>, value: usize) -> Option<()> {
    match u32::try_from(value) {
        Ok(narrow) => {
            out.extend_from_slice(&narrow.to_le_bytes());
            Some(())
        }
        Err(_) => None,
    }
}
2457
2458fn write_string(out: &mut Vec<u8>, value: &str) -> Option<()> {
2459    write_u32(out, value.len())?;
2460    out.extend_from_slice(value.as_bytes());
2461    Some(())
2462}
2463
2464fn write_bytes(out: &mut Vec<u8>, value: &[u8]) -> Option<()> {
2465    write_u32(out, value.len())?;
2466    out.extend_from_slice(value);
2467    Some(())
2468}
2469
/// Take one byte off the front of `input`, advancing the cursor.
/// Returns `None` (cursor unchanged) when `input` is empty.
fn read_u8(input: &mut &[u8]) -> Option<u8> {
    let bytes: &[u8] = *input;
    let first = *bytes.first()?;
    *input = &bytes[1..];
    Some(first)
}
2475
/// Read a little-endian `u32` from the front of `input`, widened to
/// `usize`, and advance the cursor by four bytes.
/// Returns `None` (cursor unchanged) when fewer than four bytes remain.
fn read_u32(input: &mut &[u8]) -> Option<usize> {
    let bytes: &[u8] = *input;
    let head: [u8; 4] = bytes.get(..4)?.try_into().ok()?;
    *input = &bytes[4..];
    Some(u32::from_le_bytes(head) as usize)
}
2484
/// Read a little-endian `u64` from the front of `input` and advance the
/// cursor by eight bytes.
/// Returns `None` (cursor unchanged) when fewer than eight bytes remain.
fn read_u64(input: &mut &[u8]) -> Option<u64> {
    let bytes: &[u8] = *input;
    let head: [u8; 8] = bytes.get(..8)?.try_into().ok()?;
    *input = &bytes[8..];
    Some(u64::from_le_bytes(head))
}
2493
2494fn read_string(input: &mut &[u8]) -> Option<String> {
2495    let len = read_u32(input)?;
2496    if input.len() < len {
2497        return None;
2498    }
2499    let value = String::from_utf8(input[..len].to_vec()).ok()?;
2500    *input = &input[len..];
2501    Some(value)
2502}
2503
2504fn read_bytes<'a>(input: &mut &'a [u8]) -> Option<&'a [u8]> {
2505    let len = read_u32(input)?;
2506    if input.len() < len {
2507        return None;
2508    }
2509    let value = &input[..len];
2510    *input = &input[len..];
2511    Some(value)
2512}
2513
/// Serialise a result-cache entry into the binary payload understood by
/// `decode_result_cache_payload`.
///
/// Returns `None` when the entry cannot be represented: the result
/// carries pre-serialised JSON, one of `statement` / `engine` /
/// `statement_type` is not a label known to `result_cache_static_str`,
/// any record holds nodes, edges, paths or vector results, or any
/// length does not fit in a `u32`.
///
/// Layout, in write order (all lengths and counts are little-endian
/// `u32`, strings are length-prefixed UTF-8):
/// magic · query · mode byte · statement · engine · affected_rows ·
/// statement_type · column count + columns · four scan/timing stats ·
/// record count, then per record a field count followed by
/// (name, length-prefixed encoded value) pairs · scope count + scopes.
fn encode_result_cache_payload(entry: &RuntimeResultCacheEntry) -> Option<Vec<u8>> {
    let result = &entry.result;
    // Refuse anything the decoder could not rebuild faithfully.
    if result.result.pre_serialized_json.is_some()
        || result_cache_static_str(result.statement).is_none()
        || result_cache_static_str(result.engine).is_none()
        || result_cache_static_str(result.statement_type).is_none()
        || result.result.records.iter().any(|record| {
            !record.nodes.is_empty()
                || !record.edges.is_empty()
                || !record.paths.is_empty()
                || !record.vector_results.is_empty()
        })
    {
        return None;
    }

    // Header: magic plus the statement-level metadata.
    let mut out = Vec::new();
    out.extend_from_slice(RESULT_CACHE_PAYLOAD_MAGIC);
    write_string(&mut out, &result.query)?;
    out.push(mode_to_byte(result.mode));
    write_string(&mut out, result.statement)?;
    write_string(&mut out, result.engine)?;
    out.extend_from_slice(&result.affected_rows.to_le_bytes());
    write_string(&mut out, result.statement_type)?;

    // Column names, then the stats counters (the decoder reads each
    // counter back as a `u64`).
    write_u32(&mut out, result.result.columns.len())?;
    for column in &result.result.columns {
        write_string(&mut out, column)?;
    }
    out.extend_from_slice(&result.result.stats.nodes_scanned.to_le_bytes());
    out.extend_from_slice(&result.result.stats.edges_scanned.to_le_bytes());
    out.extend_from_slice(&result.result.stats.rows_scanned.to_le_bytes());
    out.extend_from_slice(&result.result.stats.exec_time_us.to_le_bytes());

    // Records: each field value is encoded on its own and length-prefixed
    // so the decoder can check that the value codec consumed it fully.
    write_u32(&mut out, result.result.records.len())?;
    for record in &result.result.records {
        let fields = record.iter_fields().collect::<Vec<_>>();
        write_u32(&mut out, fields.len())?;
        for (name, value) in fields {
            write_string(&mut out, name)?;
            let mut encoded = Vec::new();
            crate::storage::schema::value_codec::encode(value, &mut encoded);
            write_bytes(&mut out, &encoded)?;
        }
    }

    // Trailer: the entry's scopes.
    write_u32(&mut out, entry.scopes.len())?;
    for scope in &entry.scopes {
        write_string(&mut out, scope)?;
    }
    Some(out)
}
2566
/// Parse a payload produced by `encode_result_cache_payload` back into
/// the query result and its scope set.
///
/// Returns `None` on any malformed input: wrong or missing magic,
/// truncated data, an unknown mode byte, a statement / engine /
/// statement-type label outside `result_cache_static_str`'s list, a
/// field value the value codec rejects or does not consume completely,
/// or trailing bytes after the scopes.
///
/// The rebuilt result always has `pre_serialized_json` and `bookmark`
/// set to `None`.
fn decode_result_cache_payload(mut input: &[u8]) -> Option<(RuntimeQueryResult, HashSet<String>)> {
    // Header check, then step the cursor past the magic.
    if input.len() < RESULT_CACHE_PAYLOAD_MAGIC.len()
        || &input[..RESULT_CACHE_PAYLOAD_MAGIC.len()] != RESULT_CACHE_PAYLOAD_MAGIC
    {
        return None;
    }
    input = &input[RESULT_CACHE_PAYLOAD_MAGIC.len()..];

    // Statement-level metadata, in the order the encoder wrote it.
    let query = read_string(&mut input)?;
    let mode = mode_from_byte(read_u8(&mut input)?)?;
    let statement = result_cache_static_str(&read_string(&mut input)?)?;
    let engine = result_cache_static_str(&read_string(&mut input)?)?;
    let affected_rows = read_u64(&mut input)?;
    let statement_type = result_cache_static_str(&read_string(&mut input)?)?;

    // No up-front reservation from the untrusted count: the vector only
    // grows as strings are actually read.
    let mut columns = Vec::new();
    for _ in 0..read_u32(&mut input)? {
        columns.push(read_string(&mut input)?);
    }
    let stats = crate::storage::query::unified::QueryStats {
        nodes_scanned: read_u64(&mut input)?,
        edges_scanned: read_u64(&mut input)?,
        rows_scanned: read_u64(&mut input)?,
        exec_time_us: read_u64(&mut input)?,
    };

    let mut records = Vec::new();
    for _ in 0..read_u32(&mut input)? {
        let mut record = crate::storage::query::unified::UnifiedRecord::new();
        for _ in 0..read_u32(&mut input)? {
            let name = read_string(&mut input)?;
            let bytes = read_bytes(&mut input)?;
            let (value, used) = crate::storage::schema::value_codec::decode(bytes).ok()?;
            // The value must account for every byte of its slot.
            if used != bytes.len() {
                return None;
            }
            record.set_owned(name, value);
        }
        records.push(record);
    }

    let mut scopes = HashSet::new();
    for _ in 0..read_u32(&mut input)? {
        scopes.insert(read_string(&mut input)?);
    }
    // Anything left over means the payload is not what the encoder wrote.
    if !input.is_empty() {
        return None;
    }

    Some((
        RuntimeQueryResult {
            query,
            mode,
            statement,
            engine,
            result: crate::storage::query::unified::UnifiedResult {
                columns,
                records,
                stats,
                pre_serialized_json: None,
            },
            affected_rows,
            statement_type,
            bookmark: None,
        },
        scopes,
    ))
}
2635
/// If `sql` starts with the keyword `EXPLAIN` (ASCII case-insensitive)
/// followed by a statement, return that inner statement with leading
/// whitespace removed; otherwise return `None`.
///
/// `None` is also returned when the token after `EXPLAIN` is `ALTER`
/// or `ASK`, so both are left to the normal SQL parser:
/// - `EXPLAIN ALTER FOR CREATE TABLE ...` is a separate schema-diff
///   command handled inside that parser.
/// - `EXPLAIN ASK` is an executable read path: it runs retrieval and
///   provider selection, then short-circuits before the LLM call.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    let mut pieces = sql.trim_start().splitn(2, char::is_whitespace);

    let keyword = pieces.next()?;
    if !keyword.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    // Absent when `EXPLAIN` is the whole input; blank when only
    // whitespace follows. Both mean there is nothing to explain.
    let inner = pieces.next()?.trim_start();
    let next_token = inner.split_whitespace().next()?;

    let deferred =
        next_token.eq_ignore_ascii_case("ALTER") || next_token.eq_ignore_ascii_case("ASK");
    if deferred {
        None
    } else {
        Some(inner)
    }
}
2674
2675/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
2676/// CTE-aware parse in `execute_query` without paying for a full
2677/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
2678/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
2679pub(super) fn has_with_prefix(sql: &str) -> bool {
2680    let trimmed = sql.trim_start();
2681    let head_end = trimmed
2682        .find(|c: char| c.is_whitespace() || c == '(')
2683        .unwrap_or(trimmed.len());
2684    trimmed[..head_end].eq_ignore_ascii_case("WITH")
2685}
2686
2687/// If the query is a plain SELECT whose top-level `TableQuery`
2688/// carries an `AS OF` clause, return a typed spec that the runtime
2689/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
2690/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
2691/// back to the connection's regular MVCC snapshot. A cheap textual
2692/// prefilter skips the parse entirely when the source doesn't
2693/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
2694fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
2695    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
2696}
2697
2698/// Same as `peek_top_level_as_of` but also returns the table name
2699/// targeted by the AS OF clause (when the FROM clause names a
2700/// concrete table). `None` for the table slot means scalar SELECT
2701/// or a subquery source — callers treat those as "no enforcement".
2702pub(super) fn peek_top_level_as_of_with_table(
2703    sql: &str,
2704) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
2705    if !sql
2706        .as_bytes()
2707        .windows(5)
2708        .any(|w| w.eq_ignore_ascii_case(b"as of"))
2709    {
2710        return None;
2711    }
2712    let parsed = crate::storage::query::parser::parse(sql).ok()?;
2713    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
2714        return None;
2715    };
2716    let clause = table.as_of?;
2717    let table_name = if table.table.is_empty() || table.table == "any" {
2718        None
2719    } else {
2720        Some(table.table.clone())
2721    };
2722    let spec = match clause {
2723        crate::storage::query::ast::AsOfClause::Commit(h) => {
2724            crate::application::vcs::AsOfSpec::Commit(h)
2725        }
2726        crate::storage::query::ast::AsOfClause::Branch(b) => {
2727            crate::application::vcs::AsOfSpec::Branch(b)
2728        }
2729        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
2730        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
2731            crate::application::vcs::AsOfSpec::TimestampMs(ts)
2732        }
2733        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
2734            crate::application::vcs::AsOfSpec::Snapshot(x)
2735        }
2736    };
2737    Some((spec, table_name))
2738}
2739
2740pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
2741    // Lowercase the bytes up to the first null/newline into a small
2742    // stack buffer for cheap contains() checks. Most SQL fits in the
2743    // buffer; longer queries fall back to owned lowercase.
2744    const VOLATILE_TOKENS: &[&str] = &[
2745        "pg_advisory_lock",
2746        "pg_try_advisory_lock",
2747        "pg_advisory_unlock",
2748        "random()",
2749        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
2750        // omitted for now — they ARE volatile but today's tests rely
2751        // on caching them. Revisit once a tighter volatility story
2752        // lands.
2753    ];
2754    let lowered = sql.to_ascii_lowercase();
2755    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
2756}
2757
2758pub(super) fn query_is_ask_statement(sql: &str) -> bool {
2759    let trimmed = sql.trim_start();
2760    let head_end = trimmed
2761        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
2762        .unwrap_or(trimmed.len());
2763    trimmed[..head_end].eq_ignore_ascii_case("ASK")
2764}
2765
/// Pick the `(global_mode, collection_mode)` pair for an expression,
/// or `None` for variants that opt out of intent-locking entirely
/// (admin statements like `SHOW CONFIG`, transaction control, tenant
/// toggles).
///
/// Phase-1 contract:
/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
///
/// The first element of the returned tuple is the mode for the global
/// resource, the second the mode for each collection resource.
pub(super) fn intent_lock_modes_for(
    expr: &QueryExpr,
) -> Option<(
    crate::storage::transaction::lock::LockMode,
    crate::storage::transaction::lock::LockMode,
)> {
    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};

    match expr {
        // Reads — IS / IS.
        QueryExpr::Table(_)
        | QueryExpr::Join(_)
        | QueryExpr::Vector(_)
        | QueryExpr::Hybrid(_)
        | QueryExpr::Graph(_)
        | QueryExpr::Path(_)
        | QueryExpr::Ask(_)
        | QueryExpr::SearchCommand(_)
        | QueryExpr::GraphCommand(_)
        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),

        // Writes — IX / IX. Non-tabular mutations (vector insert,
        // graph node insert, queue push, timeseries point insert)
        // don't carry their own dispatch arm here; they ride through
        // the Insert variant or a command variant covered by the
        // read-side arm above. P1.T4 expands only the TableQuery-ish
        // writes; non-tabular kinds inherit when their DML variants
        // land in later phases.
        QueryExpr::Insert(_)
        | QueryExpr::Update(_)
        | QueryExpr::Delete(_)
        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
            Some((IntentExclusive, IntentExclusive))
        }
        // Every other queue command takes the read-side modes. This arm
        // must stay below the `Move` pattern above, which it would
        // otherwise shadow.
        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),

        // DDL — IX / X. A DDL against collection `c` blocks all
        // other writers + readers on `c` but leaves other collections
        // running (because Global stays IX, not X).
        QueryExpr::CreateTable(_)
        | QueryExpr::CreateCollection(_)
        | QueryExpr::CreateVector(_)
        | QueryExpr::DropTable(_)
        | QueryExpr::DropGraph(_)
        | QueryExpr::DropVector(_)
        | QueryExpr::DropDocument(_)
        | QueryExpr::DropKv(_)
        | QueryExpr::DropCollection(_)
        | QueryExpr::Truncate(_)
        | QueryExpr::AlterTable(_)
        | QueryExpr::CreateIndex(_)
        | QueryExpr::DropIndex(_)
        | QueryExpr::CreateTimeSeries(_)
        | QueryExpr::CreateMetric(_)
        | QueryExpr::AlterMetric(_)
        | QueryExpr::CreateSlo(_)
        | QueryExpr::DropTimeSeries(_)
        | QueryExpr::CreateQueue(_)
        | QueryExpr::AlterQueue(_)
        | QueryExpr::DropQueue(_)
        | QueryExpr::CreateTree(_)
        | QueryExpr::DropTree(_)
        | QueryExpr::CreatePolicy(_)
        | QueryExpr::DropPolicy(_)
        | QueryExpr::CreateView(_)
        | QueryExpr::DropView(_)
        | QueryExpr::RefreshMaterializedView(_)
        | QueryExpr::CreateSchema(_)
        | QueryExpr::DropSchema(_)
        | QueryExpr::CreateSequence(_)
        | QueryExpr::DropSequence(_)
        | QueryExpr::CreateServer(_)
        | QueryExpr::DropServer(_)
        | QueryExpr::CreateForeignTable(_)
        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),

        // Admin / control — skip intent locks. `SET TENANT`,
        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
        // `VACUUM`, etc. don't touch collection data the same way
        // and the existing transaction layer already serialises the
        // pieces that matter.
        _ => None,
    }
}
2859
2860/// Best-effort collection inventory for an expression. Used to pick
2861/// `Collection(...)` resources for the intent-lock guard. Overshoots
2862/// are fine (take an extra IS, benign); undershoots leak writes past
2863/// DDL X locks, so err on the side of listing more names.
2864pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
2865    let mut out = Vec::new();
2866    walk_collections(expr, &mut out);
2867    out.sort();
2868    out.dedup();
2869    out
2870}
2871
/// Append to `out` the collection names that `expr` references.
/// Joins are walked recursively on both sides; every other handled
/// variant contributes its target name(s). Duplicates are not removed
/// here.
///
/// NOTE(review): `intent_lock_modes_for` classifies `CreateTree`,
/// `DropTree`, `DropPolicy`, `CreateSchema`, `DropSchema`,
/// `CreateSequence`, `DropSequence`, `CreateServer`, `DropServer`,
/// `CreateForeignTable` and `DropForeignTable` as DDL (collection mode
/// X), but none of them has an arm below, so they fall through to the
/// catch-all and contribute no collection name. The same holds for
/// queue commands other than `Move`. Confirm whether those statements
/// are meant to run under the global lock only.
fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
    match expr {
        QueryExpr::Table(t) => out.push(t.table.clone()),
        QueryExpr::Join(j) => {
            walk_collections(&j.left, out);
            walk_collections(&j.right, out);
        }
        QueryExpr::Insert(i) => out.push(i.table.clone()),
        QueryExpr::Update(u) => out.push(u.table.clone()),
        QueryExpr::Delete(d) => out.push(d.table.clone()),
        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),

        // DDL — include the target collection so DDL takes
        // `(Collection, X)` and blocks concurrent readers / writers
        // on the same collection. Other collections stay live
        // because Global is still IX.
        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
        QueryExpr::DropTable(q) => out.push(q.name.clone()),
        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
        QueryExpr::DropVector(q) => out.push(q.name.clone()),
        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
        QueryExpr::DropKv(q) => out.push(q.name.clone()),
        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
        QueryExpr::Truncate(q) => out.push(q.name.clone()),
        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
        // Index DDL locks the indexed table, not the index itself.
        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
        // Metric / SLO statements are keyed by their `path` field.
        QueryExpr::CreateMetric(q) => out.push(q.path.clone()),
        QueryExpr::AlterMetric(q) => out.push(q.path.clone()),
        QueryExpr::CreateSlo(q) => out.push(q.path.clone()),
        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
        // A queue move touches both ends.
        QueryExpr::QueueCommand(QueueCommand::Move {
            source,
            destination,
            ..
        }) => {
            out.push(source.clone());
            out.push(destination.clone());
        }
        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
        QueryExpr::CreateView(q) => out.push(q.name.clone()),
        QueryExpr::DropView(q) => out.push(q.name.clone()),
        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),

        // Vector / Hybrid / Graph / Path / commands reference
        // collections through fields whose shape varies; without a
        // uniform accessor we fall back to the global lock only —
        // benign because every runtime path still holds the global
        // mode.
        _ => {}
    }
}
2930
2931impl RedDBRuntime {
    /// Build a runtime from `RedDBOptions::in_memory()`; shorthand for
    /// passing those options to `with_options`.
    pub fn in_memory() -> RedDBResult<Self> {
        Self::with_options(RedDBOptions::in_memory())
    }
2935
    /// Handle to the intent-lock manager for tests + introspection.
    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
    /// rather than touching the manager directly.
    ///
    /// Returns a clone of the shared `Arc`, not a new manager.
    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
        self.inner.lock_manager.clone()
    }
2942
    /// Process-local governance registry for managed policy/config guardrails.
    ///
    /// Returns a clone of the shared `Arc` held by the runtime.
    pub fn config_registry(&self) -> std::sync::Arc<crate::auth::registry::ConfigRegistry> {
        self.inner.config_registry.clone()
    }
2947
    /// Shared handle to the runtime's query audit stream (a clone of the
    /// `Arc` held by the runtime).
    pub fn query_audit(&self) -> std::sync::Arc<crate::runtime::query_audit::QueryAuditStream> {
        self.inner.query_audit.clone()
    }
2951
    /// Whether the runtime's control-event config requires persistence,
    /// as reported by `ControlEventConfig::require_persistence`.
    pub fn control_events_require_persistence(&self) -> bool {
        self.inner.control_event_config.require_persistence()
    }
2955
    /// The runtime's control-event configuration, returned by value.
    pub fn control_event_config(&self) -> crate::runtime::control_events::ControlEventConfig {
        self.inner.control_event_config
    }
2959
    /// The control-event ledger currently installed on the runtime.
    ///
    /// Takes the read lock just long enough to clone the `Arc`, so the
    /// returned handle stays valid even if the ledger is replaced later.
    pub fn control_event_ledger(
        &self,
    ) -> Arc<dyn crate::runtime::control_events::ControlEventLedger> {
        self.inner.control_event_ledger.read().clone()
    }
2965
    /// Test hook: install `ledger` as the runtime's control-event ledger,
    /// replacing the current one under the write lock. Handles obtained
    /// earlier from `control_event_ledger` keep pointing at the old
    /// ledger.
    #[doc(hidden)]
    pub fn replace_control_event_ledger_for_tests(
        &self,
        ledger: Arc<dyn crate::runtime::control_events::ControlEventLedger>,
    ) {
        *self.inner.control_event_ledger.write() = ledger;
    }
2973
    /// Build a runtime from `options` using the default connection-pool
    /// configuration; delegates to `with_pool`.
    #[inline(never)]
    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
        Self::with_pool(options, ConnectionPoolConfig::default())
    }
2978
2979    pub fn with_pool(
2980        options: RedDBOptions,
2981        pool_config: ConnectionPoolConfig,
2982    ) -> RedDBResult<Self> {
2983        // PLAN.md Phase 9.1 — capture wall-clock before storage
2984        // open so the cold-start phase markers can be backfilled
2985        // once Lifecycle is constructed below. Storage open
2986        // encapsulates auto-restore + WAL replay; we treat the
2987        // whole window as one combined "restore" + "wal_replay"
2988        // phase split at the same boundary because the storage
2989        // layer doesn't yet emit a finer signal.
2990        let boot_open_start_ms = std::time::SystemTime::now()
2991            .duration_since(std::time::UNIX_EPOCH)
2992            .map(|d| d.as_millis() as u64)
2993            .unwrap_or(0);
2994        let db = Arc::new(
2995            RedDB::open_with_options(&options)
2996                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2997        );
2998        let result_blob_cache = crate::storage::cache::BlobCache::open_with_l2(
2999            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
3000                options
3001                    .resolved_path("data.rdb")
3002                    .with_extension("result-cache.l2"),
3003            ),
3004        )
3005        .map_err(|err| {
3006            RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}"))
3007        })?;
3008        let storage_ready_ms = std::time::SystemTime::now()
3009            .duration_since(std::time::UNIX_EPOCH)
3010            .map(|d| d.as_millis() as u64)
3011            .unwrap_or(0);
3012
3013        let runtime = Self {
3014            inner: Arc::new(RuntimeInner {
3015                db: db.clone(),
3016                layout: PhysicalLayout::from_options(&options),
3017                indices: IndexCatalog::register_default_vector_graph(
3018                    options.has_capability(crate::api::Capability::Table),
3019                    options.has_capability(crate::api::Capability::Graph),
3020                ),
3021                pool_config,
3022                pool: Mutex::new(PoolState::default()),
3023                started_at_unix_ms: SystemTime::now()
3024                    .duration_since(UNIX_EPOCH)
3025                    .unwrap_or_default()
3026                    .as_millis(),
3027                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
3028                index_store: super::index_store::IndexStore::new(),
3029                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
3030                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
3031                query_cache: parking_lot::RwLock::new(
3032                    crate::storage::query::planner::cache::PlanCache::new(1000),
3033                ),
3034                result_cache: parking_lot::RwLock::new((
3035                    HashMap::new(),
3036                    std::collections::VecDeque::new(),
3037                )),
3038                result_blob_cache,
3039                result_blob_entries: parking_lot::RwLock::new((
3040                    HashMap::new(),
3041                    std::collections::VecDeque::new(),
3042                )),
3043                ask_answer_cache_entries: parking_lot::RwLock::new((
3044                    HashSet::new(),
3045                    std::collections::VecDeque::new(),
3046                )),
3047                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
3048                result_cache_hits: std::sync::atomic::AtomicU64::new(0),
3049                result_cache_misses: std::sync::atomic::AtomicU64::new(0),
3050                result_cache_evictions: std::sync::atomic::AtomicU64::new(0),
3051                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
3052                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
3053                rmw_locks: RmwLockTable::new(),
3054                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
3055                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
3056                config_registry: Arc::new(crate::auth::registry::ConfigRegistry::new()),
3057                ec_worker: crate::ec::worker::EcWorker::new(),
3058                auth_store: parking_lot::RwLock::new(None),
3059                oauth_validator: parking_lot::RwLock::new(None),
3060                views: parking_lot::RwLock::new(HashMap::new()),
3061                materialized_views: parking_lot::RwLock::new(
3062                    crate::storage::cache::result::MaterializedViewCache::new(),
3063                ),
3064                retention_sweeper: parking_lot::RwLock::new(
3065                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
3066                ),
3067                snapshot_manager: Arc::new(
3068                    crate::storage::transaction::snapshot::SnapshotManager::new(),
3069                ),
3070                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
3071                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
3072                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
3073                lock_manager: Arc::new({
3074                    // Sourced from the matrix: Tier B key
3075                    // `concurrency.locking.deadlock_timeout_ms`
3076                    // (default 5000). Env var wins at boot so
3077                    // operators can tune without touching red_config.
3078                    let env = crate::runtime::config_overlay::collect_env_overrides();
3079                    let timeout_ms = env
3080                        .get("concurrency.locking.deadlock_timeout_ms")
3081                        .and_then(|raw| raw.parse::<u64>().ok())
3082                        .unwrap_or_else(|| {
3083                            match crate::runtime::config_matrix::default_for(
3084                                "concurrency.locking.deadlock_timeout_ms",
3085                            ) {
3086                                Some(crate::serde_json::Value::Number(n)) => n as u64,
3087                                _ => 5000,
3088                            }
3089                        });
3090                    let cfg = crate::storage::transaction::lock::LockConfig {
3091                        default_timeout: std::time::Duration::from_millis(timeout_ms),
3092                        ..Default::default()
3093                    };
3094                    crate::storage::transaction::lock::LockManager::new(cfg)
3095                }),
3096                rls_policies: parking_lot::RwLock::new(HashMap::new()),
3097                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
3098                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
3099                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
3100                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
3101                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
3102                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
3103                queue_wait_registry: std::sync::Arc::new(
3104                    crate::runtime::queue_wait_registry::QueueWaitRegistry::new(),
3105                ),
3106                pending_queue_wakes: parking_lot::RwLock::new(HashMap::new()),
3107                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
3108                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
3109                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
3110                    &options,
3111                )),
3112                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
3113                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
3114                audit_log: {
3115                    // Default audit-log path for the in-memory case
3116                    // sits in the system temp dir; persistent runs
3117                    // place it next to data.rdb.
3118                    //
3119                    // gh-471 iter 2: route through the resolved
3120                    // `LogDestination`. Performance/Max tiers emit a
3121                    // `File(...)` under `<dbname>.rdb.red/logs/`;
3122                    // lower tiers / ephemeral runs report `Stderr`
3123                    // and we keep the legacy file-next-to-data sink.
3124                    let data_path = options
3125                        .data_path
3126                        .clone()
3127                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
3128                    let (audit_dest, _) = crate::api::tier_wiring::current_log_destinations();
3129                    Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
3130                        &audit_dest,
3131                        &data_path,
3132                    ))
3133                },
3134                control_event_ledger: parking_lot::RwLock::new(Arc::new(
3135                    crate::runtime::control_events::RuntimeLedger::new(db.store()),
3136                )),
3137                control_event_config: options.control_events,
3138                query_audit: Arc::new(crate::runtime::query_audit::QueryAuditStream::new(
3139                    db.store(),
3140                    options.query_audit.clone(),
3141                )),
3142                lease_lifecycle: std::sync::OnceLock::new(),
3143                replica_apply_metrics: std::sync::Arc::new(
3144                    crate::replication::logical::ReplicaApplyMetrics::default(),
3145                ),
3146                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
3147                schema_vocabulary: parking_lot::RwLock::new(
3148                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
3149                ),
3150                slow_query_logger: {
3151                    // Issue #205 — slow-query sink lives in the same
3152                    // directory the audit log uses, so backup/restore
3153                    // ships them together. Threshold + sample-pct
3154                    // default conservatively (1 s, 100% sampling) so
3155                    // emitted lines are rare and complete. Operators
3156                    // tune via env / config matrix in a follow-up.
3157                    //
3158                    // gh-471 iter 2: same routing as the audit log —
3159                    // `LogDestination::File(...)` for Performance/Max
3160                    // lands under `<dbname>.rdb.red/logs/slow.log`;
3161                    // lower tiers fall back to `red-slow.log` in the
3162                    // data directory.
3163                    let fallback_dir = options
3164                        .data_path
3165                        .as_ref()
3166                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
3167                        .unwrap_or_else(|| std::env::temp_dir().join("reddb"));
3168                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
3169                        .ok()
3170                        .and_then(|s| s.parse::<u64>().ok())
3171                        .unwrap_or(1000);
3172                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
3173                        .ok()
3174                        .and_then(|s| s.parse::<u8>().ok())
3175                        .unwrap_or(100);
3176                    let (_, slow_dest) = crate::api::tier_wiring::current_log_destinations();
3177                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
3178                        &slow_dest,
3179                        &fallback_dir,
3180                        threshold_ms,
3181                        sample_pct,
3182                    )
3183                },
3184                kv_stats: crate::runtime::KvStatsCounters::default(),
3185                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
3186                metrics_tenant_activity_stats:
3187                    crate::runtime::MetricsTenantActivityCounters::default(),
3188                queue_telemetry: Arc::new(
3189                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
3190                ),
3191                queue_presence: Arc::new(
3192                    crate::storage::queue::presence::ConsumerPresenceRegistry::new(),
3193                ),
3194                vector_introspection: Arc::new(
3195                    crate::storage::vector::introspection::VectorIntrospectionRegistry::new(),
3196                ),
3197                kv_tag_index: crate::runtime::KvTagIndex::default(),
3198                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
3199                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
3200                integrity_tombstones: parking_lot::Mutex::new(Vec::new()),
3201                integrity_tombstones_state: std::sync::atomic::AtomicU8::new(0),
3202            }),
3203        };
3204
3205        // Issue #205 — install the process-wide OperatorEvent sink so
3206        // emit sites buried in storage / replication / signal handlers
3207        // can record without threading an `&AuditLogger` through every
3208        // call stack. First registration wins; subsequent in-memory
3209        // runtimes (test harnesses) fall through to tracing+eprintln.
3210        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
3211            &runtime.inner.audit_log,
3212        ));
3213
3214        // PLAN.md Phase 9.1 — backfill cold-start phase markers
3215        // from the wall-clock captured before storage open. The
3216        // entire `RedDB::open_with_options` call covers both
3217        // auto-restore (when configured) and WAL replay. We
3218        // record both phases against the same boundary today;
3219        // a follow-up will split them once the storage layer
3220        // surfaces a finer-grained event.
3221        runtime
3222            .inner
3223            .lifecycle
3224            .set_restore_started_at_ms(boot_open_start_ms);
3225        runtime
3226            .inner
3227            .lifecycle
3228            .set_restore_ready_at_ms(storage_ready_ms);
3229        runtime
3230            .inner
3231            .lifecycle
3232            .set_wal_replay_started_at_ms(boot_open_start_ms);
3233        runtime
3234            .inner
3235            .lifecycle
3236            .set_wal_replay_ready_at_ms(storage_ready_ms);
3237
3238        let restored_cdc_lsn = runtime
3239            .inner
3240            .db
3241            .replication
3242            .as_ref()
3243            .map(|repl| {
3244                repl.logical_wal_spool
3245                    .as_ref()
3246                    .map(|spool| spool.current_lsn())
3247                    .unwrap_or(0)
3248            })
3249            .unwrap_or(0)
3250            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
3251        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
3252        runtime.rehydrate_snapshot_xid_floor();
3253        runtime.bootstrap_system_keyed_collections()?;
3254        runtime.rehydrate_declared_column_schemas();
3255        runtime.load_probabilistic_state()?;
3256
3257        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
3258        // tables declared via `TENANT BY (col)` survive restart. Each
3259        // entry re-registers the auto-policy and flips RLS on again.
3260        runtime.rehydrate_tenant_tables();
3261        // Issue #593 slice 9a — replay persisted materialized-view
3262        // descriptors so `CREATE MATERIALIZED VIEW v AS …` survives a
3263        // restart. Runs after the system-keyed collections bootstrap
3264        // and before the API opens.
3265        runtime.rehydrate_materialized_view_descriptors();
3266        if let Some(repl) = &runtime.inner.db.replication {
3267            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
3268        }
3269
3270        // Save system info to red_config on boot
3271        {
3272            let sys = SystemInfo::collect();
3273            runtime.inner.db.store().set_config_tree(
3274                "red.system",
3275                &crate::serde_json::json!({
3276                    "pid": sys.pid,
3277                    "cpu_cores": sys.cpu_cores,
3278                    "total_memory_bytes": sys.total_memory_bytes,
3279                    "available_memory_bytes": sys.available_memory_bytes,
3280                    "os": sys.os,
3281                    "arch": sys.arch,
3282                    "hostname": sys.hostname,
3283                    "started_at": SystemTime::now()
3284                        .duration_since(UNIX_EPOCH)
3285                        .unwrap_or_default()
3286                        .as_millis() as u64
3287                }),
3288            );
3289
3290            // Seed defaults on first boot (only if red_config is empty or missing defaults)
3291            let store = runtime.inner.db.store();
3292            if store
3293                .get_collection("red_config")
3294                .map(|m| m.query_all(|_| true).len())
3295                .unwrap_or(0)
3296                <= 10
3297            {
3298                store.set_config_tree("red.ai", &crate::json!({
3299                    "default": crate::json!({
3300                        "provider": "openai",
3301                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
3302                    }),
3303                    "max_embedding_inputs": 256,
3304                    "max_prompt_batch": 256,
3305                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
3306                }));
3307                store.set_config_tree(
3308                    "red.server",
3309                    &crate::json!({
3310                        "max_scan_limit": 1000,
3311                        "max_body_size": 1048576,
3312                        "read_timeout_ms": 5000,
3313                        "write_timeout_ms": 5000
3314                    }),
3315                );
3316                store.set_config_tree(
3317                    "red.storage",
3318                    &crate::json!({
3319                        "page_size": 4096,
3320                        "page_cache_capacity": 100000,
3321                        "auto_checkpoint_pages": 1000,
3322                        "snapshot_retention": 16,
3323                        "verify_checksums": true,
3324                        "segment": crate::json!({
3325                            "max_entities": 100000,
3326                            "max_bytes": 268435456_u64,
3327                            "compression_level": 6
3328                        }),
3329                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
3330                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
3331                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
3332                    }),
3333                );
3334                store.set_config_tree(
3335                    "red.search",
3336                    &crate::json!({
3337                        "rag": crate::json!({
3338                            "max_chunks_per_source": 10,
3339                            "max_total_chunks": 25,
3340                            "similarity_threshold": 0.8,
3341                            "graph_depth": 2,
3342                            "min_relevance": 0.3
3343                        }),
3344                        "fusion": crate::json!({
3345                            "vector_weight": 0.5,
3346                            "graph_weight": 0.3,
3347                            "table_weight": 0.2,
3348                            "dedup_threshold": 0.85
3349                        })
3350                    }),
3351                );
3352                store.set_config_tree(
3353                    "red.auth",
3354                    &crate::json!({
3355                        "enabled": false,
3356                        "session_ttl_secs": 3600,
3357                        "require_auth": false
3358                    }),
3359                );
3360                store.set_config_tree(
3361                    "red.query",
3362                    &crate::json!({
3363                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
3364                        "max_recursion_depth": 1000
3365                    }),
3366                );
3367                store.set_config_tree(
3368                    "red.indexes",
3369                    &crate::json!({
3370                        "auto_select": true,
3371                        "bloom_filter": crate::json!({
3372                            "enabled": true,
3373                            "false_positive_rate": 0.01,
3374                            "prune_on_scan": true
3375                        }),
3376                        "hash": crate::json!({ "enabled": true }),
3377                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
3378                        "spatial": crate::json!({ "enabled": true })
3379                    }),
3380                );
3381                store.set_config_tree(
3382                    "red.memtable",
3383                    &crate::json!({
3384                        "enabled": true,
3385                        "max_bytes": 67108864_u64,
3386                        "flush_threshold": 0.75
3387                    }),
3388                );
3389                store.set_config_tree(
3390                    "red.probabilistic",
3391                    &crate::json!({
3392                        "hll_registers": 16384,
3393                        "sketch_default_width": 1000,
3394                        "sketch_default_depth": 5,
3395                        "filter_default_capacity": 100000
3396                    }),
3397                );
3398                store.set_config_tree(
3399                    "red.timeseries",
3400                    &crate::json!({
3401                        "default_chunk_size": 1024,
3402                        "compression": crate::json!({
3403                            "timestamps": "delta_of_delta",
3404                            "values": "gorilla_xor"
3405                        }),
3406                        "default_retention_days": 0
3407                    }),
3408                );
3409                store.set_config_tree(
3410                    "red.queue",
3411                    &crate::json!({
3412                        "default_max_size": 0,
3413                        "default_max_attempts": 3,
3414                        "visibility_timeout_ms": 30000,
3415                        "consumer_idle_timeout_ms": 60000
3416                    }),
3417                );
3418                store.set_config_tree(
3419                    "red.backup",
3420                    &crate::json!({
3421                        "enabled": false,
3422                        "interval_secs": 3600,
3423                        "retention_count": 24,
3424                        "upload": false,
3425                        "backend": "local"
3426                    }),
3427                );
3428                store.set_config_tree(
3429                    "red.wal",
3430                    &crate::json!({
3431                        "archive": crate::json!({
3432                            "enabled": false,
3433                            "retention_hours": 168,
3434                            "prefix": "wal/"
3435                        })
3436                    }),
3437                );
3438                store.set_config_tree(
3439                    "red.cdc",
3440                    &crate::json!({
3441                        "enabled": true,
3442                        "buffer_size": 100000
3443                    }),
3444                );
3445                store.set_config_tree(
3446                    "red.config.secret",
3447                    &crate::json!({
3448                        "auto_encrypt": true,
3449                        "auto_decrypt": true
3450                    }),
3451                );
3452            }
3453
3454            // Perf-parity config matrix: heal the Tier A (critical)
3455            // keys unconditionally on every boot. Idempotent — only
3456            // writes the default when the key is missing. Keeps
3457            // `SHOW CONFIG` showing every guarantee the operator has
3458            // (durability.mode, concurrency.locking.enabled, …) even
3459            // on long-running datadirs that predate the matrix.
3460            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
3461
3462            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
3463            // `storage.btree.lehman_yao` value from the matrix (env
3464            // > file > red_config > default) and publish it to the
3465            // storage layer's atomic so the B-tree read / split
3466            // paths can branch without re-reading the config on
3467            // every hot-path call.
3468            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
3469            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
3470            if lehman_yao {
3471                tracing::info!(
3472                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
3473                );
3474            }
3475
3476            // Config file overlay — mounted `/etc/reddb/config.json`
3477            // (override path via REDDB_CONFIG_FILE). Writes keys with
3478            // write-if-absent semantics so a later user `SET CONFIG`
3479            // always wins. Missing file = silent no-op.
3480            let overlay_path = crate::runtime::config_overlay::config_file_path();
3481            let _ =
3482                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
3483        }
3484
3485        // VCS ("Git for Data") — create the `red_*` metadata
3486        // collections on first boot. Idempotent: `get_or_create_collection`
3487        // is a no-op if the collection already exists.
3488        {
3489            let store = runtime.inner.db.store();
3490            for name in crate::application::vcs_collections::ALL {
3491                let _ = store.get_or_create_collection(*name);
3492            }
3493            // Seed VCS config namespace with sensible defaults on first
3494            // boot, matching the pattern used by red.ai / red.storage.
3495            store.set_config_tree(
3496                crate::application::vcs_collections::CONFIG_NAMESPACE,
3497                &crate::json!({
3498                    "default_branch": "main",
3499                    "author": crate::json!({
3500                        "name": "reddb",
3501                        "email": "reddb@localhost"
3502                    }),
3503                    "protected_branches": crate::json!(["main"]),
3504                    "closure": crate::json!({
3505                        "enabled": true,
3506                        "lazy": true
3507                    }),
3508                    "merge": crate::json!({
3509                        "default_strategy": "auto",
3510                        "fast_forward": true
3511                    })
3512                }),
3513            );
3514        }
3515
3516        // Migrations — create the `red_migrations` / `red_migration_deps`
3517        // system collections on first boot. Idempotent.
3518        {
3519            let store = runtime.inner.db.store();
3520            for name in crate::application::migration_collections::ALL {
3521                let _ = store.get_or_create_collection(*name);
3522            }
3523        }
3524
3525        // Topology graph (#803) — ensure the built-in `red.topology.cluster`
3526        // graph collection (declared WITH ANALYTICS) and its metadata sidecar
3527        // exist. Idempotent and survives restarts via the WAL-backed contract.
3528        let _ = crate::application::topology_collections::ensure(&runtime);
3529
3530        // Start background maintenance thread (context index refresh +
3531        // session purge). Held by a WEAK reference to `RuntimeInner`
3532        // so dropping the last `RedDBRuntime` handle actually releases
3533        // the underlying Arc<Pager> (and its file lock). Polling at
3534        // 200ms means shutdown latency is bounded; the real 60-second
3535        // work cadence is tracked independently via a `last_work`
3536        // timestamp.
3537        //
3538        // The previous version captured `rt = runtime.clone()` by
3539        // strong reference and ran an unterminated `loop`, which held
3540        // Arc<RuntimeInner> forever — reopening a persistent database
3541        // in the same process failed with "Database is locked" because
3542        // the pager could never drop. See the regression test
3543        // `finding_1_select_after_bulk_insert_persistent_reopen`.
3544        {
3545            let weak = Arc::downgrade(&runtime.inner);
3546            std::thread::Builder::new()
3547                .name("reddb-maintenance".into())
3548                .spawn(move || {
3549                    let tick = std::time::Duration::from_millis(200);
3550                    let work_interval = std::time::Duration::from_secs(60);
3551                    let mut last_work = std::time::Instant::now();
3552                    loop {
3553                        std::thread::sleep(tick);
3554                        let Some(inner) = weak.upgrade() else {
3555                            // All strong references dropped — the
3556                            // runtime is gone, exit cleanly.
3557                            break;
3558                        };
3559                        if last_work.elapsed() >= work_interval {
3560                            let _stats = inner.db.store().context_index().stats();
3561                            last_work = std::time::Instant::now();
3562                        }
3563                    }
3564                })
3565                .ok();
3566        }
3567
3568        // Start backup scheduler if enabled via red_config
3569        {
3570            let store = runtime.inner.db.store();
3571            let mut backup_enabled = false;
3572            let mut backup_interval = 3600u64;
3573
3574            if let Some(manager) = store.get_collection("red_config") {
3575                manager.for_each_entity(|entity| {
3576                    if let Some(row) = entity.data.as_row() {
3577                        let key = row.get_field("key").and_then(|v| match v {
3578                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3579                            _ => None,
3580                        });
3581                        let val = row.get_field("value");
3582                        if key == Some("red.config.backup.enabled") {
3583                            backup_enabled = match val {
3584                                Some(crate::storage::schema::Value::Boolean(true)) => true,
3585                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
3586                                _ => false,
3587                            };
3588                        } else if key == Some("red.config.backup.interval_secs") {
3589                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
3590                                backup_interval = *n as u64;
3591                            }
3592                        }
3593                    }
3594                    true
3595                });
3596            }
3597
3598            if backup_enabled {
3599                runtime.inner.backup_scheduler.set_interval(backup_interval);
3600                let rt = runtime.clone();
3601                runtime
3602                    .inner
3603                    .backup_scheduler
3604                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
3605            }
3606        }
3607
3608        // Load EC registry from red_config and start worker
3609        {
3610            runtime
3611                .inner
3612                .ec_registry
3613                .load_from_config_store(runtime.inner.db.store().as_ref());
3614            if !runtime.inner.ec_registry.async_configs().is_empty() {
3615                runtime.inner.ec_worker.start(
3616                    Arc::clone(&runtime.inner.ec_registry),
3617                    Arc::clone(&runtime.inner.db.store()),
3618                );
3619            }
3620        }
3621
3622        if let crate::replication::ReplicationRole::Replica { primary_addr } =
3623            runtime.inner.db.options().replication.role.clone()
3624        {
3625            let rt = runtime.clone();
3626            std::thread::Builder::new()
3627                .name("reddb-replica".into())
3628                .spawn(move || rt.run_replica_loop(primary_addr))
3629                .ok();
3630        }
3631
3632        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
3633        // boot stage above has completed (WAL replay, restore-from-
3634        // remote, replica-loop spawn). Health probes flip from 503 to
3635        // 200 here; shutdown begins from this state.
3636        runtime.inner.lifecycle.mark_ready();
3637
3638        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
3639        // Low-priority background ticker that drains the cache's
3640        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
3641        // so the thread exits cleanly when the runtime drops (≤50ms
3642        // latency between drop and exit). Materialized views without
3643        // a `REFRESH EVERY` clause stay on the manual-refresh path
3644        // and are skipped by `claim_due_at`, so the loop is a no-op
3645        // when no scheduled views exist.
3646        {
3647            let weak_inner = Arc::downgrade(&runtime.inner);
3648            std::thread::Builder::new()
3649                .name("reddb-mv-scheduler".into())
3650                .spawn(move || loop {
3651                    std::thread::sleep(std::time::Duration::from_millis(50));
3652                    let Some(inner) = weak_inner.upgrade() else {
3653                        break;
3654                    };
3655                    let rt = RedDBRuntime { inner };
3656                    rt.refresh_due_materialized_views();
3657                })
3658                .ok();
3659        }
3660
3661        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
3662        // Low-priority ticker that physically reclaims rows whose
3663        // timestamp has fallen beyond the retention window. Holds a
3664        // `Weak<RuntimeInner>` so the thread exits within one tick of
3665        // the runtime drop (graceful shutdown leaves storage consistent
3666        // because each tick goes through the standard DELETE path —
3667        // there is no half-finished mutation state to clean up). The
3668        // tick interval is intentionally longer than the MV scheduler
3669        // (500ms) because retention is order-of-seconds at minimum.
3670        if !runtime.write_gate().is_read_only() {
3671            let weak_inner = Arc::downgrade(&runtime.inner);
3672            std::thread::Builder::new()
3673                .name("reddb-retention-sweeper".into())
3674                .spawn(move || loop {
3675                    std::thread::sleep(std::time::Duration::from_millis(500));
3676                    let Some(inner) = weak_inner.upgrade() else {
3677                        break;
3678                    };
3679                    let rt = RedDBRuntime { inner };
3680                    rt.sweep_retention_tick(
3681                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
3682                    );
3683                })
3684                .ok();
3685        }
3686
3687        Ok(runtime)
3688    }
3689
3690    fn rehydrate_snapshot_xid_floor(&self) {
3691        let store = self.inner.db.store();
3692        for collection in store.list_collections() {
3693            let Some(manager) = store.get_collection(&collection) else {
3694                continue;
3695            };
3696            for entity in manager.query_all(|_| true) {
3697                self.inner
3698                    .snapshot_manager
3699                    .observe_committed_xid(entity.xmin);
3700                self.inner
3701                    .snapshot_manager
3702                    .observe_committed_xid(entity.xmax);
3703            }
3704        }
3705    }
3706
3707    /// Provision an empty Table-shaped collection that backs a
3708    /// `CREATE MATERIALIZED VIEW v` (issue #594 slice 9b of #575).
3709    /// `SELECT FROM v` reads this collection directly; the rewriter is
3710    /// configured to skip materialized views so the body is no longer
3711    /// substituted. REFRESH still writes to the cache slot — wiring it
3712    /// into this backing collection is the job of slice 9c.
3713    ///
3714    /// Idempotent: re-running for the same name leaves the existing
3715    /// collection in place (mirrors `CREATE TABLE IF NOT EXISTS`
3716    /// semantics). This keeps `CREATE OR REPLACE MATERIALIZED VIEW v`
3717    /// cheap — the body change does not invalidate already-buffered
3718    /// rows. Until 9c lands the backing is always empty anyway.
3719    pub(crate) fn ensure_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3720        let store = self.inner.db.store();
3721        let mut changed = false;
3722        if store.get_collection(name).is_none() {
3723            store.get_or_create_collection(name);
3724            changed = true;
3725        }
3726        if self.inner.db.collection_contract(name).is_none() {
3727            self.inner
3728                .db
3729                .save_collection_contract(system_keyed_collection_contract(
3730                    name,
3731                    crate::catalog::CollectionModel::Table,
3732                ))
3733                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3734            changed = true;
3735        }
3736        if changed {
3737            self.inner
3738                .db
3739                .persist_metadata()
3740                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3741        }
3742        Ok(())
3743    }
3744
3745    /// Inverse of [`ensure_materialized_view_backing`] — drops the
3746    /// backing collection on `DROP MATERIALIZED VIEW v`. No-op when
3747    /// the collection was never created (e.g. a `DROP MATERIALIZED
3748    /// VIEW IF EXISTS v` against an unknown name).
3749    pub(crate) fn drop_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3750        let store = self.inner.db.store();
3751        if store.get_collection(name).is_none() {
3752            return Ok(());
3753        }
3754        store
3755            .drop_collection(name)
3756            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3757        // The contract may have been dropped already (DROP TABLE path)
3758        // — ignore "not found" errors by checking presence first.
3759        if self.inner.db.collection_contract(name).is_some() {
3760            self.inner
3761                .db
3762                .remove_collection_contract(name)
3763                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3764        }
3765        self.invalidate_result_cache();
3766        self.inner
3767            .db
3768            .persist_metadata()
3769            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3770        Ok(())
3771    }
3772
3773    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
3774        let mut changed = false;
3775        for (name, model) in [
3776            ("red.config", crate::catalog::CollectionModel::Config),
3777            ("red.vault", crate::catalog::CollectionModel::Vault),
3778            // Issue #593 — materialized-view catalog. One row per
3779            // `CREATE MATERIALIZED VIEW`; rehydrated at boot before
3780            // the API opens.
3781            (
3782                crate::runtime::continuous_materialized_view::CATALOG_COLLECTION,
3783                crate::catalog::CollectionModel::Config,
3784            ),
3785        ] {
3786            if self.inner.db.store().get_collection(name).is_none() {
3787                self.inner.db.store().get_or_create_collection(name);
3788                changed = true;
3789            }
3790            if self.inner.db.collection_contract(name).is_none() {
3791                self.inner
3792                    .db
3793                    .save_collection_contract(system_keyed_collection_contract(name, model))
3794                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
3795                changed = true;
3796            }
3797        }
3798        if changed {
3799            self.inner
3800                .db
3801                .persist_metadata()
3802                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3803        }
3804        Ok(())
3805    }
3806
3807    pub fn db(&self) -> Arc<RedDB> {
3808        Arc::clone(&self.inner.db)
3809    }
3810
3811    /// Direct access to the runtime's secondary-index store.
3812    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
3813    /// wire bulk) that need to push new rows through the per-index
3814    /// maintenance hook after `store.bulk_insert` returns.
3815    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
3816        &self.inner.index_store
3817    }
3818
3819    /// Apply a DDL event to the schema-vocabulary reverse index
3820    /// (issue #120). Called by DDL execution paths after the catalog
3821    /// mutation has succeeded so the index never holds entries for
3822    /// half-applied DDL.
3823    pub(crate) fn schema_vocabulary_apply(
3824        &self,
3825        event: crate::runtime::schema_vocabulary::DdlEvent,
3826    ) {
3827        self.inner.schema_vocabulary.write().on_ddl(event);
3828    }
3829
3830    /// Lookup `token` in the schema-vocabulary reverse index. Returns
3831    /// an owned `Vec<VocabHit>` because the underlying read lock
3832    /// cannot be borrowed across the call boundary; the slice from
3833    /// `SchemaVocabulary::lookup` is cloned per hit.
3834    pub fn schema_vocabulary_lookup(
3835        &self,
3836        token: &str,
3837    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
3838        self.inner.schema_vocabulary.read().lookup(token).to_vec()
3839    }
3840
3841    /// Inject an AuthStore into the runtime. Called by server boot
3842    /// after the vault has been bootstrapped, so that `Value::Secret`
3843    /// auto-encrypt/decrypt can reach the vault AES key.
3844    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
3845        *self.inner.auth_store.write() = Some(store);
3846    }
3847
3848    /// Snapshot the current AuthStore (if any). Used by the wire listener
3849    /// to validate bearer tokens issued via HTTP `/auth/login`.
3850    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
3851        self.inner.auth_store.read().clone()
3852    }
3853
3854    /// Read a vault KV secret from the configured AuthStore, if present.
3855    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
3856        self.inner
3857            .auth_store
3858            .read()
3859            .as_ref()
3860            .and_then(|store| store.vault_kv_get(key))
3861    }
3862
3863    /// Write a vault KV secret and fail if the encrypted vault write is
3864    /// unavailable or cannot be made durable.
3865    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
3866        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
3867            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
3868        })?;
3869        store
3870            .vault_kv_try_set(key, value)
3871            .map_err(|err| RedDBError::Query(err.to_string()))
3872    }
3873
3874    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
3875    /// wire transports try OAuth JWT validation before falling back to
3876    /// the local AuthStore lookup. Pass `None` to disable.
3877    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
3878        *self.inner.oauth_validator.write() = validator;
3879    }
3880
3881    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
3882    /// Hot path: called per HTTP request when an Authorization header
3883    /// is present, so we hand back a cheap Arc clone.
3884    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
3885        self.inner.oauth_validator.read().clone()
3886    }
3887
3888    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
3889    /// store is wired and a key has been generated. Used by the
3890    /// `Value::Secret` encrypt/decrypt pipeline.
3891    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
3892        let guard = self.inner.auth_store.read();
3893        guard.as_ref().and_then(|s| s.vault_secret_key())
3894    }
3895
3896    /// Resolve a boolean flag from `red_config`. Defaults to `default`
3897    /// when the key is missing or not coercible. If the same key has
3898    /// been written multiple times (SET CONFIG appends new rows), the
3899    /// most recent entity wins. Env-var overrides
3900    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
3901    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
3902        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3903            if let Some(crate::storage::schema::Value::Boolean(b)) =
3904                crate::runtime::config_overlay::coerce_env_value(key, raw)
3905            {
3906                return b;
3907            }
3908        }
3909        let store = self.inner.db.store();
3910        let Some(manager) = store.get_collection("red_config") else {
3911            return default;
3912        };
3913        let mut result = default;
3914        let mut latest_id: u64 = 0;
3915        manager.for_each_entity(|entity| {
3916            if let Some(row) = entity.data.as_row() {
3917                let entry_key = row.get_field("key").and_then(|v| match v {
3918                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3919                    _ => None,
3920                });
3921                if entry_key == Some(key) {
3922                    let id = entity.id.raw();
3923                    if id >= latest_id {
3924                        latest_id = id;
3925                        result = match row.get_field("value") {
3926                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
3927                            Some(crate::storage::schema::Value::Text(s)) => {
3928                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
3929                            }
3930                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
3931                            _ => default,
3932                        };
3933                    }
3934                }
3935            }
3936            true
3937        });
3938        result
3939    }
3940
3941    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
3942        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3943            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
3944                crate::runtime::config_overlay::coerce_env_value(key, raw)
3945            {
3946                return n;
3947            }
3948        }
3949        let store = self.inner.db.store();
3950        let Some(manager) = store.get_collection("red_config") else {
3951            return default;
3952        };
3953        let mut result = default;
3954        let mut latest_id: u64 = 0;
3955        manager.for_each_entity(|entity| {
3956            if let Some(row) = entity.data.as_row() {
3957                let entry_key = row.get_field("key").and_then(|v| match v {
3958                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3959                    _ => None,
3960                });
3961                if entry_key == Some(key) {
3962                    let id = entity.id.raw();
3963                    if id >= latest_id {
3964                        latest_id = id;
3965                        result = match row.get_field("value") {
3966                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
3967                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
3968                            Some(crate::storage::schema::Value::Text(s)) => {
3969                                s.parse::<u64>().unwrap_or(default)
3970                            }
3971                            _ => default,
3972                        };
3973                    }
3974                }
3975            }
3976            true
3977        });
3978        result
3979    }
3980
3981    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
3982        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3983            if let Ok(n) = raw.parse::<f64>() {
3984                return n;
3985            }
3986        }
3987        let store = self.inner.db.store();
3988        let Some(manager) = store.get_collection("red_config") else {
3989            return default;
3990        };
3991        let mut result = default;
3992        let mut latest_id: u64 = 0;
3993        manager.for_each_entity(|entity| {
3994            if let Some(row) = entity.data.as_row() {
3995                let entry_key = row.get_field("key").and_then(|v| match v {
3996                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3997                    _ => None,
3998                });
3999                if entry_key == Some(key) {
4000                    let id = entity.id.raw();
4001                    if id >= latest_id {
4002                        latest_id = id;
4003                        result = match row.get_field("value") {
4004                            Some(crate::storage::schema::Value::Float(n)) => *n,
4005                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
4006                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
4007                            Some(crate::storage::schema::Value::Text(s)) => {
4008                                s.parse::<f64>().unwrap_or(default)
4009                            }
4010                            _ => default,
4011                        };
4012                    }
4013                }
4014            }
4015            true
4016        });
4017        result
4018    }
4019
4020    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
4021        if let Some(raw) = self.inner.env_config_overrides.get(key) {
4022            return raw.clone();
4023        }
4024        let store = self.inner.db.store();
4025        let Some(manager) = store.get_collection("red_config") else {
4026            return default.to_string();
4027        };
4028        let mut result = default.to_string();
4029        let mut latest_id: u64 = 0;
4030        manager.for_each_entity(|entity| {
4031            if let Some(row) = entity.data.as_row() {
4032                let entry_key = row.get_field("key").and_then(|v| match v {
4033                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
4034                    _ => None,
4035                });
4036                if entry_key == Some(key) {
4037                    let id = entity.id.raw();
4038                    if id >= latest_id {
4039                        latest_id = id;
4040                        if let Some(crate::storage::schema::Value::Text(value)) =
4041                            row.get_field("value")
4042                        {
4043                            result = value.to_string();
4044                        }
4045                    }
4046                }
4047            }
4048            true
4049        });
4050        result
4051    }
4052
4053    fn latest_metadata_for(
4054        &self,
4055        collection: &str,
4056        entity_id: u64,
4057    ) -> Option<crate::serde_json::Value> {
4058        self.inner
4059            .db
4060            .store()
4061            .get_metadata(collection, EntityId::new(entity_id))
4062            .map(|metadata| metadata_to_json(&metadata))
4063    }
4064
4065    fn persist_replica_lsn(&self, lsn: u64) {
4066        self.inner.db.store().set_config_tree(
4067            "red.replication",
4068            &crate::json!({
4069                "last_applied_lsn": lsn
4070            }),
4071        );
4072    }
4073
4074    /// Resolve this replica's stable identity (issue #812). The primary keys
4075    /// per-replica progress off this id, so it MUST be stable across reboots
4076    /// — a changing id would make the primary treat every restart as a brand
4077    /// new replica. Honours an operator-configured `red.replication.replica_id`
4078    /// first; otherwise generates one once and persists it so the next boot
4079    /// reuses the same value.
4080    fn resolve_replica_id(&self) -> String {
4081        let configured = self.config_string("red.replication.replica_id", "");
4082        if !configured.is_empty() {
4083            return configured;
4084        }
4085        let generated = crate::crypto::uuid::Uuid::new_v4().to_string();
4086        self.inner.db.store().set_config_tree(
4087            "red.replication",
4088            &crate::json!({
4089                "replica_id": generated.clone()
4090            }),
4091        );
4092        generated
4093    }
4094
4095    fn persist_replication_health(
4096        &self,
4097        state: &str,
4098        last_error: &str,
4099        primary_lsn: Option<u64>,
4100        oldest_available_lsn: Option<u64>,
4101    ) {
4102        self.inner.db.store().set_config_tree(
4103            "red.replication",
4104            &crate::json!({
4105                "state": state,
4106                "last_error": last_error,
4107                "last_seen_primary_lsn": primary_lsn.unwrap_or(0),
4108                "last_seen_oldest_lsn": oldest_available_lsn.unwrap_or(0),
4109                "updated_at_unix_ms": SystemTime::now()
4110                    .duration_since(UNIX_EPOCH)
4111                    .unwrap_or_default()
4112                    .as_millis() as u64
4113            }),
4114        );
4115    }
4116
4117    /// Whether `SECRET('...')` literals should be encrypted with the
4118    /// vault AES key on INSERT. Default `true`.
4119    pub(crate) fn secret_auto_encrypt(&self) -> bool {
4120        self.config_bool("red.config.secret.auto_encrypt", true)
4121    }
4122
4123    /// Whether `Value::Secret` columns should be decrypted back to
4124    /// plaintext on SELECT when the vault is unsealed. Default `true`.
4125    /// Turning this off keeps secrets masked as `***` even while the
4126    /// vault is open — useful for audit trails or read-only exports.
4127    pub(crate) fn secret_auto_decrypt(&self) -> bool {
4128        self.config_bool("red.config.secret.auto_decrypt", true)
4129    }
4130
4131    /// Walk every record in `result` and swap `Value::Secret(bytes)`
4132    /// for the decrypted plaintext when the runtime has the vault
4133    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
4134    /// key is missing, the vault is sealed, or auto_decrypt is off,
4135    /// secrets are left as `Value::Secret` which every formatter
4136    /// (Display, JSON) already masks as `***`.
4137    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
4138        if !self.secret_auto_decrypt() {
4139            return;
4140        }
4141        let Some(key) = self.secret_aes_key() else {
4142            return;
4143        };
4144        for record in result.result.records.iter_mut() {
4145            for value in record.values_mut() {
4146                if let Value::Secret(ref bytes) = value {
4147                    if let Some(plain) =
4148                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
4149                    {
4150                        if let Ok(text) = String::from_utf8(plain) {
4151                            *value = Value::text(text);
4152                        }
4153                    }
4154                }
4155            }
4156        }
4157    }
4158
4159    /// Emit a CDC change event and replicate to WAL buffer.
4160    /// Create a `MutationEngine` bound to this runtime.
4161    ///
4162    /// The engine is cheap to construct (no allocation) and should be
4163    /// dropped after `apply` returns. Use this from application-layer
4164    /// `create_row` / `create_rows_batch` instead of calling
4165    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
4166    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
4167        crate::runtime::mutation::MutationEngine::new(self)
4168    }
4169
4170    /// Public-mutation gate snapshot (PLAN.md W1).
4171    ///
4172    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
4173    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
4174    /// maintenance, serverless lifecycle) call `check_write` before
4175    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
4176    /// instance running as a replica or with `options.read_only =
4177    /// true`. The replica internal logical-WAL apply path reaches into
4178    /// the store directly and never calls this method, so legitimate
4179    /// replica catch-up still works.
4180    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
4181        self.inner.write_gate.check(kind)
4182    }
4183
4184    /// Read-only handle to the gate, useful for transports that want
4185    /// to surface the policy in health/status output without taking on
4186    /// a dependency on the concrete enum.
4187    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
4188        &self.inner.write_gate
4189    }
4190
4191    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
4192    /// admin/shutdown, and signal handlers consult this single
4193    /// state machine.
4194    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
4195        &self.inner.lifecycle
4196    }
4197
4198    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
4199    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
4200        &self.inner.resource_limits
4201    }
4202
4203    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
4204    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
4205        &self.inner.audit_log
4206    }
4207
4208    /// Shared `Arc` to the audit logger — used by collaborators (the
4209    /// lease lifecycle, future request-context plumbing) that need to
4210    /// keep the logger alive past the runtime's stack frame.
4211    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
4212        Arc::clone(&self.inner.audit_log)
4213    }
4214
4215    pub(crate) fn emit_control_event(
4216        &self,
4217        kind: crate::runtime::control_events::EventKind,
4218        outcome: crate::runtime::control_events::Outcome,
4219        action: &'static str,
4220        resource: Option<String>,
4221        reason: Option<String>,
4222        extra_fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
4223    ) -> RedDBResult<()> {
4224        use crate::runtime::control_events::{
4225            ActorRef, ControlEvent, ControlEventCtx, ControlEventLedger, Sensitivity,
4226        };
4227
4228        let tenant = current_tenant();
4229        let principal = current_auth_identity();
4230        let actor_user = principal
4231            .as_ref()
4232            .map(|(principal, _)| UserId::from_parts(tenant.as_deref(), principal));
4233        let actor = actor_user
4234            .as_ref()
4235            .map(ActorRef::User)
4236            .unwrap_or(ActorRef::Anonymous);
4237        let ctx = ControlEventCtx {
4238            actor,
4239            scope: tenant
4240                .as_ref()
4241                .map(|scope| std::borrow::Cow::Borrowed(scope.as_str())),
4242            request_id: Some(std::borrow::Cow::Owned(format!(
4243                "conn-{}",
4244                current_connection_id()
4245            ))),
4246            trace_id: None,
4247        };
4248        let mut fields = std::collections::HashMap::new();
4249        fields.insert(
4250            "connection_id".to_string(),
4251            Sensitivity::raw(current_connection_id().to_string()),
4252        );
4253        if let Some((_, role)) = principal {
4254            fields.insert("actor_role".to_string(), Sensitivity::raw(role.as_str()));
4255        }
4256        for (key, value) in extra_fields {
4257            fields.insert(key, value);
4258        }
4259        let event = ControlEvent {
4260            kind,
4261            outcome,
4262            action: std::borrow::Cow::Borrowed(action),
4263            resource,
4264            reason,
4265            matched_policy_id: None,
4266            fields,
4267        };
4268        let ledger = self.inner.control_event_ledger.read();
4269        match ledger.emit(&ctx, event) {
4270            Ok(_) => Ok(()),
4271            Err(err) if self.inner.control_event_config.require_persistence() => {
4272                Err(RedDBError::Internal(err.to_string()))
4273            }
4274            Err(_) => Ok(()),
4275        }
4276    }
4277
4278    fn policy_mutation_control_ctx<'a>(
4279        &self,
4280        actor: &'a crate::auth::UserId,
4281        tenant: Option<&'a str>,
4282    ) -> crate::runtime::control_events::ControlEventCtx<'a> {
4283        crate::runtime::control_events::ControlEventCtx {
4284            actor: crate::runtime::control_events::ActorRef::User(actor),
4285            scope: tenant.map(std::borrow::Cow::Borrowed),
4286            request_id: Some(std::borrow::Cow::Owned(format!(
4287                "conn-{}",
4288                current_connection_id()
4289            ))),
4290            trace_id: None,
4291        }
4292    }
4293
4294    fn emit_query_audit(
4295        &self,
4296        query: &str,
4297        plan: &QueryAuditPlan,
4298        duration_ms: u64,
4299        result: &RuntimeQueryResult,
4300    ) {
4301        if !self.inner.query_audit.has_rules() {
4302            return;
4303        }
4304        let actor = current_auth_identity().map(|(principal, _)| principal);
4305        let tenant = current_tenant();
4306        let row_count = if result.statement_type == "select" {
4307            result.result.records.len() as u64
4308        } else {
4309            result.affected_rows
4310        };
4311        self.inner
4312            .query_audit
4313            .emit(crate::runtime::query_audit::QueryAuditEvent {
4314                actor,
4315                tenant,
4316                statement_kind: plan.statement_kind,
4317                touched_collections: plan.collections.clone(),
4318                duration_ms,
4319                row_count,
4320                request_id: Some(crate::crypto::uuid::Uuid::new_v7().to_string()),
4321                query_hash: Some(blake3::hash(query.as_bytes()).to_hex().to_string()),
4322            });
4323    }
4324
4325    /// Slice 10 of issue #527 — shared queue telemetry counters
4326    /// (delivered/acked/nacked). Cloned by `queue_delivery.rs` on
4327    /// each transition.
4328    pub(crate) fn queue_telemetry(
4329        &self,
4330    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
4331        &self.inner.queue_telemetry
4332    }
4333
4334    /// Snapshots of the queue telemetry counters in label-deterministic
4335    /// order for `/metrics` rendering and the integration test.
4336    pub fn queue_telemetry_snapshot(
4337        &self,
4338    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
4339        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
4340            delivered: self.inner.queue_telemetry.delivered_snapshot(),
4341            acked: self.inner.queue_telemetry.acked_snapshot(),
4342            nacked: self.inner.queue_telemetry.nacked_snapshot(),
4343            wait_started: self.inner.queue_telemetry.wait_started_snapshot(),
4344            wait_woken: self.inner.queue_telemetry.wait_woken_snapshot(),
4345            wait_timed_out: self.inner.queue_telemetry.wait_timed_out_snapshot(),
4346            wait_cancelled: self.inner.queue_telemetry.wait_cancelled_snapshot(),
4347            wait_duration: self.inner.queue_telemetry.wait_duration_snapshot(),
4348        }
4349    }
4350
4351    /// Issue #742 — consumer presence registry. Heartbeats land here
4352    /// from `QUEUE READ` (and, in a follow-up slice, an explicit
4353    /// `QUEUE HEARTBEAT` command); Red UI and `red.queue_consumers`
4354    /// read snapshots through `queue_consumer_presence_snapshot`.
4355    pub(crate) fn queue_presence(
4356        &self,
4357    ) -> &std::sync::Arc<crate::storage::queue::presence::ConsumerPresenceRegistry> {
4358        &self.inner.queue_presence
4359    }
4360
4361    /// Issue #742 — point-in-time presence snapshot, classifying each
4362    /// `(queue, group, consumer)` as active/stale/expired against the
4363    /// supplied TTL. Wall-clock is read once here so the lifecycle
4364    /// flags inside the snapshot are internally consistent.
4365    pub fn queue_consumer_presence_snapshot(
4366        &self,
4367        ttl_ms: u64,
4368    ) -> Vec<crate::storage::queue::presence::ConsumerPresence> {
4369        let now_ns = std::time::SystemTime::now()
4370            .duration_since(std::time::UNIX_EPOCH)
4371            .map(|d| d.as_nanos() as u64)
4372            .unwrap_or(0);
4373        self.inner.queue_presence.snapshot(now_ns, ttl_ms)
4374    }
4375
4376    /// Issue #742 — active-consumer count per `(queue, group)` for the
4377    /// queue-metadata surface. Stale/expired entries are excluded by
4378    /// definition; they are still visible in the per-row snapshot.
4379    pub fn queue_active_consumer_counts(
4380        &self,
4381        ttl_ms: u64,
4382    ) -> std::collections::HashMap<(String, String), u32> {
4383        let now_ns = std::time::SystemTime::now()
4384            .duration_since(std::time::UNIX_EPOCH)
4385            .map(|d| d.as_nanos() as u64)
4386            .unwrap_or(0);
4387        self.inner
4388            .queue_presence
4389            .count_active_by_group(now_ns, ttl_ms)
4390    }
4391
4392    /// Issue #743 — vector + TurboQuant introspection registry. Engine
4393    /// publish points (collection create, artifact build start /
4394    /// finish, fallback toggle, drop) update this; Red UI and
4395    /// `red.*` vector virtual tables read snapshots through
4396    /// `vector_introspection_snapshot` / `vector_introspection_get`.
4397    pub(crate) fn vector_introspection_registry(
4398        &self,
4399    ) -> &std::sync::Arc<crate::storage::vector::introspection::VectorIntrospectionRegistry> {
4400        &self.inner.vector_introspection
4401    }
4402
4403    /// Issue #743 — full snapshot of every tracked vector collection's
4404    /// `(VectorMetadata, ArtifactMetadata)`. Deterministically ordered
4405    /// by collection name so Red UI tables and tests both see a
4406    /// stable shape.
4407    pub fn vector_introspection_snapshot(
4408        &self,
4409    ) -> Vec<crate::storage::vector::introspection::VectorIntrospection> {
4410        self.inner.vector_introspection.snapshot()
4411    }
4412
4413    /// Issue #743 — single-collection lookup, for the per-collection
4414    /// metadata endpoint Red UI hits when an operator opens one
4415    /// vector's toolbar.
4416    pub fn vector_introspection_get(
4417        &self,
4418        collection: &str,
4419    ) -> Option<crate::storage::vector::introspection::VectorIntrospection> {
4420        self.inner.vector_introspection.get(collection)
4421    }
4422
4423    /// Slice 10 of issue #527 — render-time scan of pending entries
4424    /// per (queue, group) for the `queue_pending_gauge` exposition.
4425    /// Walks `red_queue_meta` live so the gauge cannot drift from
4426    /// the source of truth.
4427    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
4428        let store = self.inner.db.store();
4429        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
4430            .into_iter()
4431            .collect()
4432    }
4433
4434    /// Shared `Arc` to the write gate. Same rationale as
4435    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
4436    /// thread) need a clone-cheap handle they can move into a
4437    /// background thread.
4438    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
4439        Arc::clone(&self.inner.write_gate)
4440    }
4441
4442    /// Serverless writer-lease state machine. `None` when the operator
4443    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
4444    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
4445        self.inner.lease_lifecycle.get()
4446    }
4447
4448    /// Install the lease lifecycle. Idempotent; subsequent calls
4449    /// return the previously stored value untouched.
4450    pub fn set_lease_lifecycle(
4451        &self,
4452        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
4453    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
4454        self.inner.lease_lifecycle.set(lifecycle)
4455    }
4456
4457    /// Reject the call when the requested batch size exceeds
4458    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
4459    /// shaped so the HTTP layer can map it to 413 Payload Too
4460    /// Large (PLAN.md Phase 4.1).
4461    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
4462        if self.inner.resource_limits.batch_size_exceeded(requested) {
4463            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
4464            return Err(RedDBError::QuotaExceeded(format!(
4465                "max_batch_size:{requested}:{max}"
4466            )));
4467        }
4468        Ok(())
4469    }
4470
4471    /// Reject the call when the local DB file exceeds
4472    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
4473    /// the cost is a single `stat()` syscall, negligible against the
4474    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
4475    /// for HTTP 507 Insufficient Storage.
4476    pub fn check_db_size(&self) -> RedDBResult<()> {
4477        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
4478            return Ok(());
4479        };
4480        if limit == 0 {
4481            return Ok(());
4482        }
4483        let Some(path) = self.inner.db.path() else {
4484            return Ok(());
4485        };
4486        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
4487        if current > limit {
4488            return Err(RedDBError::QuotaExceeded(format!(
4489                "max_db_size_bytes:{current}:{limit}"
4490            )));
4491        }
4492        Ok(())
4493    }
4494
4495    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
4496    ///
4497    /// Steps, in order, all idempotent across re-entrant calls:
4498    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
4499    ///      observe `Stopped` after first finishes).
4500    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
4501    ///      every acked write is durable on disk.
4502    ///   3. If `backup_on_shutdown == true` and a remote backend is
4503    ///      configured, run a synchronous `trigger_backup()` so the
4504    ///      remote head reflects the final state.
4505    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
4506    ///      return the cached report without re-running anything.
4507    ///
4508    /// On any error, the runtime is still marked `Stopped` so the
4509    /// process can exit; the caller logs the error context but does
4510    /// not retry the same shutdown — the operator can inspect the
4511    /// report fields to see which step failed.
4512    pub fn graceful_shutdown(
4513        &self,
4514        backup_on_shutdown: bool,
4515    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
4516        if !self.inner.lifecycle.begin_shutdown() {
4517            // Someone else already shut down (or is in flight). Return
4518            // the cached report so the HTTP caller and SIGTERM handler
4519            // get the same idempotent answer.
4520            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
4521        }
4522
4523        let started_ms = std::time::SystemTime::now()
4524            .duration_since(std::time::UNIX_EPOCH)
4525            .map(|d| d.as_millis() as u64)
4526            .unwrap_or(0);
4527        let mut report = crate::runtime::lifecycle::ShutdownReport {
4528            started_at_ms: started_ms,
4529            ..Default::default()
4530        };
4531
4532        // Flush WAL + run any pending checkpoint. Local fsync is
4533        // unconditional — even a lease-lost replica needs its WAL on
4534        // disk before exit so a future restore has the latest tail.
4535        // The remote upload is gated separately so a lost-lease writer
4536        // doesn't clobber the new holder's state on its way out.
4537        let flush_res = self.inner.db.flush_local_only();
4538        report.flushed_wal = flush_res.is_ok();
4539        report.final_checkpoint = flush_res.is_ok();
4540        if let Err(err) = &flush_res {
4541            tracing::error!(
4542                target: "reddb::lifecycle",
4543                error = %err,
4544                "graceful_shutdown: local flush failed"
4545            );
4546        } else if let Err(lease_err) =
4547            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
4548        {
4549            tracing::warn!(
4550                target: "reddb::serverless::lease",
4551                error = %lease_err,
4552                "graceful_shutdown: remote upload skipped — lease not held"
4553            );
4554        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
4555            tracing::error!(
4556                target: "reddb::lifecycle",
4557                error = %err,
4558                "graceful_shutdown: remote upload failed"
4559            );
4560        }
4561
4562        // Optional final backup. Skipped silently when no remote
4563        // backend is configured — `trigger_backup()` returns Err
4564        // anyway in that case, but logging it as a shutdown failure
4565        // would be misleading on a standalone (no-backend) runtime.
4566        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
4567            // The trigger_backup gate now reads `WriteKind::Backup`,
4568            // which a replica/read_only instance refuses. That's
4569            // intentional — replicas don't drive backups; only the
4570            // primary does. We still want shutdown to flush its WAL
4571            // even if the backup branch is gated off.
4572            match self.trigger_backup() {
4573                Ok(result) => {
4574                    report.backup_uploaded = result.uploaded;
4575                }
4576                Err(err) => {
4577                    tracing::warn!(
4578                        target: "reddb::lifecycle",
4579                        error = %err,
4580                        "graceful_shutdown: final backup skipped"
4581                    );
4582                }
4583            }
4584        }
4585
4586        let completed_ms = std::time::SystemTime::now()
4587            .duration_since(std::time::UNIX_EPOCH)
4588            .map(|d| d.as_millis() as u64)
4589            .unwrap_or(started_ms);
4590        report.completed_at_ms = completed_ms;
4591        report.duration_ms = completed_ms.saturating_sub(started_ms);
4592
4593        self.inner.lifecycle.finish_shutdown(report.clone());
4594        Ok(report)
4595    }
4596
4597    /// Emit a CDC record without invalidating the result cache.
4598    ///
4599    /// Used by `MutationEngine::append_batch` which calls
4600    /// `invalidate_result_cache` once for the whole batch before this
4601    /// loop, avoiding N write-lock acquisitions.
4602    pub(crate) fn cdc_emit_no_cache_invalidate(
4603        &self,
4604        operation: crate::replication::cdc::ChangeOperation,
4605        collection: &str,
4606        entity_id: u64,
4607        entity_kind: &str,
4608    ) -> u64 {
4609        let lsn = self
4610            .inner
4611            .cdc
4612            .emit(operation, collection, entity_id, entity_kind);
4613
4614        // Append to logical WAL replication buffer (if primary mode)
4615        if let Some(ref primary) = self.inner.db.replication {
4616            let store = self.inner.db.store();
4617            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
4618                None
4619            } else {
4620                store.get(collection, EntityId::new(entity_id))
4621            };
4622            let record = ChangeRecord {
4623                term: self.current_replication_term(),
4624                lsn,
4625                timestamp: SystemTime::now()
4626                    .duration_since(UNIX_EPOCH)
4627                    .unwrap_or_default()
4628                    .as_millis() as u64,
4629                operation,
4630                collection: collection.to_string(),
4631                entity_id,
4632                entity_kind: entity_kind.to_string(),
4633                entity_bytes: entity
4634                    .as_ref()
4635                    .map(|e| UnifiedStore::serialize_entity(e, store.format_version())),
4636                metadata: self.latest_metadata_for(collection, entity_id),
4637                refresh_records: None,
4638            };
4639            let encoded = record.encode();
4640            primary.append_logical_record(record.lsn, encoded);
4641        }
4642        lsn
4643    }
4644
4645    pub(crate) fn cdc_emit_insert_batch_no_cache_invalidate(
4646        &self,
4647        collection: &str,
4648        ids: &[EntityId],
4649        entity_kind: &str,
4650    ) -> Vec<u64> {
4651        if ids.is_empty() {
4652            return Vec::new();
4653        }
4654
4655        // Without logical replication, CDC only needs the in-memory event
4656        // ring. Reserve all LSNs and push the batch under one mutex instead
4657        // of taking the ring lock once per inserted row.
4658        if self.inner.db.replication.is_none() {
4659            return self.inner.cdc.emit_batch_same_collection(
4660                crate::replication::cdc::ChangeOperation::Insert,
4661                collection,
4662                entity_kind,
4663                ids.iter().map(|id| id.raw()),
4664            );
4665        }
4666
4667        // Replication needs one logical-WAL record per entity with the
4668        // serialized entity bytes, so keep the existing per-row path.
4669        ids.iter()
4670            .map(|id| {
4671                self.cdc_emit_no_cache_invalidate(
4672                    crate::replication::cdc::ChangeOperation::Insert,
4673                    collection,
4674                    id.raw(),
4675                    entity_kind,
4676                )
4677            })
4678            .collect()
4679    }
4680
4681    pub fn cdc_emit(
4682        &self,
4683        operation: crate::replication::cdc::ChangeOperation,
4684        collection: &str,
4685        entity_id: u64,
4686        entity_kind: &str,
4687    ) -> u64 {
4688        let lsn = self
4689            .inner
4690            .cdc
4691            .emit(operation, collection, entity_id, entity_kind);
4692        // Perf: prior to this we called `invalidate_result_cache()`
4693        // which wipes EVERY cached query, across every table, under
4694        // a write lock — turning each INSERT into a serialisation
4695        // point for all readers. Swap to the per-table variant so
4696        // unrelated query caches survive.
4697        self.invalidate_result_cache_for_table(collection);
4698
4699        // Append to logical WAL replication buffer (if primary mode)
4700        if let Some(ref primary) = self.inner.db.replication {
4701            let store = self.inner.db.store();
4702            let entity = if operation == crate::replication::cdc::ChangeOperation::Delete {
4703                None
4704            } else {
4705                store.get(collection, EntityId::new(entity_id))
4706            };
4707            let record = ChangeRecord {
4708                term: self.current_replication_term(),
4709                lsn,
4710                timestamp: SystemTime::now()
4711                    .duration_since(UNIX_EPOCH)
4712                    .unwrap_or_default()
4713                    .as_millis() as u64,
4714                operation,
4715                collection: collection.to_string(),
4716                entity_id,
4717                entity_kind: entity_kind.to_string(),
4718                entity_bytes: entity
4719                    .as_ref()
4720                    .map(|entity| UnifiedStore::serialize_entity(entity, store.format_version())),
4721                metadata: self.latest_metadata_for(collection, entity_id),
4722                refresh_records: None,
4723            };
4724            let encoded = record.encode();
4725            primary.append_logical_record(record.lsn, encoded);
4726        }
4727        lsn
4728    }
4729
4730    pub(crate) fn cdc_emit_kv(
4731        &self,
4732        operation: crate::replication::cdc::ChangeOperation,
4733        collection: &str,
4734        key: &str,
4735        entity_id: u64,
4736        before: Option<crate::json::Value>,
4737        after: Option<crate::json::Value>,
4738    ) -> u64 {
4739        let lsn = self
4740            .inner
4741            .cdc
4742            .emit_kv(operation, collection, key, entity_id, before, after);
4743        self.inner.kv_stats.incr_watch_events_emitted();
4744        self.invalidate_result_cache_for_table(collection);
4745        lsn
4746    }
4747
4748    pub(crate) fn record_kv_watch_event(
4749        &self,
4750        operation: crate::replication::cdc::ChangeOperation,
4751        collection: &str,
4752        key: &str,
4753        entity_id: u64,
4754        before: Option<crate::json::Value>,
4755        after: Option<crate::json::Value>,
4756    ) {
4757        if self.current_xid().is_some() {
4758            let conn_id = current_connection_id();
4759            let event = crate::replication::cdc::KvWatchEvent {
4760                collection: collection.to_string(),
4761                key: key.to_string(),
4762                op: operation,
4763                before,
4764                after,
4765                lsn: 0,
4766                committed_at: 0,
4767                dropped_event_count: 0,
4768            };
4769            self.inner
4770                .pending_kv_watch_events
4771                .write()
4772                .entry(conn_id)
4773                .or_default()
4774                .push(event);
4775            return;
4776        }
4777
4778        self.cdc_emit_kv(operation, collection, key, entity_id, before, after);
4779    }
4780
4781    pub(crate) fn cdc_emit_prebuilt(
4782        &self,
4783        operation: crate::replication::cdc::ChangeOperation,
4784        collection: &str,
4785        entity: &UnifiedEntity,
4786        entity_kind: &str,
4787        metadata: Option<&crate::storage::Metadata>,
4788        invalidate_cache: bool,
4789    ) -> u64 {
4790        self.cdc_emit_prebuilt_with_columns(
4791            operation,
4792            collection,
4793            entity,
4794            entity_kind,
4795            metadata,
4796            invalidate_cache,
4797            None,
4798        )
4799    }
4800
4801    /// `cdc_emit_prebuilt` plus the list of column names whose values
4802    /// changed on this update. Callers that have already computed a
4803    /// `RowDamageVector` pass it here so downstream CDC consumers can
4804    /// filter events by touched column without re-diffing.
4805    /// `changed_columns` is only meaningful for `Update` operations —
4806    /// insert and delete events ignore it.
4807    pub(crate) fn cdc_emit_prebuilt_with_columns(
4808        &self,
4809        operation: crate::replication::cdc::ChangeOperation,
4810        collection: &str,
4811        entity: &UnifiedEntity,
4812        entity_kind: &str,
4813        metadata: Option<&crate::storage::Metadata>,
4814        invalidate_cache: bool,
4815        changed_columns: Option<Vec<String>>,
4816    ) -> u64 {
4817        if invalidate_cache {
4818            self.invalidate_result_cache();
4819        }
4820
4821        let public_id = entity.logical_id().raw();
4822        let lsn = self.inner.cdc.emit_with_columns(
4823            operation,
4824            collection,
4825            public_id,
4826            entity_kind,
4827            changed_columns,
4828        );
4829
4830        if let Some(ref primary) = self.inner.db.replication {
4831            let store = self.inner.db.store();
4832            let record = ChangeRecord {
4833                term: self.current_replication_term(),
4834                lsn,
4835                timestamp: SystemTime::now()
4836                    .duration_since(UNIX_EPOCH)
4837                    .unwrap_or_default()
4838                    .as_millis() as u64,
4839                operation,
4840                collection: collection.to_string(),
4841                entity_id: entity.id.raw(),
4842                entity_kind: entity_kind.to_string(),
4843                entity_bytes: Some(UnifiedStore::serialize_entity(
4844                    entity,
4845                    store.format_version(),
4846                )),
4847                metadata: metadata
4848                    .map(metadata_to_json)
4849                    .or_else(|| self.latest_metadata_for(collection, entity.id.raw())),
4850                refresh_records: None,
4851            };
4852            let encoded = record.encode();
4853            primary.append_logical_record(record.lsn, encoded);
4854        }
4855
4856        lsn
4857    }
4858
4859    pub(crate) fn current_replication_term(&self) -> u64 {
4860        self.inner.db.options().replication.term
4861    }
4862
4863    pub(crate) fn cdc_emit_prebuilt_batch<'a, I>(
4864        &self,
4865        operation: crate::replication::cdc::ChangeOperation,
4866        entity_kind: &str,
4867        items: I,
4868        invalidate_cache: bool,
4869    ) where
4870        I: IntoIterator<
4871            Item = (
4872                &'a str,
4873                &'a UnifiedEntity,
4874                Option<&'a crate::storage::Metadata>,
4875            ),
4876        >,
4877    {
4878        let items: Vec<(&str, &UnifiedEntity, Option<&crate::storage::Metadata>)> =
4879            items.into_iter().collect();
4880        if items.is_empty() {
4881            return;
4882        }
4883
4884        if invalidate_cache {
4885            self.invalidate_result_cache();
4886        }
4887
4888        for (collection, entity, metadata) in items {
4889            self.cdc_emit_prebuilt(operation, collection, entity, entity_kind, metadata, false);
4890        }
4891    }
4892
4893    fn run_replica_loop(&self, primary_addr: String) {
4894        let endpoint = if primary_addr.starts_with("http") {
4895            primary_addr
4896        } else {
4897            format!("http://{primary_addr}")
4898        };
4899        let poll_ms = self.inner.db.options().replication.poll_interval_ms;
4900        let max_count = self.inner.db.options().replication.max_batch_size;
4901        let mut since_lsn = self.config_u64("red.replication.last_applied_lsn", 0);
4902        // Issue #812 — stable identity sent on every WAL pull so the primary
4903        // can self-register this replica and attribute pulls to it.
4904        let replica_id = self.resolve_replica_id();
4905
4906        let runtime = match tokio::runtime::Builder::new_current_thread()
4907            .enable_all()
4908            .build()
4909        {
4910            Ok(runtime) => runtime,
4911            Err(_) => return,
4912        };
4913
4914        runtime.block_on(async move {
4915            use crate::grpc::proto::red_db_client::RedDbClient;
4916            use crate::grpc::proto::JsonPayloadRequest;
4917
4918            let mut client = loop {
4919                match RedDbClient::connect(endpoint.clone()).await {
4920                    Ok(client) => {
4921                        self.persist_replication_health("connecting", "", None, None);
4922                        break client;
4923                    }
4924                    Err(_) => {
4925                        self.persist_replication_health(
4926                            "connecting",
4927                            "waiting for primary connection",
4928                            None,
4929                            None,
4930                        );
4931                        std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)))
4932                    }
4933                }
4934            };
4935
4936            // PLAN.md Phase 11.5 — stateful applier guards LSN
4937            // monotonicity across pulls. Seed with the persisted
4938            // `last_applied_lsn` so reboots don't lose the chain
4939            // pointer.
4940            let applier = crate::replication::logical::LogicalChangeApplier::with_metrics(
4941                since_lsn,
4942                self.inner.replica_apply_metrics.clone(),
4943            );
4944
4945            loop {
4946                let payload = crate::json!({
4947                    "since_lsn": since_lsn,
4948                    "max_count": max_count,
4949                    "replica_id": replica_id,
4950                    "await_data": true,
4951                    "await_timeout_ms": 30_000
4952                });
4953                let request = tonic::Request::new(JsonPayloadRequest {
4954                    payload_json: crate::json::to_string(&payload)
4955                        .unwrap_or_else(|_| "{}".to_string()),
4956                });
4957
4958                if let Ok(response) = client.pull_wal_records(request).await {
4959                    if let Ok(value) =
4960                        crate::json::from_str::<crate::json::Value>(&response.into_inner().payload)
4961                    {
4962                        let current_lsn =
4963                            value.get("current_lsn").and_then(crate::json::Value::as_u64);
4964                        let oldest_available_lsn = value
4965                            .get("oldest_available_lsn")
4966                            .and_then(crate::json::Value::as_u64);
4967                        if value
4968                            .get("needs_rebootstrap")
4969                            .and_then(crate::json::Value::as_bool)
4970                            .unwrap_or(false)
4971                        {
4972                            let reason = value
4973                                .get("invalidation_reason")
4974                                .and_then(crate::json::Value::as_str)
4975                                .unwrap_or("unknown");
4976                            self.persist_replication_health(
4977                                "rebootstrap_required",
4978                                &format!("replication slot invalidated ({reason}); re-bootstrap required"),
4979                                current_lsn,
4980                                oldest_available_lsn,
4981                            );
4982                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
4983                            continue;
4984                        }
4985                        if since_lsn > 0
4986                            && oldest_available_lsn
4987                                .map(|oldest| oldest > since_lsn.saturating_add(1))
4988                                .unwrap_or(false)
4989                        {
4990                            self.persist_replication_health(
4991                                "rebootstrap_required",
4992                                "replica is behind the oldest logical WAL available on primary; re-bootstrap required",
4993                                current_lsn,
4994                                oldest_available_lsn,
4995                            );
4996                            std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
4997                            continue;
4998                        }
4999                        if let Some(records) =
5000                            value.get("records").and_then(crate::json::Value::as_array)
5001                        {
5002                            let mut batch_applied_lsn = None;
5003                            let mut ack_failed = false;
5004                            for record in records {
5005                                let Some(data_hex) =
5006                                    record.get("data").and_then(crate::json::Value::as_str)
5007                                else {
5008                                    continue;
5009                                };
5010                                let Ok(data) = hex::decode(data_hex) else {
5011                                    self.inner.replica_apply_metrics.record(
5012                                        crate::replication::logical::ApplyErrorKind::Decode,
5013                                    );
5014                                    self.persist_replication_health(
5015                                        "apply_error",
5016                                        "failed to decode WAL record hex payload",
5017                                        current_lsn,
5018                                        oldest_available_lsn,
5019                                    );
5020                                    continue;
5021                                };
5022                                let Ok(change) = ChangeRecord::decode(&data) else {
5023                                    self.inner.replica_apply_metrics.record(
5024                                        crate::replication::logical::ApplyErrorKind::Decode,
5025                                    );
5026                                    self.persist_replication_health(
5027                                        "apply_error",
5028                                        "failed to decode logical WAL record",
5029                                        current_lsn,
5030                                        oldest_available_lsn,
5031                                    );
5032                                    continue;
5033                                };
5034                                match applier.apply(
5035                                    self.inner.db.as_ref(),
5036                                    &change,
5037                                    ApplyMode::Replica,
5038                                ) {
5039                                    Ok(crate::replication::logical::ApplyOutcome::Applied) => {
5040                                        self.invalidate_result_cache_for_table(&change.collection);
5041                                        since_lsn = since_lsn.max(change.lsn);
5042                                        self.persist_replica_lsn(since_lsn);
5043                                        batch_applied_lsn = Some(since_lsn);
5044                                    }
5045                                    Ok(_) => {
5046                                        // Idempotent / Skipped: no advance, no error.
5047                                    }
5048                                    Err(err) => {
5049                                        self.inner.replica_apply_metrics.record(err.kind());
5050                                        // Issue #205 — emit operator-grade event
5051                                        // for the two replication-fatal kinds. `Gap`
5052                                        // / `Apply` / `Decode` already persist via
5053                                        // `persist_replication_health`; the
5054                                        // OperatorEvent variants only cover the
5055                                        // two "stream is broken" / "follower
5056                                        // diverged" conditions an operator must act
5057                                        // on out-of-band.
5058                                        match &err {
5059                                            crate::replication::logical::LogicalApplyError::Divergence { lsn, expected: _, got: _, .. } => {
5060                                                crate::telemetry::operator_event::OperatorEvent::Divergence {
5061                                                    peer: "primary".to_string(),
5062                                                    leader_lsn: *lsn,
5063                                                    follower_lsn: since_lsn,
5064                                                }
5065                                                .emit_global();
5066                                            }
5067                                            crate::replication::logical::LogicalApplyError::Gap { last, next } => {
5068                                                crate::telemetry::operator_event::OperatorEvent::ReplicationBroken {
5069                                                    peer: "primary".to_string(),
5070                                                    reason: format!("stalled gap last={last} next={next}"),
5071                                                }
5072                                                .emit_global();
5073                                            }
5074                                            _ => {}
5075                                        }
5076                                        let kind = match &err {
5077                                            crate::replication::logical::LogicalApplyError::Gap { .. } => "stalled_gap",
5078                                            crate::replication::logical::LogicalApplyError::Divergence { .. } => "divergence",
5079                                            // Issue #835 — a stale-term record from a
5080                                            // returning ex-primary was fenced. The
5081                                            // replica stays put (no apply, no watermark
5082                                            // advance) until the legitimate primary's
5083                                            // current-term stream resumes.
5084                                            crate::replication::logical::LogicalApplyError::StaleTermFenced { .. } => "stale_term_fenced",
5085                                            _ => "apply_error",
5086                                        };
5087                                        self.persist_replication_health(
5088                                            kind,
5089                                            &format!("replica apply rejected: {err}"),
5090                                            current_lsn,
5091                                            oldest_available_lsn,
5092                                        );
5093                                        // Stop applying this batch. The
5094                                        // outer loop will retry on next
5095                                        // pull, which on a real Gap will
5096                                        // not magically heal — operator
5097                                        // must rebootstrap. For
5098                                        // Divergence, we explicitly do
5099                                        // not advance; this keeps the
5100                                        // replica visibly unhealthy
5101                                        // instead of silently swallowing
5102                                        // corruption.
5103                                        break;
5104                                    }
5105                                }
5106                            }
5107                            if let Some(applied_lsn) = batch_applied_lsn {
5108                                let apply_errors = self.replica_apply_error_counts();
5109                                let apply_errors_total =
5110                                    apply_errors.iter().map(|(_, count)| *count).sum::<u64>();
5111                                let divergence_total = apply_errors
5112                                    .iter()
5113                                    .find(|(kind, _)| {
5114                                        matches!(
5115                                            kind,
5116                                            crate::replication::logical::ApplyErrorKind::Divergence
5117                                        )
5118                                    })
5119                                    .map(|(_, count)| *count)
5120                                    .unwrap_or(0);
5121                                let ack_payload = crate::json!({
5122                                    "replica_id": replica_id.clone(),
5123                                    "applied_lsn": applied_lsn,
5124                                    "durable_lsn": applied_lsn,
5125                                    "apply_errors_total": apply_errors_total,
5126                                    "divergence_total": divergence_total
5127                                });
5128                                let ack_request = tonic::Request::new(JsonPayloadRequest {
5129                                    payload_json: crate::json::to_string(&ack_payload)
5130                                        .unwrap_or_else(|_| "{}".to_string()),
5131                                });
5132                                if client.ack_replica_lsn(ack_request).await.is_err() {
5133                                    ack_failed = true;
5134                                    self.persist_replication_health(
5135                                        "ack_error",
5136                                        "primary ack_replica_lsn request failed",
5137                                        current_lsn,
5138                                        oldest_available_lsn,
5139                                    );
5140                                }
5141                            }
5142                            if ack_failed {
5143                                std::thread::sleep(std::time::Duration::from_millis(poll_ms));
5144                                continue;
5145                            }
5146                        }
5147                        self.persist_replication_health(
5148                            "healthy",
5149                            "",
5150                            current_lsn,
5151                            oldest_available_lsn,
5152                        );
5153                    } else {
5154                        self.persist_replication_health(
5155                            "apply_error",
5156                            "failed to parse pull_wal_records response",
5157                            None,
5158                            None,
5159                        );
5160                    }
5161                } else {
5162                    self.persist_replication_health(
5163                        "connecting",
5164                        "primary pull_wal_records request failed",
5165                        None,
5166                        None,
5167                    );
5168                    std::thread::sleep(std::time::Duration::from_millis(poll_ms.max(250)));
5169                }
5170            }
5171        });
5172    }
5173
5174    /// Poll CDC events since a given LSN.
5175    pub fn cdc_poll(
5176        &self,
5177        since_lsn: u64,
5178        max_count: usize,
5179    ) -> Vec<crate::replication::cdc::ChangeEvent> {
5180        self.inner.cdc.poll(since_lsn, max_count)
5181    }
5182
5183    /// PLAN.md Phase 11.4 — current CDC LSN. Public mutation
5184    /// surfaces (HTTP query, gRPC entity ops) call this immediately
5185    /// after a successful write to feed `enforce_commit_policy`.
5186    pub fn cdc_current_lsn(&self) -> u64 {
5187        self.inner.cdc.current_lsn()
5188    }
5189
5190    pub fn kv_watch_events_since(
5191        &self,
5192        collection: &str,
5193        key: &str,
5194        since_lsn: u64,
5195        max_count: usize,
5196    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
5197        self.inner
5198            .cdc
5199            .poll(since_lsn, max_count)
5200            .into_iter()
5201            .filter_map(|event| event.kv)
5202            .filter(|event| event.collection == collection && event.key == key)
5203            .collect()
5204    }
5205
5206    pub fn kv_watch_events_since_prefix(
5207        &self,
5208        collection: &str,
5209        prefix: &str,
5210        since_lsn: u64,
5211        max_count: usize,
5212    ) -> Vec<crate::replication::cdc::KvWatchEvent> {
5213        self.inner
5214            .cdc
5215            .poll(since_lsn, max_count)
5216            .into_iter()
5217            .filter_map(|event| event.kv)
5218            .filter(|event| event.collection == collection && event.key.starts_with(prefix))
5219            .collect()
5220    }
5221
5222    pub(crate) fn kv_watch_subscribe<'a>(
5223        &'a self,
5224        collection: impl Into<String>,
5225        key: impl Into<String>,
5226        from_lsn: Option<u64>,
5227    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
5228        crate::runtime::kv_watch::KvWatchStream::subscribe(
5229            &self.inner.cdc,
5230            &self.inner.kv_stats,
5231            collection,
5232            key,
5233            from_lsn,
5234            self.kv_watch_idle_timeout_ms(),
5235        )
5236    }
5237
5238    pub(crate) fn kv_watch_subscribe_prefix<'a>(
5239        &'a self,
5240        collection: impl Into<String>,
5241        prefix: impl Into<String>,
5242        from_lsn: Option<u64>,
5243    ) -> crate::runtime::kv_watch::KvWatchStream<'a> {
5244        crate::runtime::kv_watch::KvWatchStream::subscribe_prefix(
5245            &self.inner.cdc,
5246            &self.inner.kv_stats,
5247            collection,
5248            prefix,
5249            from_lsn,
5250            self.kv_watch_idle_timeout_ms(),
5251        )
5252    }
5253
5254    pub(crate) fn kv_watch_idle_timeout_ms(&self) -> u64 {
5255        self.config_u64("red.config.kv.watch.idle_timeout_ms", 60_000)
5256    }
5257
5258    /// Get backup scheduler status.
5259    pub fn backup_status(&self) -> crate::replication::scheduler::BackupStatus {
5260        self.inner.backup_scheduler.status()
5261    }
5262
5263    /// Borrow the runtime's result Blob Cache.
5264    ///
5265    /// Wired for the `/admin/blob_cache/sweep` and
5266    /// `/admin/blob_cache/flush_namespace` HTTP handlers (issue #148
5267    /// follow-up): both delegate to
5268    /// `crate::storage::cache::sweeper::BlobCacheSweeper`, which takes a
5269    /// `&BlobCache`. Also used by `trigger_backup` when
5270    /// `red.config.backup.include_blob_cache=true` to locate the L2
5271    /// directory for archival.
5272    pub fn result_blob_cache(&self) -> &crate::storage::cache::BlobCache {
5273        &self.inner.result_blob_cache
5274    }
5275
5276    /// PLAN.md Phase 11.4 — owned snapshot of every registered
5277    /// replica's state on this primary. Returns empty vec on
5278    /// non-primary instances or when no replicas are registered yet.
5279    pub fn primary_replica_snapshots(&self) -> Vec<crate::replication::primary::ReplicaState> {
5280        self.inner
5281            .db
5282            .replication
5283            .as_ref()
5284            .map(|repl| repl.replica_snapshots())
5285            .unwrap_or_default()
5286    }
5287
5288    /// Issue #839 — the primary's current logical-WAL head LSN, used as
5289    /// the reference point for per-replica lag. `0` on non-primary
5290    /// instances or before the logical spool has any records.
5291    pub fn primary_logical_head_lsn(&self) -> u64 {
5292        self.inner
5293            .db
5294            .replication
5295            .as_ref()
5296            .map(|repl| repl.current_logical_lsn())
5297            .unwrap_or(0)
5298    }
5299
5300    /// Issue #839 — count of pulls that forced a full re-bootstrap since
5301    /// process start. The primary operator alert signal; always `0` on a
5302    /// non-primary instance.
5303    pub fn replication_full_resync_count(&self) -> u64 {
5304        self.inner
5305            .db
5306            .replication
5307            .as_ref()
5308            .map(|repl| repl.full_resync_count())
5309            .unwrap_or(0)
5310    }
5311
5312    /// Issue #839 — count of pulls served as a partial (incremental)
5313    /// resync since process start. Always `0` on a non-primary instance.
5314    pub fn replication_partial_resync_count(&self) -> u64 {
5315        self.inner
5316            .db
5317            .replication
5318            .as_ref()
5319            .map(|repl| repl.partial_resync_count())
5320            .unwrap_or(0)
5321    }
5322
5323    /// Issue #839 — this node's stable identity, surfaced as the leader
5324    /// identity in `/replication/status` when the node is the primary.
5325    /// Reuses the same persisted id a replica advertises to the primary,
5326    /// so a cluster has one stable name per node regardless of role.
5327    pub fn node_id(&self) -> String {
5328        self.resolve_replica_id()
5329    }
5330
5331    /// Issue #826 — re-evaluate write-admission flow control from the
5332    /// live primary replica registry and return the resulting throttle
5333    /// state. Computes the max lag across in-quorum replicas (async
5334    /// read-replicas excluded) against the primary's current LSN and
5335    /// engages/releases the `WriteGate` throttle accordingly.
5336    ///
5337    /// No-op (returns `false`) on non-primary instances or when flow
5338    /// control is disabled (soft target `0`). Cheap enough to call on
5339    /// the replica-ack path and from `/metrics` scrapes so the throttle
5340    /// tracks lag without a dedicated background loop.
5341    pub fn refresh_replication_flow_control(&self) -> bool {
5342        let flow = self.inner.write_gate.flow_control();
5343        if !flow.is_enabled() {
5344            return false;
5345        }
5346        let Some(repl) = self.inner.db.replication.as_ref() else {
5347            return false;
5348        };
5349        let primary_lsn = repl.current_logical_lsn();
5350        let replicas = repl.replica_snapshots();
5351        flow.observe(&replicas, primary_lsn)
5352    }
5353
5354    /// PLAN.md Phase 11.4 — active commit policy. Reads
5355    /// `RED_PRIMARY_COMMIT_POLICY` once at runtime construction;
5356    /// future env reloads will need a reload endpoint. Default is
5357    /// `Local` — current behavior, no replica blocking.
5358    pub fn commit_policy(&self) -> crate::replication::CommitPolicy {
5359        crate::replication::CommitPolicy::from_env()
5360    }
5361
5362    /// PLAN.md Phase 11.5 — accessor for replica-side apply error
5363    /// counters (gap / divergence / apply / decode / apply_miss). Returned
5364    /// snapshot is consistent across the counters; the labels match
5365    /// `reddb_replica_apply_errors_total{kind}`. Issue #814 adds the
5366    /// `apply_miss` kind for deletes against a missing target.
5367    pub fn replica_apply_error_counts(
5368        &self,
5369    ) -> [(crate::replication::logical::ApplyErrorKind, u64); 6] {
5370        self.inner.replica_apply_metrics.snapshot()
5371    }
5372
5373    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
5374    /// returned; `is_configured()` lets callers short-circuit.
5375    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
5376        &self.inner.quota_bucket
5377    }
5378
5379    /// PLAN.md Phase 11.4 — observability snapshot of every
5380    /// replica's durable LSN as known to the commit waiter. Empty
5381    /// vec on non-primary instances or when no replica has acked.
5382    pub fn commit_waiter_snapshot(&self) -> Vec<(String, u64)> {
5383        self.inner
5384            .db
5385            .replication
5386            .as_ref()
5387            .map(|repl| repl.commit_waiter.snapshot())
5388            .unwrap_or_default()
5389    }
5390
5391    /// PLAN.md Phase 11.4 — `(reached, timed_out, not_required, last_micros)`
5392    /// counters for /metrics. Always-zero on non-primary instances.
5393    pub fn commit_waiter_metrics_snapshot(&self) -> (u64, u64, u64, u64) {
5394        self.inner
5395            .db
5396            .replication
5397            .as_ref()
5398            .map(|repl| repl.commit_waiter.metrics_snapshot())
5399            .unwrap_or((0, 0, 0, 0))
5400    }
5401
5402    /// Named commit watermark: highest LSN durable on the active
5403    /// synchronous commit quorum. Returns 0 when the active policy does
5404    /// not require replica durability.
5405    pub fn commit_watermark(&self) -> u64 {
5406        match self.commit_policy() {
5407            crate::replication::CommitPolicy::AckN(n) if n > 0 => self
5408                .inner
5409                .db
5410                .replication
5411                .as_ref()
5412                .map(|repl| repl.commit_waiter.commit_watermark(n))
5413                .unwrap_or(0),
5414            crate::replication::CommitPolicy::Quorum => self
5415                .inner
5416                .db
5417                .quorum
5418                .as_ref()
5419                .map(|q| q.commit_watermark())
5420                .unwrap_or(0),
5421            _ => 0,
5422        }
5423    }
5424
5425    /// PLAN.md Phase 11.4 — block until at least `count` replicas
5426    /// have durably applied through `target_lsn`, or `timeout`
5427    /// elapses. Returns the `AwaitOutcome` so the caller can decide
5428    /// whether to surface a timeout error to the client or continue
5429    /// (the policy mapping lives in the commit dispatcher).
5430    ///
5431    /// Used by the `ack_n` commit policy once the operator flips
5432    /// `RED_PRIMARY_COMMIT_POLICY` away from `local`.
5433    pub fn await_replica_acks(
5434        &self,
5435        target_lsn: u64,
5436        count: u32,
5437        timeout: std::time::Duration,
5438    ) -> crate::replication::AwaitOutcome {
5439        match &self.inner.db.replication {
5440            Some(repl) => repl.commit_waiter.await_acks(target_lsn, count, timeout),
5441            None => {
5442                // No replication configured: policy must be `Local`.
5443                // Treat as immediate `NotRequired` so callers don't
5444                // block on a degenerate setup.
5445                crate::replication::AwaitOutcome::NotRequired
5446            }
5447        }
5448    }
5449
5450    /// PLAN.md Phase 11.4 — enforce the configured commit policy
5451    /// against `post_lsn` (the LSN of the just-completed write).
5452    /// Returns `Ok(AwaitOutcome)` on every successful enforcement
5453    /// (including `Reached` and `TimedOut` when fail-on-timeout is
5454    /// off). Returns `Err(ReadOnly)` only when a synchronous policy
5455    /// misses its threshold and `RED_COMMIT_FAIL_ON_TIMEOUT=true` is
5456    /// set.
5457    ///
5458    /// The HTTP / gRPC / wire surfaces map the error to 504 / wire
5459    /// backoff. Default behaviour (env unset) logs warn and returns
5460    /// success — matches PLAN.md "default v1 stays local" semantics
5461    /// while still letting the operator opt into hard-blocking.
5462    pub fn enforce_commit_policy(
5463        &self,
5464        post_lsn: u64,
5465    ) -> RedDBResult<crate::replication::AwaitOutcome> {
5466        let policy = self.commit_policy();
5467        if matches!(policy, crate::replication::CommitPolicy::Quorum) {
5468            return match self.inner.db.wait_for_replication_quorum(post_lsn) {
5469                Ok(()) => Ok(crate::replication::AwaitOutcome::Reached(0)),
5470                Err(err) => {
5471                    tracing::warn!(
5472                        target: "reddb::commit",
5473                        post_lsn,
5474                        error = %err,
5475                        "quorum: timed out waiting for commit watermark"
5476                    );
5477                    let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
5478                        .ok()
5479                        .map(|v| {
5480                            let t = v.trim();
5481                            t.eq_ignore_ascii_case("true")
5482                                || t == "1"
5483                                || t.eq_ignore_ascii_case("yes")
5484                        })
5485                        .unwrap_or(false);
5486                    if fail {
5487                        return Err(RedDBError::ReadOnly(format!(
5488                            "commit policy timed out at lsn {post_lsn}: {err} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
5489                        )));
5490                    }
5491                    Ok(crate::replication::AwaitOutcome::TimedOut {
5492                        observed: 0,
5493                        required: 1,
5494                    })
5495                }
5496            };
5497        }
5498
5499        let n = match policy {
5500            crate::replication::CommitPolicy::AckN(n) if n > 0 => n,
5501            _ => return Ok(crate::replication::AwaitOutcome::NotRequired),
5502        };
5503        let timeout_ms = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
5504            .ok()
5505            .and_then(|v| v.parse::<u64>().ok())
5506            .unwrap_or(5_000);
5507        let outcome =
5508            self.await_replica_acks(post_lsn, n, std::time::Duration::from_millis(timeout_ms));
5509        {
5510            use crate::runtime::control_events::{EventKind, Outcome, Sensitivity};
5511            let (event_outcome, fields) = match &outcome {
5512                crate::replication::AwaitOutcome::Reached(count) => (
5513                    Outcome::Allowed,
5514                    vec![
5515                        (
5516                            "post_lsn".to_string(),
5517                            Sensitivity::raw(post_lsn.to_string()),
5518                        ),
5519                        ("required".to_string(), Sensitivity::raw(n.to_string())),
5520                        ("observed".to_string(), Sensitivity::raw(count.to_string())),
5521                        (
5522                            "timeout_ms".to_string(),
5523                            Sensitivity::raw(timeout_ms.to_string()),
5524                        ),
5525                    ],
5526                ),
5527                crate::replication::AwaitOutcome::TimedOut { observed, required } => (
5528                    Outcome::Error,
5529                    vec![
5530                        (
5531                            "post_lsn".to_string(),
5532                            Sensitivity::raw(post_lsn.to_string()),
5533                        ),
5534                        (
5535                            "required".to_string(),
5536                            Sensitivity::raw(required.to_string()),
5537                        ),
5538                        (
5539                            "observed".to_string(),
5540                            Sensitivity::raw(observed.to_string()),
5541                        ),
5542                        (
5543                            "timeout_ms".to_string(),
5544                            Sensitivity::raw(timeout_ms.to_string()),
5545                        ),
5546                    ],
5547                ),
5548                crate::replication::AwaitOutcome::NotRequired => (Outcome::Allowed, Vec::new()),
5549            };
5550            if !fields.is_empty() {
5551                self.emit_control_event(
5552                    EventKind::ReplicationSafety,
5553                    event_outcome,
5554                    "replication_commit_policy",
5555                    Some(format!("replication:lsn:{post_lsn}")),
5556                    None,
5557                    fields,
5558                )?;
5559            }
5560        }
5561        if let crate::replication::AwaitOutcome::TimedOut { observed, required } = &outcome {
5562            tracing::warn!(
5563                target: "reddb::commit",
5564                post_lsn,
5565                observed = *observed,
5566                required = *required,
5567                timeout_ms,
5568                "ack_n: timed out waiting for replicas"
5569            );
5570            let fail = std::env::var("RED_COMMIT_FAIL_ON_TIMEOUT")
5571                .ok()
5572                .map(|v| {
5573                    let t = v.trim();
5574                    t.eq_ignore_ascii_case("true") || t == "1" || t.eq_ignore_ascii_case("yes")
5575                })
5576                .unwrap_or(false);
5577            if fail {
5578                return Err(RedDBError::ReadOnly(format!(
5579                    "commit policy timed out at lsn {post_lsn}: observed={observed} required={required} (RED_COMMIT_FAIL_ON_TIMEOUT=true)"
5580                )));
5581            }
5582        }
5583        Ok(outcome)
5584    }
5585
5586    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
5587    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
5588    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
5589    /// when the operator set the env but it doesn't parse, and
5590    /// `("disabled", None)` when no key is configured. The pager
5591    /// hookup is deferred — this accessor surfaces the operator's
5592    /// intent for /admin/status without yet using the key in writes.
5593    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
5594        match crate::crypto::page_encryption::key_from_env() {
5595            Ok(Some(_)) => ("enabled", None),
5596            Ok(None) => ("disabled", None),
5597            Err(err) => ("error", Some(err)),
5598        }
5599    }
5600
5601    /// PLAN.md Phase 11.5 — current replica apply health label
5602    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
5603    /// `stalled_gap`). Read from the persisted `red.replication.state`
5604    /// config key updated by the replica loop. Returns `None` on
5605    /// non-replica instances or when no apply has run yet.
5606    pub fn replica_apply_health(&self) -> Option<String> {
5607        let state = self.config_string("red.replication.state", "");
5608        if state.is_empty() {
5609            None
5610        } else {
5611            Some(state)
5612        }
5613    }
5614
5615    /// Current local LSN paired with the LSN of the most recently
5616    /// archived WAL segment. The difference is the replication /
5617    /// archive lag operators alert on (PLAN.md Phase 5.1). Returns
5618    /// `(0, 0)` when neither replication nor archiving is configured.
5619    pub fn wal_archive_progress(&self) -> (u64, u64) {
5620        let current_lsn = self
5621            .inner
5622            .db
5623            .replication
5624            .as_ref()
5625            .map(|repl| {
5626                repl.logical_wal_spool
5627                    .as_ref()
5628                    .map(|spool| spool.current_lsn())
5629                    .unwrap_or_else(|| repl.wal_buffer.current_lsn())
5630            })
5631            .unwrap_or_else(|| self.inner.cdc.current_lsn());
5632        let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
5633        (current_lsn, last_archived_lsn)
5634    }
5635
5636    /// Trigger an immediate backup.
5637    pub fn trigger_backup(&self) -> RedDBResult<crate::replication::scheduler::BackupResult> {
5638        let result = (|| {
5639            self.check_write(crate::runtime::write_gate::WriteKind::Backup)?;
5640            // Defense in depth — check_write above already rejects when
5641            // the lease is NotHeld, but log + audit the lease angle here
5642            // explicitly so dashboards distinguish "lease lost" from a
5643            // generic read-only refusal.
5644            self.assert_remote_write_allowed("admin/backup")?;
5645            let started = std::time::Instant::now();
5646            let snapshot = self.create_snapshot()?;
5647            let mut uploaded = false;
5648
5649            if let (Some(backend), Some(path)) =
5650                (&self.inner.db.remote_backend, self.inner.db.path())
5651            {
5652                let default_snapshot_prefix = self.inner.db.options().default_snapshot_prefix();
5653                let default_wal_prefix = self.inner.db.options().default_wal_archive_prefix();
5654                let default_head_key = self.inner.db.options().default_backup_head_key();
5655                let snapshot_prefix = self.config_string(
5656                    "red.config.backup.snapshot_prefix",
5657                    &default_snapshot_prefix,
5658                );
5659                let wal_prefix =
5660                    self.config_string("red.config.wal.archive.prefix", &default_wal_prefix);
5661                let head_key = self.config_string("red.config.backup.head_key", &default_head_key);
5662                let timeline_id = self.config_string("red.config.timeline.id", "main");
5663                let snapshot_key = crate::storage::wal::archive_snapshot(
5664                    backend.as_ref(),
5665                    path,
5666                    snapshot.snapshot_id,
5667                    &snapshot_prefix,
5668                )
5669                .map_err(|err| RedDBError::Internal(err.to_string()))?;
5670                let current_lsn = self
5671                    .inner
5672                    .db
5673                    .replication
5674                    .as_ref()
5675                    .map(|repl| {
5676                        repl.logical_wal_spool
5677                            .as_ref()
5678                            .map(|spool| spool.current_lsn())
5679                            .unwrap_or_else(|| repl.wal_buffer.current_lsn())
5680                    })
5681                    .unwrap_or_else(|| self.inner.cdc.current_lsn());
5682                let last_archived_lsn = self.config_u64("red.config.timeline.last_archived_lsn", 0);
5683                // Hash the local snapshot bytes so the manifest can carry
5684                // the digest for restore-side verification (PLAN.md
5685                // Phase 4). Failure to hash is non-fatal — we still
5686                // publish the manifest, just without a checksum, so a
5687                // future fix can backfill rather than losing the backup.
5688                let snapshot_sha256 =
5689                    crate::storage::wal::SnapshotManifest::compute_snapshot_sha256(path)
5690                        .map_err(|err| {
5691                            tracing::warn!(
5692                                target: "reddb::backup",
5693                                error = %err,
5694                                snapshot_id = snapshot.snapshot_id,
5695                                "snapshot hash failed; manifest will lack checksum"
5696                            );
5697                        })
5698                        .ok();
5699                let manifest = crate::storage::wal::SnapshotManifest {
5700                    timeline_id: timeline_id.clone(),
5701                    snapshot_key: snapshot_key.clone(),
5702                    snapshot_id: snapshot.snapshot_id,
5703                    snapshot_time: snapshot.created_at_unix_ms as u64,
5704                    base_lsn: current_lsn,
5705                    schema_version: crate::api::REDDB_FORMAT_VERSION,
5706                    format_version: crate::api::REDDB_FORMAT_VERSION,
5707                    snapshot_sha256,
5708                };
5709                crate::storage::wal::publish_snapshot_manifest(backend.as_ref(), &manifest)
5710                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
5711
5712                // PLAN.md Phase 11.3 — read the head of the WAL hash chain
5713                // so the new segment can link back. `None` means we're
5714                // starting a fresh timeline (after a clean restore or on
5715                // first archive ever); the segment's `prev_hash` will be
5716                // `None` and restore-side validation accepts that only for
5717                // the first segment in `plan.wal_segments`.
5718                let prev_segment_hash =
5719                    self.config_string("red.config.timeline.last_segment_hash", "");
5720                let prev_hash_arg = if prev_segment_hash.is_empty() {
5721                    None
5722                } else {
5723                    Some(prev_segment_hash)
5724                };
5725
5726                let archived_lsn = if let Some(primary) = &self.inner.db.replication {
5727                    let oldest = primary
5728                        .logical_wal_spool
5729                        .as_ref()
5730                        .and_then(|spool| spool.oldest_lsn().ok().flatten())
5731                        .or_else(|| primary.wal_buffer.oldest_lsn())
5732                        .unwrap_or(last_archived_lsn);
5733                    if last_archived_lsn > 0 && last_archived_lsn < oldest.saturating_sub(1) {
5734                        return Err(RedDBError::Internal(format!(
5735                        "logical WAL gap detected: last_archived_lsn={last_archived_lsn}, oldest_available_lsn={oldest}"
5736                    )));
5737                    }
5738                    let records = if let Some(spool) = &primary.logical_wal_spool {
5739                        spool
5740                            .read_since(last_archived_lsn, usize::MAX)
5741                            .map_err(|err| RedDBError::Internal(err.to_string()))?
5742                    } else {
5743                        primary.wal_buffer.read_since(last_archived_lsn, usize::MAX)
5744                    };
5745                    if let Some(meta) = crate::storage::wal::archive_change_records(
5746                        backend.as_ref(),
5747                        &wal_prefix,
5748                        &records,
5749                        prev_hash_arg,
5750                    )
5751                    .map_err(|err| RedDBError::Internal(err.to_string()))?
5752                    {
5753                        let _ = primary.prune_retained_wal_through(meta.lsn_end);
5754                        // Advance the chain head so the next archive call
5755                        // links to this segment's hash. If the segment has
5756                        // no sha256 (legacy / hashing failed) we leave the
5757                        // head as-is — the next segment then carries the
5758                        // prior chain head, preserving continuity.
5759                        if let Some(sha) = &meta.sha256 {
5760                            self.inner.db.store().set_config_tree(
5761                                "red.config.timeline",
5762                                &crate::json!({ "last_segment_hash": sha }),
5763                            );
5764                        }
5765                        meta.lsn_end
5766                    } else {
5767                        last_archived_lsn
5768                    }
5769                } else {
5770                    last_archived_lsn
5771                };
5772
5773                let head = crate::storage::wal::BackupHead {
5774                    timeline_id,
5775                    snapshot_key,
5776                    snapshot_id: snapshot.snapshot_id,
5777                    snapshot_time: snapshot.created_at_unix_ms as u64,
5778                    current_lsn,
5779                    last_archived_lsn: archived_lsn,
5780                    wal_prefix,
5781                };
5782                crate::storage::wal::publish_backup_head(backend.as_ref(), &head_key, &head)
5783                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
5784                self.inner.db.store().set_config_tree(
5785                    "red.config.timeline",
5786                    &crate::json!({
5787                        "last_archived_lsn": archived_lsn,
5788                        "id": head.timeline_id
5789                    }),
5790                );
5791
5792                // PLAN.md Phase 2.4 — refresh the unified `MANIFEST.json`
5793                // at the prefix root so external tooling sees a single
5794                // catalog of every snapshot + WAL segment with their
5795                // checksums. Best-effort: a manifest publish failure
5796                // doesn't fail the backup (the per-artifact sidecars
5797                // already give restore-side integrity), but it does log
5798                // so dashboards can flag stale catalogs.
5799                if let Err(err) = crate::storage::wal::publish_unified_manifest_for_prefix(
5800                    backend.as_ref(),
5801                    &snapshot_prefix,
5802                ) {
5803                    tracing::warn!(
5804                        target: "reddb::backup",
5805                        error = %err,
5806                        snapshot_prefix = %snapshot_prefix,
5807                        "unified MANIFEST.json refresh failed; per-artifact sidecars unaffected"
5808                    );
5809                }
5810
5811                // PLAN.md Phase 11.4 — when the operator picked a
5812                // commit policy that demands replica durability, block
5813                // until the configured count of replicas has acked the
5814                // archived LSN (or the timeout fires). For backup the
5815                // policy decides the *DR posture* — `local` returns
5816                // immediately, `ack_n` ensures at least N replicas saw
5817                // the new tail before we report success to the
5818                // operator. A `TimedOut` is logged but does NOT fail
5819                // the backup: the local WAL + remote upload are durable
5820                // regardless; the missing acks are reported via
5821                // /metrics and /admin/status so the operator can decide.
5822                match self.commit_policy() {
5823                    crate::replication::CommitPolicy::AckN(n) if n > 0 => {
5824                        let timeout = std::env::var("RED_REPLICATION_ACK_TIMEOUT_MS")
5825                            .ok()
5826                            .and_then(|v| v.parse::<u64>().ok())
5827                            .unwrap_or(5_000);
5828                        let outcome = self.await_replica_acks(
5829                            archived_lsn,
5830                            n,
5831                            std::time::Duration::from_millis(timeout),
5832                        );
5833                        match outcome {
5834                            crate::replication::AwaitOutcome::Reached(count) => {
5835                                tracing::debug!(
5836                                    target: "reddb::backup",
5837                                    archived_lsn,
5838                                    n,
5839                                    count,
5840                                    "ack_n: replicas synced before backup return"
5841                                );
5842                            }
5843                            crate::replication::AwaitOutcome::TimedOut { observed, required } => {
5844                                tracing::warn!(
5845                                    target: "reddb::backup",
5846                                    archived_lsn,
5847                                    observed,
5848                                    required,
5849                                    timeout_ms = timeout,
5850                                    "ack_n: timed out waiting for replicas; backup uploaded but DR posture degraded"
5851                                );
5852                            }
5853                            crate::replication::AwaitOutcome::NotRequired => {}
5854                        }
5855                    }
5856                    _ => {} // Local / RemoteWal / Quorum: no blocking yet
5857                }
5858
5859                // Issue #148 follow-up — opt-in archive of the L2 Blob Cache
5860                // directory tree. Default off so a standard backup stays
5861                // small; flip via `red.config.backup.include_blob_cache=true`
5862                // when warm-cache restore is required (per
5863                // docs/operations/blob-cache-backup-restore.md §1).
5864                //
5865                // The L2 tree is *derived* state (ADR 0006) — its absence
5866                // never causes data loss; it only affects post-restore
5867                // p99 latency until the cache re-warms. We therefore log
5868                // (not fail) on per-file upload errors so a partial L2
5869                // upload never aborts a healthy snapshot+WAL backup.
5870                if self.config_bool("red.config.backup.include_blob_cache", false) {
5871                    let blob_cache_prefix = self.config_string(
5872                        "red.config.backup.blob_cache_prefix",
5873                        &format!("{snapshot_prefix}blob_cache/"),
5874                    );
5875                    if let Some(l2_path) = self.inner.result_blob_cache.l2_path() {
5876                        match crate::storage::cache::archive_blob_cache_l2(
5877                            backend.as_ref(),
5878                            l2_path,
5879                            &blob_cache_prefix,
5880                        ) {
5881                            Ok(count) => {
5882                                tracing::info!(
5883                                    target: "reddb::backup",
5884                                    files_uploaded = count,
5885                                    blob_cache_prefix = %blob_cache_prefix,
5886                                    "include_blob_cache: archived L2 directory"
5887                                );
5888                            }
5889                            Err(err) => {
5890                                tracing::warn!(
5891                                    target: "reddb::backup",
5892                                    error = %err,
5893                                    blob_cache_prefix = %blob_cache_prefix,
5894                                    "include_blob_cache: L2 archive failed; backup proceeding (cache is derived state)"
5895                                );
5896                            }
5897                        }
5898                    } else {
5899                        tracing::debug!(
5900                            target: "reddb::backup",
5901                            "include_blob_cache=true but no L2 path configured; nothing to archive"
5902                        );
5903                    }
5904                }
5905
5906                uploaded = true;
5907            }
5908
5909            Ok(crate::replication::scheduler::BackupResult {
5910                snapshot_id: snapshot.snapshot_id,
5911                uploaded,
5912                duration_ms: started.elapsed().as_millis() as u64,
5913                timestamp: snapshot.created_at_unix_ms as u64,
5914            })
5915        })();
5916
5917        use crate::runtime::control_events::{EventKind, Outcome, Sensitivity};
5918        let (current_lsn, last_archived_lsn) = self.wal_archive_progress();
5919        let mut fields = vec![
5920            (
5921                "current_lsn".to_string(),
5922                Sensitivity::raw(current_lsn.to_string()),
5923            ),
5924            (
5925                "last_archived_lsn".to_string(),
5926                Sensitivity::raw(last_archived_lsn.to_string()),
5927            ),
5928        ];
5929        if let Ok(backup) = &result {
5930            fields.push((
5931                "snapshot_id".to_string(),
5932                Sensitivity::raw(backup.snapshot_id.to_string()),
5933            ));
5934            fields.push((
5935                "uploaded".to_string(),
5936                Sensitivity::raw(backup.uploaded.to_string()),
5937            ));
5938            fields.push((
5939                "duration_ms".to_string(),
5940                Sensitivity::raw(backup.duration_ms.to_string()),
5941            ));
5942            fields.push((
5943                "snapshot_time".to_string(),
5944                Sensitivity::raw(backup.timestamp.to_string()),
5945            ));
5946        }
5947        let outcome = match &result {
5948            Ok(_) => Outcome::Allowed,
5949            Err(err) => control_event_outcome_for_error(err),
5950        };
5951        let reason = result.as_ref().err().map(|err| err.to_string());
5952        self.emit_control_event(
5953            EventKind::BackupRun,
5954            outcome,
5955            "backup_trigger",
5956            Some("backup:trigger".to_string()),
5957            reason,
5958            fields,
5959        )?;
5960        result
5961    }
5962
5963    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
5964        let mut pool = self
5965            .inner
5966            .pool
5967            .lock()
5968            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
5969        if pool.active >= self.inner.pool_config.max_connections {
5970            return Err(RedDBError::Internal(
5971                "connection pool exhausted".to_string(),
5972            ));
5973        }
5974
5975        let id = if let Some(id) = pool.idle.pop() {
5976            id
5977        } else {
5978            let id = pool.next_id;
5979            pool.next_id += 1;
5980            id
5981        };
5982        pool.active += 1;
5983        pool.total_checkouts += 1;
5984        drop(pool);
5985
5986        Ok(RuntimeConnection {
5987            id,
5988            inner: Arc::clone(&self.inner),
5989        })
5990    }
5991
5992    pub fn checkpoint(&self) -> RedDBResult<()> {
5993        // Local fsync always allowed — losing the lease shouldn't
5994        // prevent us from durably persisting what's already in memory.
5995        // The remote upload is the side-effect that risks clobbering a
5996        // peer's state, so it's behind the lease gate.
5997        self.inner.db.flush_local_only().map_err(|err| {
5998            // Issue #205 — local flush failure is a CheckpointFailed
5999            // operator-grade event. The local-flush path also covers
6000            // the WAL fsync we depend on, so a failure here doubles as
6001            // the WalFsyncFailed signal for the runtime entry point.
6002            let msg = err.to_string();
6003            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
6004                lsn: 0,
6005                error: msg.clone(),
6006            }
6007            .emit_global();
6008            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
6009                path: "<flush_local_only>".to_string(),
6010                error: msg.clone(),
6011            }
6012            .emit_global();
6013            RedDBError::Engine(msg)
6014        })?;
6015        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
6016            tracing::warn!(
6017                target: "reddb::serverless::lease",
6018                error = %err,
6019                "checkpoint: skipping remote upload — lease not held"
6020            );
6021            return Ok(());
6022        }
6023        self.inner
6024            .db
6025            .upload_to_remote_backend()
6026            .map_err(|err| RedDBError::Engine(err.to_string()))
6027    }
6028
6029    /// Guard remote-mutating operations on the writer lease.
6030    /// Returns `Ok(())` when no remote backend is configured (the
6031    /// lease is irrelevant) or the lease state is `NotRequired` /
6032    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
6033    /// `NotHeld`, with an audit-friendly action label so the caller
6034    /// can record the rejection.
6035    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
6036        if self.inner.db.remote_backend.is_none() {
6037            return Ok(());
6038        }
6039        match self.inner.write_gate.lease_state() {
6040            crate::runtime::write_gate::LeaseGateState::NotHeld => {
6041                self.inner.audit_log.record(
6042                    action,
6043                    "system",
6044                    "remote_backend",
6045                    "err: writer lease not held",
6046                    crate::json::Value::Null,
6047                );
6048                Err(RedDBError::ReadOnly(format!(
6049                    "writer lease not held — {action} blocked (serverless fence)"
6050                )))
6051            }
6052            _ => Ok(()),
6053        }
6054    }
6055
6056    pub fn run_maintenance(&self) -> RedDBResult<()> {
6057        self.inner
6058            .db
6059            .run_maintenance()
6060            .map_err(|err| RedDBError::Internal(err.to_string()))
6061    }
6062
6063    pub fn scan_collection(
6064        &self,
6065        collection: &str,
6066        cursor: Option<ScanCursor>,
6067        limit: usize,
6068    ) -> RedDBResult<ScanPage> {
6069        let store = self.inner.db.store();
6070        let manager = store
6071            .get_collection(collection)
6072            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
6073
6074        let mut entities = manager.query_all(|_| true);
6075        entities.sort_by_key(|entity| entity.id.raw());
6076
6077        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
6078        let total = entities.len();
6079        let end = total.min(offset.saturating_add(limit.max(1)));
6080        let items = if offset >= total {
6081            Vec::new()
6082        } else {
6083            entities[offset..end].to_vec()
6084        };
6085        let next = (end < total).then_some(ScanCursor { offset: end });
6086
6087        Ok(ScanPage {
6088            collection: collection.to_string(),
6089            items,
6090            next,
6091            total,
6092        })
6093    }
6094
6095    pub fn catalog(&self) -> CatalogModelSnapshot {
6096        self.inner.db.catalog_model_snapshot()
6097    }
6098
6099    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
6100        self.inner.db.catalog_consistency_report()
6101    }
6102
6103    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
6104        crate::catalog::attention_summary(&self.catalog())
6105    }
6106
6107    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
6108        crate::catalog::collection_attention(&self.catalog())
6109    }
6110
6111    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
6112        crate::catalog::index_attention(&self.catalog())
6113    }
6114
6115    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
6116        crate::catalog::graph_projection_attention(&self.catalog())
6117    }
6118
6119    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
6120        crate::catalog::analytics_job_attention(&self.catalog())
6121    }
6122
6123    pub fn stats(&self) -> RuntimeStats {
6124        let pool = runtime_pool_lock(self);
6125        RuntimeStats {
6126            active_connections: pool.active,
6127            idle_connections: pool.idle.len(),
6128            total_checkouts: pool.total_checkouts,
6129            paged_mode: self.inner.db.is_paged(),
6130            started_at_unix_ms: self.inner.started_at_unix_ms,
6131            store: self.inner.db.stats(),
6132            system: SystemInfo::collect(),
6133            result_blob_cache: self.inner.result_blob_cache.stats(),
6134            kv: self.inner.kv_stats.snapshot(),
6135            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
6136        }
6137    }
6138
6139    pub(crate) fn record_metrics_ingest(
6140        &self,
6141        accepted_samples: u64,
6142        accepted_series: u64,
6143        rejected_samples: u64,
6144        rejected_series: u64,
6145    ) {
6146        self.inner.metrics_ingest_stats.record(
6147            accepted_samples,
6148            accepted_series,
6149            rejected_samples,
6150            rejected_series,
6151        );
6152    }
6153
6154    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
6155        self.inner
6156            .metrics_ingest_stats
6157            .record_cardinality_budget_rejections(rejected_series);
6158    }
6159
6160    pub(crate) fn record_metrics_tenant_activity(
6161        &self,
6162        tenant: &str,
6163        namespace: &str,
6164        operation: &str,
6165    ) {
6166        self.inner
6167            .metrics_tenant_activity_stats
6168            .record(tenant, namespace, operation);
6169    }
6170
6171    pub(crate) fn metrics_tenant_activity_snapshot(
6172        &self,
6173    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
6174        self.inner.metrics_tenant_activity_stats.snapshot()
6175    }
6176
6177    /// Execute a query under a typed scope override without embedding
6178    /// the tenant / user / role values into the SQL string. Use this
6179    /// from transport middleware (HTTP / gRPC / worker loops) where the
6180    /// scope is resolved from auth claims and the SQL is a parameterised
6181    /// template — avoids the string-concat injection risk of building
6182    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
6183    /// prepared statements that didn't know about tenancy.
6184    ///
6185    /// Precedence matches the `WITHIN` clause: the passed `scope`
6186    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
6187    /// The override is pushed on the thread-local scope stack for the
6188    /// duration of the call and popped on return — pool-shared
6189    /// connections cannot leak it across requests.
6190    pub fn execute_query_with_scope(
6191        &self,
6192        query: &str,
6193        scope: crate::runtime::within_clause::ScopeOverride,
6194    ) -> RedDBResult<RuntimeQueryResult> {
6195        if scope.is_empty() {
6196            return self.execute_query(query);
6197        }
6198        let _scope_guard = ScopeOverrideGuard::install(scope);
6199        self.execute_query(query)
6200    }
6201
6202    /// Issue #205 — single lifecycle exit for slow-query logging.
6203    ///
6204    /// `execute_query_inner` does the real work; this wrapper times it
6205    /// and, if elapsed exceeds the configured threshold, hands the
6206    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
6207    /// SlowQueryLogger. The threshold + sample_pct were captured at
6208    /// SlowQueryLogger construction (runtime startup), so the per-call
6209    /// cost on below-threshold paths is one relaxed atomic load.
6210    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
6211        let started = std::time::Instant::now();
6212        let mut result = self.execute_query_inner(query);
6213        // Issue #765 / S6 — filter integrity-tombstoned rows out of SELECT
6214        // results before they reach any consumer. Fast no-op (one relaxed
6215        // atomic load) unless an input-stream digest mismatch has tombstoned
6216        // a RID range on this store.
6217        if let Ok(ref mut query_result) = result {
6218            if query_result.statement_type == "select" {
6219                self.filter_integrity_tombstoned(&mut query_result.result);
6220            }
6221        }
6222        let elapsed_ms = started.elapsed().as_millis() as u64;
6223
6224        // Build EffectiveScope from the same thread-locals frame-build
6225        // consults — keeps the slow-log row consistent with the audit /
6226        // RLS view of "this statement". `ai_scope()` is the canonical
6227        // builder.
6228        let scope = self.ai_scope();
6229        let kind = match result
6230            .as_ref()
6231            .map(|r| r.statement_type)
6232            .unwrap_or("select")
6233        {
6234            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
6235            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
6236            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
6237            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
6238            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
6239        };
6240        // SQL redaction: pass the raw query through. The slow-query
6241        // logger writes structured JSON so embedded literals stay
6242        // escape-safe at the JSON boundary (proven by
6243        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
6244        // PII redaction (e.g. literal masking) is a follow-up.
6245        self.inner
6246            .slow_query_logger
6247            .record(kind, elapsed_ms, query.to_string(), &scope);
6248
6249        if let Ok(ref mut query_result) = result {
6250            if matches!(query_result.statement_type, "insert" | "update" | "delete") {
6251                let bookmark = crate::replication::CausalBookmark::new(
6252                    self.current_replication_term(),
6253                    self.cdc_current_lsn(),
6254                );
6255                query_result.bookmark = Some(bookmark.encode());
6256            }
6257        }
6258
6259        result
6260    }
6261
6262    pub fn causal_session(&self) -> crate::runtime::CausalSession {
6263        crate::runtime::CausalSession {
6264            runtime: self.clone(),
6265            bookmark: None,
6266            wait_timeout: std::time::Duration::from_secs(5),
6267        }
6268    }
6269
6270    pub fn wait_for_bookmark(
6271        &self,
6272        bookmark: &crate::replication::CausalBookmark,
6273        timeout: std::time::Duration,
6274    ) -> RedDBResult<()> {
6275        let deadline = std::time::Instant::now() + timeout;
6276        loop {
6277            let applied_lsn = self.local_contiguous_applied_lsn();
6278            if applied_lsn >= bookmark.commit_lsn() {
6279                return Ok(());
6280            }
6281            let now = std::time::Instant::now();
6282            if now >= deadline {
6283                return Err(RedDBError::InvalidOperation(format!(
6284                    "timed out waiting for causal bookmark lsn {}; applied={}",
6285                    bookmark.commit_lsn(),
6286                    applied_lsn
6287                )));
6288            }
6289            let remaining = deadline.saturating_duration_since(now);
6290            std::thread::sleep(remaining.min(std::time::Duration::from_millis(5)));
6291        }
6292    }
6293
6294    fn local_contiguous_applied_lsn(&self) -> u64 {
6295        match self.inner.db.options().replication.role {
6296            crate::replication::ReplicationRole::Replica { .. } => {
6297                self.config_u64("red.replication.last_applied_lsn", 0)
6298            }
6299            _ => self.cdc_current_lsn(),
6300        }
6301    }
6302
6303    #[inline(never)]
6304    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
6305        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
6306        //
6307        // Moved above every boot-cost the normal path pays (WITHIN
6308        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
6309        // guard, tracing span, tx_contexts read) because the bench's
6310        // `select_point` scenario was observed at 28× vs PostgreSQL —
6311        // the dominant cost wasn't the entity fetch but the ceremony
6312        // before it. Only fires when there's no ambient transaction
6313        // context or WITHIN override, so the snapshot install we skip
6314        // truly is a no-op for this query.
6315        if !has_scope_override_active()
6316            && !query.trim_start().starts_with("WITHIN")
6317            && !query.trim_start().starts_with("within")
6318            && !self.inner.query_audit.has_rules()
6319            && !self
6320                .inner
6321                .tx_contexts
6322                .read()
6323                .contains_key(&current_connection_id())
6324        {
6325            if let Some(result) = self.try_fast_entity_lookup(query) {
6326                return result;
6327            }
6328        }
6329
6330        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
6331        // strip the prefix, push a stack-scoped override, recurse on
6332        // the inner statement, pop on return. Stack lives in a
6333        // thread-local but is balanced by the RAII guard, so a
6334        // pool-shared connection cannot leak the override across
6335        // requests and an early `?` return still pops cleanly.
6336        match crate::runtime::within_clause::try_strip_within_prefix(query) {
6337            Ok(Some((scope, inner))) => {
6338                let _scope_guard = ScopeOverrideGuard::install(scope);
6339                // Re-enter the inner path, NOT `execute_query`, so the
6340                // slow-query lifecycle hook records exactly one row per
6341                // top-level statement (the WITHIN-stripped form would
6342                // double-record).
6343                return self.execute_query_inner(inner);
6344            }
6345            Ok(None) => {}
6346            Err(msg) => return Err(RedDBError::Query(msg)),
6347        }
6348
6349        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
6350        // inner statement (WITHOUT executing it) and returns the
6351        // CanonicalLogicalNode tree as rows so the caller can see the
6352        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
6353        // is a distinct schema-diff command and continues down the
6354        // regular SQL path.
6355        if let Some(inner) = strip_explain_prefix(query) {
6356            return self.explain_as_rows(query, inner);
6357        }
6358
6359        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
6360        // override and return. Outside a transaction the statement is
6361        // an error (matches PG semantics: SET LOCAL only takes effect
6362        // within an active transaction).
6363        if let Some(value) = parse_set_local_tenant(query)? {
6364            let conn_id = current_connection_id();
6365            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
6366                return Err(RedDBError::Query(
6367                    "SET LOCAL TENANT requires an active transaction".to_string(),
6368                ));
6369            }
6370            self.inner
6371                .tx_local_tenants
6372                .write()
6373                .insert(conn_id, value.clone());
6374            return Ok(RuntimeQueryResult::ok_message(
6375                query.to_string(),
6376                &match &value {
6377                    Some(id) => format!("local tenant set: {id}"),
6378                    None => "local tenant cleared".to_string(),
6379                },
6380                "set_local_tenant",
6381            ));
6382        }
6383
6384        if super::red_schema::is_system_schema_write(query) {
6385            return Err(RedDBError::Query(
6386                super::red_schema::READ_ONLY_ERROR.to_string(),
6387            ));
6388        }
6389
6390        if let Some(create_source) = super::analytics_source_catalog::parse_create_statement(query)?
6391        {
6392            return self.execute_create_analytics_source(query, create_source);
6393        }
6394
6395        // Issue #790 — `READ METRIC <path>` is intentionally rejected at
6396        // v0. The descriptor itself is readable through
6397        // `red.analytics.metrics`; the *output* read returns a
6398        // structured error so callers can tell "execution engine not yet
6399        // built" apart from "metric does not exist".
6400        if let Some(path) = super::metric_descriptor_catalog::parse_read_metric_statement(query) {
6401            return Err(super::metric_descriptor_catalog::read_output_unsupported(
6402                &path,
6403            ));
6404        }
6405
6406        // Issue #918 / ADR 0035 — leaderboard rank capability. These are
6407        // narrow string intercepts (the `READ METRIC` precedent) so the
6408        // surface stays off the recursive-descent grammar. `RANK() OVER`
6409        // window projections are unaffected — they parse `SELECT … RANK()`
6410        // and never match the `RANK OF`/`CREATE RANKING`/`SHOW RANKINGS`
6411        // statement heads.
6412        if let Some(parsed) = super::ranking_descriptor_catalog::parse_create_ranking(query) {
6413            return self.execute_create_ranking(query, parsed?);
6414        }
6415        if super::ranking_descriptor_catalog::parse_show_rankings(query) {
6416            return self.execute_show_rankings(query);
6417        }
6418        if let Some(parsed) = super::ranking_descriptor_catalog::parse_rank_of(query) {
6419            return self.execute_rank_of(query, parsed?);
6420        }
6421
6422        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
6423        let execution_query = rewritten_query.as_deref().unwrap_or(query);
6424
6425        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
6426        let _frame_guards = frame.install(self);
6427
6428        // Phase 6 logging: enter a span stamped with conn_id / tenant
6429        // / query_len. Every downstream tracing::info!/warn!/error!
6430        // inherits these fields — no need to thread them manually
6431        // through storage/scan layers. Entered AFTER the WITHIN /
6432        // SET LOCAL TENANT resolution above so the span reflects the
6433        // effective scope for this statement.
6434        let _log_span = crate::telemetry::span::query_span(query).entered();
6435
6436        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
6437        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
6438            return self.execute_query_expr(rewritten);
6439        }
6440
6441        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
6442        if !self.inner.query_audit.has_rules() {
6443            if let Some(result) = self.try_fast_entity_lookup(execution_query) {
6444                return result;
6445            }
6446        }
6447
6448        // ── Result cache: return cached result if still fresh (30s TTL) ──
6449        if !self.inner.query_audit.has_rules() {
6450            if let Some(result) = frame.read_result_cache(self) {
6451                return Ok(result);
6452            }
6453        }
6454
6455        let prepared = frame.prepare_statement(self, execution_query)?;
6456        let mode = prepared.mode;
6457        let expr = prepared.expr;
6458
6459        let statement = query_expr_name(&expr);
6460        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
6461        let control_event_specs = query_control_event_specs(&expr);
6462        let query_audit_plan = query_audit_plan(&expr);
6463
6464        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
6465            Ok(guard) => guard,
6466            Err(err) => {
6467                let outcome = control_event_outcome_for_error(&err);
6468                for spec in &control_event_specs {
6469                    self.emit_control_event(
6470                        spec.kind,
6471                        outcome,
6472                        spec.action,
6473                        spec.resource.clone(),
6474                        Some(err.to_string()),
6475                        spec.fields.clone(),
6476                    )?;
6477                }
6478                return Err(err);
6479            }
6480        };
6481        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
6482        let query_audit_started = std::time::Instant::now();
6483
6484        let query_result = match expr {
6485            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
6486                // Apply MVCC visibility + RLS gate while materialising the
6487                // graph: every node entity is screened against the source
6488                // collection's policy chain (basic and `Nodes`-targeted)
6489                // and dropped when the caller's tenant / role doesn't
6490                // admit it. Edges are pruned automatically because the
6491                // graph builder skips edges whose endpoints aren't in
6492                // `allowed_nodes`.
6493                let (graph, node_properties, edge_properties) =
6494                    self.materialize_graph_with_rls()?;
6495                let result =
6496                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
6497                        &graph,
6498                        &expr,
6499                        node_properties,
6500                        edge_properties,
6501                    )
6502                        .map_err(|err| RedDBError::Query(err.to_string()))?;
6503
6504                Ok(RuntimeQueryResult {
6505                    query: query.to_string(),
6506                    mode,
6507                    statement,
6508                    engine: "materialized-graph",
6509                    result,
6510                    affected_rows: 0,
6511                    statement_type: "select",
6512                    bookmark: None,
6513                })
6514            }
6515            QueryExpr::Table(table) => {
6516                let table = self.resolve_table_expr_subqueries(
6517                    table,
6518                    &frame as &dyn super::statement_frame::ReadFrame,
6519                )?;
6520                // Table-valued functions (e.g. components(g)) dispatch to a
6521                // read-only executor before any catalog/virtual-table routing
6522                // (issue #795).
6523                if let Some(TableSource::Function {
6524                    name,
6525                    args,
6526                    named_args,
6527                }) = table.source.clone()
6528                {
6529                    // The graph-collection form is cacheable (issue #802): the
6530                    // result-cache read at the top of this function keys on the
6531                    // query string, and `result_cache_scopes` carries the graph
6532                    // collection (see `collect_table_source_scopes`) so a write
6533                    // to it invalidates the entry. Deterministic algorithm
6534                    // output is worth caching at any row count, so the write
6535                    // bypasses the generic ≤5-row payload heuristic.
6536                    let tvf_result = RuntimeQueryResult {
6537                        query: query.to_string(),
6538                        mode,
6539                        statement,
6540                        engine: "runtime-graph-tvf",
6541                        result: self.execute_table_function(&name, &args, &named_args)?,
6542                        affected_rows: 0,
6543                        statement_type: "select",
6544                        bookmark: None,
6545                    };
6546                    frame.write_result_cache(self, &tvf_result, result_cache_scopes.clone());
6547                    return Ok(tvf_result);
6548                }
6549                // Inline-graph TVF (issue #799): the graph is supplied by two
6550                // subqueries instead of a collection reference. Like the
6551                // graph-collection form, the result is cacheable — its cache
6552                // key is the query string (the result-cache read at the top of
6553                // `execute_query_inner` keys on it) and `result_cache_scopes`
6554                // already carries the `nodes`/`edges` source collections, so a
6555                // write to any of them invalidates the entry.
6556                if let Some(TableSource::InlineGraphFunction {
6557                    name,
6558                    nodes,
6559                    edges,
6560                    named_args,
6561                }) = table.source.clone()
6562                {
6563                    let inline_result = RuntimeQueryResult {
6564                        query: query.to_string(),
6565                        mode,
6566                        statement,
6567                        engine: "runtime-graph-tvf-inline",
6568                        result: self.execute_inline_graph_function(
6569                            &name,
6570                            &nodes,
6571                            &edges,
6572                            &named_args,
6573                        )?,
6574                        affected_rows: 0,
6575                        statement_type: "select",
6576                        bookmark: None,
6577                    };
6578                    frame.write_result_cache(self, &inline_result, result_cache_scopes);
6579                    return Ok(inline_result);
6580                }
6581                if super::red_schema::is_virtual_table(&table.table) {
6582                    return Ok(RuntimeQueryResult {
6583                        query: query.to_string(),
6584                        mode,
6585                        statement,
6586                        engine: "runtime-red-schema",
6587                        result: super::red_schema::red_query(
6588                            self,
6589                            &table.table,
6590                            &table,
6591                            &frame as &dyn super::statement_frame::ReadFrame,
6592                        )?,
6593                        affected_rows: 0,
6594                        statement_type: "select",
6595                        bookmark: None,
6596                    });
6597                }
6598
6599                // `<graph>.<output>` analytics virtual view (issue #800).
6600                // Recomputed on demand — intentionally not result-cached, so it
6601                // always reflects the current graph data.
6602                if let Some(view_result) = self.try_resolve_analytics_view(
6603                    &table,
6604                    &frame as &dyn super::statement_frame::ReadFrame,
6605                )? {
6606                    return Ok(RuntimeQueryResult {
6607                        query: query.to_string(),
6608                        mode,
6609                        statement,
6610                        engine: "runtime-graph-analytics-view",
6611                        result: view_result,
6612                        affected_rows: 0,
6613                        statement_type: "select",
6614                        bookmark: None,
6615                    });
6616                }
6617
6618                if let Some(result) = self.execute_probabilistic_select(&table)? {
6619                    return Ok(RuntimeQueryResult {
6620                        query: query.to_string(),
6621                        mode,
6622                        statement,
6623                        engine: "runtime-probabilistic",
6624                        result,
6625                        affected_rows: 0,
6626                        statement_type: "select",
6627                        bookmark: None,
6628                    });
6629                }
6630
6631                // Foreign-table intercept (Phase 3.2.2 PG parity).
6632                //
6633                // When the referenced table matches a `CREATE FOREIGN TABLE`
6634                // registration, short-circuit into the FDW scan. Phase 3.2
6635                // wrappers don't yet support pushdown, so filters/projections
6636                // apply post-scan via `apply_foreign_table_filters` — good
6637                // enough for correctness; perf work lands in 3.2.3.
6638                if self.inner.foreign_tables.is_foreign_table(&table.table) {
6639                    let records = self
6640                        .inner
6641                        .foreign_tables
6642                        .scan(&table.table)
6643                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
6644                    let result = apply_foreign_table_filters(records, &table);
6645                    return Ok(RuntimeQueryResult {
6646                        query: query.to_string(),
6647                        mode,
6648                        statement,
6649                        engine: "runtime-fdw",
6650                        result,
6651                        affected_rows: 0,
6652                        statement_type: "select",
6653                        bookmark: None,
6654                    });
6655                }
6656
6657                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
6658                //
6659                // When RLS is enabled on this table, fetch every policy
6660                // that applies to the current (role, SELECT) pair and
6661                // fold them into the query's WHERE clause: policies
6662                // OR-combine (any of them admitting the row is enough),
6663                // then AND into the caller's existing filter.
6664                //
6665                // Anonymous callers (no thread-local identity) pass
6666                // `role = None`; policies with a specific `TO role`
6667                // clause skip, but `TO PUBLIC` policies still apply.
6668                //
6669                // When `inject_rls_filters` returns `None` the table has
6670                // RLS enabled but no policy admits the caller's role —
6671                // short-circuit with an empty result set instead of
6672                // synthesising a contradiction filter.
6673                let Some(table_with_rls) = self.authorize_relational_table_select(
6674                    table,
6675                    &frame as &dyn super::statement_frame::ReadFrame,
6676                )?
6677                else {
6678                    let empty = crate::storage::query::unified::UnifiedResult::empty();
6679                    return Ok(RuntimeQueryResult {
6680                        query: query.to_string(),
6681                        mode,
6682                        statement,
6683                        engine: "runtime-table-rls",
6684                        result: empty,
6685                        affected_rows: 0,
6686                        statement_type: "select",
6687                        bookmark: None,
6688                    });
6689                };
6690                Ok(RuntimeQueryResult {
6691                    query: query.to_string(),
6692                    mode,
6693                    statement,
6694                    engine: "runtime-table",
6695                    // #885: lend the frame-owned row-buffer arena to the
6696                    // streaming path so chunk buffers are reused across
6697                    // this statement's chunk-fetches instead of allocated
6698                    // fresh per chunk. This is the table-query dispatch
6699                    // that runs under a `StatementExecutionFrame`; the
6700                    // frameless prepared/subquery paths keep `None`.
6701                    result: execute_runtime_table_query_in(
6702                        &self.inner.db,
6703                        &table_with_rls,
6704                        Some(&self.inner.index_store),
6705                        Some(frame.row_arena()),
6706                    )?,
6707                    affected_rows: 0,
6708                    statement_type: "select",
6709                    bookmark: None,
6710                })
6711            }
6712            QueryExpr::Join(join) => {
6713                // Fold per-table RLS filters into each `QueryExpr::Table`
6714                // leaf of the join tree before executing. Without this
6715                // the join executor scans both tables raw and ignores
6716                // policies — a `WITHIN TENANT 'x'` against a join of
6717                // two tenant-scoped tables would leak cross-tenant rows.
6718                // When any leaf has RLS enabled and zero matching policy,
6719                // short-circuit to an empty join result instead of
6720                // emitting a contradiction filter.
6721                let join_with_rls = match self.authorize_relational_join_select(
6722                    join,
6723                    &frame as &dyn super::statement_frame::ReadFrame,
6724                )? {
6725                    Some(j) => j,
6726                    None => {
6727                        return Ok(RuntimeQueryResult {
6728                            query: query.to_string(),
6729                            mode,
6730                            statement,
6731                            engine: "runtime-join-rls",
6732                            result: crate::storage::query::unified::UnifiedResult::empty(),
6733                            affected_rows: 0,
6734                            statement_type: "select",
6735                            bookmark: None,
6736                        });
6737                    }
6738                };
6739                Ok(RuntimeQueryResult {
6740                    query: query.to_string(),
6741                    mode,
6742                    statement,
6743                    engine: "runtime-join",
6744                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
6745                    affected_rows: 0,
6746                    statement_type: "select",
6747                    bookmark: None,
6748                })
6749            }
6750            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
6751                query: query.to_string(),
6752                mode,
6753                statement,
6754                engine: "runtime-vector",
6755                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
6756                affected_rows: 0,
6757                statement_type: "select",
6758                bookmark: None,
6759            }),
6760            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
6761                query: query.to_string(),
6762                mode,
6763                statement,
6764                engine: "runtime-hybrid",
6765                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
6766                affected_rows: 0,
6767                statement_type: "select",
6768                bookmark: None,
6769            }),
6770            // DML execution
6771            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
6772                Err(RedDBError::Query(
6773                    super::red_schema::READ_ONLY_ERROR.to_string(),
6774                ))
6775            }
6776            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
6777                Err(RedDBError::Query(
6778                    super::red_schema::READ_ONLY_ERROR.to_string(),
6779                ))
6780            }
6781            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
6782                Err(RedDBError::Query(
6783                    super::red_schema::READ_ONLY_ERROR.to_string(),
6784                ))
6785            }
6786            QueryExpr::Insert(ref insert) => self
6787                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
6788                    self.execute_insert(query, insert)
6789                }),
6790            QueryExpr::Update(ref update) => self
6791                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
6792                    self.execute_update(query, update)
6793                }),
6794            QueryExpr::Delete(ref delete) => self
6795                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
6796                    self.execute_delete(query, delete)
6797                }),
6798            // DDL execution
6799            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
6800            QueryExpr::CreateCollection(ref create) => {
6801                self.execute_create_collection(query, create)
6802            }
6803            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
6804            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
6805            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
6806            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
6807            QueryExpr::DropDocument(ref drop_document) => {
6808                self.execute_drop_document(query, drop_document)
6809            }
6810            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
6811            QueryExpr::DropCollection(ref drop_collection) => {
6812                self.execute_drop_collection(query, drop_collection)
6813            }
6814            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
6815            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
6816            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
6817            // Graph analytics commands
6818            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
6819            // Search commands
6820            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
6821            // ASK: RAG query with LLM synthesis
6822            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
6823            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
6824            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
6825            QueryExpr::ProbabilisticCommand(ref cmd) => {
6826                self.execute_probabilistic_command(query, cmd)
6827            }
6828            // Time-series DDL
6829            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
6830            QueryExpr::CreateMetric(ref metric) => self.execute_create_metric(query, metric),
6831            QueryExpr::AlterMetric(ref alter) => self.execute_alter_metric(query, alter),
6832            QueryExpr::CreateSlo(ref slo) => self.execute_create_slo(query, slo),
6833            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
6834            // Queue DDL and commands
6835            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
6836            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
6837            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
6838            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
6839            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
6840            QueryExpr::EventsBackfill(ref backfill) => {
6841                self.execute_events_backfill(query, backfill)
6842            }
6843            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
6844                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
6845            ))),
6846            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
6847            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
6848            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
6849            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
6850            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
6851            // SET CONFIG key = value
6852            QueryExpr::SetConfig { ref key, ref value } => {
6853                if key.starts_with("red.secret.") {
6854                    return Err(RedDBError::Query(
6855                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
6856                    ));
6857                }
6858                match self.check_managed_config_write_for_set_config(key) {
6859                    Err(err) => Err(err),
6860                    Ok(()) => {
6861                        let store = self.inner.db.store();
6862                        let json_val = match value {
6863                            Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
6864                            Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
6865                            Value::Float(n) => crate::serde_json::Value::Number(*n),
6866                            Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
6867                            _ => crate::serde_json::Value::String(value.to_string()),
6868                        };
6869                        store.set_config_tree(key, &json_val);
6870                        update_current_config_value(key, value.clone());
6871                        // Config changes can flip runtime behavior mid-session
6872                        // (auto_decrypt, auto_encrypt, etc.) — invalidate the
6873                        // result cache so subsequent reads re-execute against
6874                        // the new config.
6875                        self.invalidate_result_cache();
6876                        Ok(RuntimeQueryResult::ok_message(
6877                            query.to_string(),
6878                            &format!("config set: {key}"),
6879                            "set",
6880                        ))
6881                    }
6882                }
6883            }
6884            // SET SECRET key = value
6885            QueryExpr::SetSecret { ref key, ref value } => {
6886                if key.starts_with("red.config.") {
6887                    return Err(RedDBError::Query(
6888                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
6889                    ));
6890                }
6891                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
6892                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
6893                })?;
6894                if matches!(value, Value::Null) {
6895                    auth_store
6896                        .vault_kv_try_delete(key)
6897                        .map_err(|err| RedDBError::Query(err.to_string()))?;
6898                    update_current_secret_value(key, None);
6899                    self.invalidate_result_cache();
6900                    return Ok(RuntimeQueryResult::ok_message(
6901                        query.to_string(),
6902                        &format!("secret deleted: {key}"),
6903                        "delete_secret",
6904                    ));
6905                }
6906                let value = secret_sql_value_to_string(value)?;
6907                auth_store
6908                    .vault_kv_try_set(key.clone(), value.clone())
6909                    .map_err(|err| RedDBError::Query(err.to_string()))?;
6910                update_current_secret_value(key, Some(value));
6911                self.invalidate_result_cache();
6912                Ok(RuntimeQueryResult::ok_message(
6913                    query.to_string(),
6914                    &format!("secret set: {key}"),
6915                    "set_secret",
6916                ))
6917            }
6918            // DELETE SECRET key
6919            QueryExpr::DeleteSecret { ref key } => {
6920                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
6921                    RedDBError::Query(
6922                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
6923                    )
6924                })?;
6925                let deleted = auth_store
6926                    .vault_kv_try_delete(key)
6927                    .map_err(|err| RedDBError::Query(err.to_string()))?;
6928                if deleted {
6929                    update_current_secret_value(key, None);
6930                }
6931                self.invalidate_result_cache();
6932                Ok(RuntimeQueryResult::ok_message(
6933                    query.to_string(),
6934                    &format!("secret deleted: {key}"),
6935                    if deleted {
6936                        "delete_secret"
6937                    } else {
6938                        "delete_secret_not_found"
6939                    },
6940                ))
6941            }
6942            // SHOW SECRET[S] [prefix]
6943            QueryExpr::ShowSecrets { ref prefix } => {
6944                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
6945                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
6946                })?;
6947                if !auth_store.is_vault_backed() {
6948                    return Err(RedDBError::Query(
6949                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
6950                    ));
6951                }
6952                let mut keys = auth_store.vault_kv_keys();
6953                keys.sort();
6954                let mut result = UnifiedResult::with_columns(vec![
6955                    "key".into(),
6956                    "value".into(),
6957                    "status".into(),
6958                ]);
6959                for key in keys {
6960                    if let Some(ref pfx) = prefix {
6961                        if !key.starts_with(pfx) {
6962                            continue;
6963                        }
6964                    }
6965                    let mut record = UnifiedRecord::new();
6966                    record.set("key", Value::text(key));
6967                    record.set("value", Value::text("***"));
6968                    record.set("status", Value::text("active"));
6969                    result.push(record);
6970                }
6971                Ok(RuntimeQueryResult {
6972                    query: query.to_string(),
6973                    mode,
6974                    statement: "show_secrets",
6975                    engine: "runtime-secret",
6976                    result,
6977                    affected_rows: 0,
6978                    statement_type: "select",
6979                    bookmark: None,
6980                })
6981            }
6982            // SHOW CONFIG [prefix]
6983            QueryExpr::ShowConfig { ref prefix } => {
6984                let store = self.inner.db.store();
6985                let all_collections = store.list_collections();
6986                if !all_collections.contains(&"red_config".to_string()) {
6987                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
6988                    return Ok(RuntimeQueryResult {
6989                        query: query.to_string(),
6990                        mode,
6991                        statement: "show_config",
6992                        engine: "runtime-config",
6993                        result,
6994                        affected_rows: 0,
6995                        statement_type: "select",
6996                        bookmark: None,
6997                    });
6998                }
6999                let manager = store
7000                    .get_collection("red_config")
7001                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
7002                let entities = manager.query_all(|_| true);
7003                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
7004                for entity in entities {
7005                    if let EntityData::Row(ref row) = entity.data {
7006                        if let Some(ref named) = row.named {
7007                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
7008                            let val = named.get("value").cloned().unwrap_or(Value::Null);
7009                            let key_str = match &key_val {
7010                                Value::Text(s) => s.as_ref(),
7011                                _ => continue,
7012                            };
7013                            if let Some(ref pfx) = prefix {
7014                                if !key_str.starts_with(pfx.as_str()) {
7015                                    continue;
7016                                }
7017                            }
7018                            let entity_id = entity.id.raw();
7019                            match latest.get(key_str) {
7020                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
7021                                _ => {
7022                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
7023                                }
7024                            }
7025                        }
7026                    }
7027                }
7028                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
7029                for (_, key_val, val) in latest.into_values() {
7030                    let mut record = UnifiedRecord::new();
7031                    record.set("key", key_val);
7032                    record.set("value", val);
7033                    result.push(record);
7034                }
7035                Ok(RuntimeQueryResult {
7036                    query: query.to_string(),
7037                    mode,
7038                    statement: "show_config",
7039                    engine: "runtime-config",
7040                    result,
7041                    affected_rows: 0,
7042                    statement_type: "select",
7043                    bookmark: None,
7044                })
7045            }
7046            // Session-local multi-tenancy handle (Phase 2.5.3).
7047            //
7048            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
7049            // the thread-local; SHOW TENANT returns it. Paired with the
7050            // CURRENT_TENANT() scalar for use in RLS policies.
7051            QueryExpr::SetTenant(ref value) => {
7052                match value {
7053                    Some(id) => set_current_tenant(id.clone()),
7054                    None => clear_current_tenant(),
7055                }
7056                Ok(RuntimeQueryResult::ok_message(
7057                    query.to_string(),
7058                    &match value {
7059                        Some(id) => format!("tenant set: {id}"),
7060                        None => "tenant cleared".to_string(),
7061                    },
7062                    "set_tenant",
7063                ))
7064            }
7065            QueryExpr::ShowTenant => {
7066                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
7067                let mut record = UnifiedRecord::new();
7068                record.set(
7069                    "tenant",
7070                    current_tenant().map(Value::text).unwrap_or(Value::Null),
7071                );
7072                result.push(record);
7073                Ok(RuntimeQueryResult {
7074                    query: query.to_string(),
7075                    mode,
7076                    statement: "show_tenant",
7077                    engine: "runtime-tenant",
7078                    result,
7079                    affected_rows: 0,
7080                    statement_type: "select",
7081                    bookmark: None,
7082                })
7083            }
7084            // Transaction control (Phase 2.3 PG parity).
7085            //
7086            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
7087            // the current connection's id. COMMIT/ROLLBACK release it through
7088            // the `SnapshotManager` so future snapshots see the correct set of
7089            // active/aborted transactions.
7090            //
7091            // Tuple stamping (xmin/xmax) and read-path visibility filtering
7092            // land in Phase 2.3.2 — this dispatch only manages the snapshot
7093            // registry. Statements running outside a TxnContext still behave
7094            // as autocommit (xid=0 → visible to every snapshot).
7095            QueryExpr::TransactionControl(ref ctl) => {
7096                use crate::storage::query::ast::TxnControl;
7097                use crate::storage::transaction::snapshot::{TxnContext, Xid};
7098                use crate::storage::transaction::IsolationLevel;
7099
7100                // Phase 2.3 keys transactions by a thread-local connection id.
7101                // The stdio/gRPC paths wire a real per-connection id later;
7102                // for embedded use (one RedDBRuntime per process-ish caller)
7103                // we fall back to a deterministic placeholder.
7104                let conn_id = current_connection_id();
7105
7106                let (kind, msg) = match ctl {
7107                    TxnControl::Begin => {
7108                        let mgr = Arc::clone(&self.inner.snapshot_manager);
7109                        let xid = mgr.begin();
7110                        let snapshot = mgr.snapshot(xid);
7111                        let ctx = TxnContext {
7112                            xid,
7113                            isolation: IsolationLevel::SnapshotIsolation,
7114                            snapshot,
7115                            savepoints: Vec::new(),
7116                            released_sub_xids: Vec::new(),
7117                        };
7118                        self.inner.tx_contexts.write().insert(conn_id, ctx);
7119                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
7120                    }
7121                    TxnControl::Commit => {
7122                        // SET LOCAL TENANT ends with the transaction.
7123                        self.inner.tx_local_tenants.write().remove(&conn_id);
7124                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
7125                        match ctx {
7126                            Some(ctx) => {
7127                                let mut own_xids = std::collections::HashSet::new();
7128                                own_xids.insert(ctx.xid);
7129                                for (_, sub) in &ctx.savepoints {
7130                                    own_xids.insert(*sub);
7131                                }
7132                                for sub in &ctx.released_sub_xids {
7133                                    own_xids.insert(*sub);
7134                                }
7135                                if let Err(err) = self.check_table_row_write_conflicts(
7136                                    conn_id,
7137                                    &ctx.snapshot,
7138                                    &own_xids,
7139                                ) {
7140                                    for (_, sub) in &ctx.savepoints {
7141                                        self.inner.snapshot_manager.rollback(*sub);
7142                                    }
7143                                    for sub in &ctx.released_sub_xids {
7144                                        self.inner.snapshot_manager.rollback(*sub);
7145                                    }
7146                                    self.inner.snapshot_manager.rollback(ctx.xid);
7147                                    self.revive_pending_versioned_updates(conn_id);
7148                                    self.revive_pending_tombstones(conn_id);
7149                                    self.discard_pending_kv_watch_events(conn_id);
7150                                    self.discard_pending_queue_wakes(conn_id);
7151                                    self.discard_pending_store_wal_actions(conn_id);
7152                                    return Err(err);
7153                                }
7154                                self.restore_pending_write_stamps(conn_id);
7155                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
7156                                    for (_, sub) in &ctx.savepoints {
7157                                        self.inner.snapshot_manager.rollback(*sub);
7158                                    }
7159                                    for sub in &ctx.released_sub_xids {
7160                                        self.inner.snapshot_manager.rollback(*sub);
7161                                    }
7162                                    self.inner.snapshot_manager.rollback(ctx.xid);
7163                                    self.revive_pending_versioned_updates(conn_id);
7164                                    self.revive_pending_tombstones(conn_id);
7165                                    self.discard_pending_kv_watch_events(conn_id);
7166                                    return Err(err);
7167                                }
7168                                // Phase 2.3.2e: commit every open sub-xid
7169                                // so they also become visible. Their
7170                                // work is promoted to the parent txn's
7171                                // result exactly like a RELEASE would
7172                                // have done.
7173                                for (_, sub) in &ctx.savepoints {
7174                                    self.inner.snapshot_manager.commit(*sub);
7175                                }
7176                                for sub in &ctx.released_sub_xids {
7177                                    self.inner.snapshot_manager.commit(*sub);
7178                                }
7179                                self.inner.snapshot_manager.commit(ctx.xid);
7180                                self.finalize_pending_versioned_updates(conn_id);
7181                                self.finalize_pending_tombstones(conn_id);
7182                                self.finalize_pending_kv_watch_events(conn_id);
7183                                self.finalize_pending_queue_wakes(conn_id);
7184                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
7185                            }
7186                            None => (
7187                                "commit",
7188                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
7189                            ),
7190                        }
7191                    }
7192                    TxnControl::Rollback => {
7193                        self.inner.tx_local_tenants.write().remove(&conn_id);
7194                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
7195                        match ctx {
7196                            Some(ctx) => {
7197                                // Phase 2.3.2e: abort every open sub-xid
7198                                // too so their writes stay hidden.
7199                                for (_, sub) in &ctx.savepoints {
7200                                    self.inner.snapshot_manager.rollback(*sub);
7201                                }
7202                                for sub in &ctx.released_sub_xids {
7203                                    self.inner.snapshot_manager.rollback(*sub);
7204                                }
7205                                self.inner.snapshot_manager.rollback(ctx.xid);
7206                                // Phase 2.3.2b: tuples that the txn had
7207                                // xmax-stamped become live again — wipe xmax
7208                                // back to 0 so later snapshots see them.
7209                                self.revive_pending_versioned_updates(conn_id);
7210                                self.revive_pending_tombstones(conn_id);
7211                                self.discard_pending_kv_watch_events(conn_id);
7212                                self.discard_pending_queue_wakes(conn_id);
7213                                self.discard_pending_store_wal_actions(conn_id);
7214                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
7215                            }
7216                            None => (
7217                                "rollback",
7218                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
7219                            ),
7220                        }
7221                    }
7222                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
7223                    // SAVEPOINT allocates a fresh xid and pushes it
7224                    // onto the per-txn stack so subsequent writes can
7225                    // be selectively rolled back. RELEASE pops without
7226                    // aborting; ROLLBACK TO aborts the sub-xid (and
7227                    // any nested ones) + revives their tombstones.
7228                    TxnControl::Savepoint(name) => {
7229                        let mgr = Arc::clone(&self.inner.snapshot_manager);
7230                        let mut guard = self.inner.tx_contexts.write();
7231                        match guard.get_mut(&conn_id) {
7232                            Some(ctx) => {
7233                                let sub = mgr.begin();
7234                                ctx.savepoints.push((name.clone(), sub));
7235                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
7236                            }
7237                            None => (
7238                                "savepoint",
7239                                "SAVEPOINT outside transaction — no-op".to_string(),
7240                            ),
7241                        }
7242                    }
7243                    TxnControl::ReleaseSavepoint(name) => {
7244                        let mut guard = self.inner.tx_contexts.write();
7245                        match guard.get_mut(&conn_id) {
7246                            Some(ctx) => {
7247                                let pos = ctx
7248                                    .savepoints
7249                                    .iter()
7250                                    .position(|(n, _)| n == name)
7251                                    .ok_or_else(|| {
7252                                        RedDBError::Internal(format!(
7253                                            "savepoint {name} does not exist"
7254                                        ))
7255                                    })?;
7256                                // RELEASE pops the named savepoint and
7257                                // any nested ones. Their sub-xids move
7258                                // to `released_sub_xids` so they commit
7259                                // (or roll back) alongside the parent
7260                                // xid — PG semantics: released
7261                                // savepoints still contribute their
7262                                // work, but their names are gone.
7263                                let released = ctx.savepoints.len() - pos;
7264                                let popped: Vec<Xid> = ctx
7265                                    .savepoints
7266                                    .split_off(pos)
7267                                    .into_iter()
7268                                    .map(|(_, x)| x)
7269                                    .collect();
7270                                ctx.released_sub_xids.extend(popped);
7271                                (
7272                                    "release_savepoint",
7273                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
7274                                )
7275                            }
7276                            None => (
7277                                "release_savepoint",
7278                                "RELEASE outside transaction — no-op".to_string(),
7279                            ),
7280                        }
7281                    }
7282                    TxnControl::RollbackToSavepoint(name) => {
7283                        let mgr = Arc::clone(&self.inner.snapshot_manager);
7284                        // Splice out the savepoint + nested ones under
7285                        // a narrow lock, then run the snapshot-manager
7286                        // + tombstone side-effects without the tx map
7287                        // held so nothing re-enters.
7288                        let drop_result: Option<(Xid, Vec<Xid>)> = {
7289                            let mut guard = self.inner.tx_contexts.write();
7290                            if let Some(ctx) = guard.get_mut(&conn_id) {
7291                                let pos = ctx
7292                                    .savepoints
7293                                    .iter()
7294                                    .position(|(n, _)| n == name)
7295                                    .ok_or_else(|| {
7296                                        RedDBError::Internal(format!(
7297                                            "savepoint {name} does not exist"
7298                                        ))
7299                                    })?;
7300                                let savepoint_xid = ctx.savepoints[pos].1;
7301                                let aborted: Vec<Xid> = ctx
7302                                    .savepoints
7303                                    .split_off(pos)
7304                                    .into_iter()
7305                                    .map(|(_, x)| x)
7306                                    .collect();
7307                                Some((savepoint_xid, aborted))
7308                            } else {
7309                                None
7310                            }
7311                        };
7312
7313                        match drop_result {
7314                            Some((savepoint_xid, aborted)) => {
7315                                for x in &aborted {
7316                                    mgr.rollback(*x);
7317                                }
7318                                let reverted_updates =
7319                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
7320                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
7321                                (
7322                                    "rollback_to_savepoint",
7323                                    format!(
7324                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
7325                                        aborted.len(),
7326                                    ),
7327                                )
7328                            }
7329                            None => (
7330                                "rollback_to_savepoint",
7331                                "ROLLBACK TO outside transaction — no-op".to_string(),
7332                            ),
7333                        }
7334                    }
7335                };
7336                Ok(RuntimeQueryResult::ok_message(
7337                    query.to_string(),
7338                    &msg,
7339                    kind,
7340                ))
7341            }
7342            // Schema + Sequence DDL (Phase 1.3 PG parity).
7343            //
7344            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
7345            // just registers the name in `red_config` under `schema.{name}`.
7346            // Table lookups still happen by collection name; clients using
7347            // `schema.table` qualified names collapse to collection `schema.table`.
7348            //
7349            // Sequences persist a 64-bit counter + metadata (start, increment)
7350            // in `red_config` under `sequence.{name}.*`. Scalar callers
7351            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
7352            // once we have a proper mutating-function dispatch path; for now the
7353            // DDL just establishes the catalog entry so clients don't error.
7354            QueryExpr::CreateSchema(ref q) => {
7355                let store = self.inner.db.store();
7356                let key = format!("schema.{}", q.name);
7357                if store.get_config(&key).is_some() {
7358                    if q.if_not_exists {
7359                        return Ok(RuntimeQueryResult::ok_message(
7360                            query.to_string(),
7361                            &format!("schema {} already exists — skipped", q.name),
7362                            "create_schema",
7363                        ));
7364                    }
7365                    return Err(RedDBError::Internal(format!(
7366                        "schema {} already exists",
7367                        q.name
7368                    )));
7369                }
7370                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
7371                Ok(RuntimeQueryResult::ok_message(
7372                    query.to_string(),
7373                    &format!("schema {} created", q.name),
7374                    "create_schema",
7375                ))
7376            }
7377            QueryExpr::DropSchema(ref q) => {
7378                let store = self.inner.db.store();
7379                let key = format!("schema.{}", q.name);
7380                let existed = store.get_config(&key).is_some();
7381                if !existed && !q.if_exists {
7382                    return Err(RedDBError::Internal(format!(
7383                        "schema {} does not exist",
7384                        q.name
7385                    )));
7386                }
7387                // Remove marker from red_config via set to null.
7388                store.set_config_tree(&key, &crate::serde_json::Value::Null);
7389                let suffix = if q.cascade {
7390                    " (CASCADE accepted — tables untouched)"
7391                } else {
7392                    ""
7393                };
7394                Ok(RuntimeQueryResult::ok_message(
7395                    query.to_string(),
7396                    &format!("schema {} dropped{}", q.name, suffix),
7397                    "drop_schema",
7398                ))
7399            }
7400            QueryExpr::CreateSequence(ref q) => {
7401                let store = self.inner.db.store();
7402                let base = format!("sequence.{}", q.name);
7403                let start_key = format!("{base}.start");
7404                let incr_key = format!("{base}.increment");
7405                let curr_key = format!("{base}.current");
7406                if store.get_config(&start_key).is_some() {
7407                    if q.if_not_exists {
7408                        return Ok(RuntimeQueryResult::ok_message(
7409                            query.to_string(),
7410                            &format!("sequence {} already exists — skipped", q.name),
7411                            "create_sequence",
7412                        ));
7413                    }
7414                    return Err(RedDBError::Internal(format!(
7415                        "sequence {} already exists",
7416                        q.name
7417                    )));
7418                }
7419                // Persist start + increment, and set current so the first
7420                // nextval returns `start`.
7421                let initial_current = q.start - q.increment;
7422                store.set_config_tree(
7423                    &start_key,
7424                    &crate::serde_json::Value::Number(q.start as f64),
7425                );
7426                store.set_config_tree(
7427                    &incr_key,
7428                    &crate::serde_json::Value::Number(q.increment as f64),
7429                );
7430                store.set_config_tree(
7431                    &curr_key,
7432                    &crate::serde_json::Value::Number(initial_current as f64),
7433                );
7434                Ok(RuntimeQueryResult::ok_message(
7435                    query.to_string(),
7436                    &format!(
7437                        "sequence {} created (start={}, increment={})",
7438                        q.name, q.start, q.increment
7439                    ),
7440                    "create_sequence",
7441                ))
7442            }
7443            QueryExpr::DropSequence(ref q) => {
7444                let store = self.inner.db.store();
7445                let base = format!("sequence.{}", q.name);
7446                let existed = store.get_config(&format!("{base}.start")).is_some();
7447                if !existed && !q.if_exists {
7448                    return Err(RedDBError::Internal(format!(
7449                        "sequence {} does not exist",
7450                        q.name
7451                    )));
7452                }
7453                for k in ["start", "increment", "current"] {
7454                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
7455                }
7456                Ok(RuntimeQueryResult::ok_message(
7457                    query.to_string(),
7458                    &format!("sequence {} dropped", q.name),
7459                    "drop_sequence",
7460                ))
7461            }
7462            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
7463            //
7464            // The view definition is stored in-memory on RuntimeInner (not
7465            // persisted). SELECTs that reference the view name will substitute
7466            // the stored `QueryExpr` via `resolve_view_reference` during
7467            // planning (same entry point used by table-name resolution).
7468            //
7469            // Materialized views additionally allocate a slot in
7470            // `MaterializedViewCache`; a REFRESH repopulates that slot.
7471            QueryExpr::CreateView(ref q) => {
7472                let mut views = self.inner.views.write();
7473                if views.contains_key(&q.name) && !q.or_replace {
7474                    if q.if_not_exists {
7475                        return Ok(RuntimeQueryResult::ok_message(
7476                            query.to_string(),
7477                            &format!("view {} already exists — skipped", q.name),
7478                            "create_view",
7479                        ));
7480                    }
7481                    return Err(RedDBError::Internal(format!(
7482                        "view {} already exists",
7483                        q.name
7484                    )));
7485                }
7486                views.insert(q.name.clone(), Arc::new(q.clone()));
7487                drop(views);
7488
7489                // Materialized view: register cache slot (data is empty until REFRESH).
7490                if q.materialized {
7491                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
7492                    let refresh = match q.refresh_every_ms {
7493                        Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
7494                        None => RefreshPolicy::Manual,
7495                    };
7496                    let dependencies = collect_table_refs(&q.query);
7497                    let def = MaterializedViewDef {
7498                        name: q.name.clone(),
7499                        query: format!("<parsed view {}>", q.name),
7500                        dependencies: dependencies.clone(),
7501                        refresh,
7502                        retention_duration_ms: q.retention_duration_ms,
7503                    };
7504                    self.inner.materialized_views.write().register(def);
7505
7506                    // Issue #593 slice 9a — persist the descriptor to
7507                    // the system catalog so the definition survives a
7508                    // restart. Upsert semantics (delete-then-insert by
7509                    // name) keep the catalog free of duplicate rows
7510                    // across `CREATE OR REPLACE` churn.
7511                    let descriptor =
7512                        crate::runtime::continuous_materialized_view::MaterializedViewDescriptor {
7513                            name: q.name.clone(),
7514                            source_sql: query.to_string(),
7515                            source_collections: dependencies,
7516                            refresh_every_ms: q.refresh_every_ms,
7517                            retention_duration_ms: q.retention_duration_ms,
7518                        };
7519                    let store = self.inner.db.store();
7520                    crate::runtime::continuous_materialized_view::persist_descriptor(
7521                        store.as_ref(),
7522                        &descriptor,
7523                    )?;
7524
7525                    // Issue #594 slice 9b — provision a Table-shaped
7526                    // backing collection named after the view. The
7527                    // rewriter skips materialized views (see
7528                    // `rewrite_view_refs_inner`) so `SELECT FROM v`
7529                    // resolves to this collection directly. Empty
7530                    // until REFRESH wires through it in 9c.
7531                    self.ensure_materialized_view_backing(&q.name)?;
7532                }
7533                // Plan cache may have cached a plan that didn't know about this
7534                // view — invalidate so future references pick up the new binding.
7535                // Result cache gets flushed too: OR REPLACE must not serve a
7536                // prior execution of the obsolete body.
7537                self.invalidate_plan_cache();
7538                self.invalidate_result_cache();
7539
7540                Ok(RuntimeQueryResult::ok_message(
7541                    query.to_string(),
7542                    &format!(
7543                        "{}view {} created",
7544                        if q.materialized { "materialized " } else { "" },
7545                        q.name
7546                    ),
7547                    "create_view",
7548                ))
7549            }
7550            QueryExpr::DropView(ref q) => {
7551                let mut views = self.inner.views.write();
7552                let removed = views.remove(&q.name);
7553                let existed = removed.is_some();
7554                let removed_materialized =
7555                    removed.as_ref().map(|v| v.materialized).unwrap_or(false);
7556                drop(views);
7557                if q.materialized || existed {
7558                    // Try the materialised cache too — silent if absent.
7559                    self.inner.materialized_views.write().remove(&q.name);
7560                    // Issue #593 slice 9a — remove any persisted
7561                    // catalog row. Idempotent: a no-op when the view
7562                    // was never materialized (no row was ever written).
7563                    let store = self.inner.db.store();
7564                    crate::runtime::continuous_materialized_view::remove_by_name(
7565                        store.as_ref(),
7566                        &q.name,
7567                    )?;
7568                }
7569                // Issue #594 slice 9b — drop the backing collection
7570                // that was provisioned at CREATE time. Only mat views
7571                // ever had one; regular views never did.
7572                if removed_materialized || q.materialized {
7573                    self.drop_materialized_view_backing(&q.name)?;
7574                }
7575                // Drop any plan / result cache entries that baked the
7576                // view body into their QueryExpr.
7577                self.invalidate_plan_cache();
7578                self.invalidate_result_cache();
7579                if !existed && !q.if_exists {
7580                    return Err(RedDBError::Internal(format!(
7581                        "view {} does not exist",
7582                        q.name
7583                    )));
7584                }
7585                self.invalidate_plan_cache();
7586                Ok(RuntimeQueryResult::ok_message(
7587                    query.to_string(),
7588                    &format!("view {} dropped", q.name),
7589                    "drop_view",
7590                ))
7591            }
7592            QueryExpr::RefreshMaterializedView(ref q) => {
7593                // Look up the view definition, execute its underlying query,
7594                // and stash the serialized result in the materialised cache.
7595                let view = {
7596                    let views = self.inner.views.read();
7597                    views.get(&q.name).cloned()
7598                };
7599                let view = match view {
7600                    Some(v) => v,
7601                    None => {
7602                        return Err(RedDBError::Internal(format!(
7603                            "view {} does not exist",
7604                            q.name
7605                        )))
7606                    }
7607                };
7608                if !view.materialized {
7609                    return Err(RedDBError::Internal(format!(
7610                        "view {} is not materialized — REFRESH requires \
7611                         CREATE MATERIALIZED VIEW",
7612                        q.name
7613                    )));
7614                }
7615                // Execute the underlying query fresh.
7616                let started = std::time::Instant::now();
7617                let now_ms = std::time::SystemTime::now()
7618                    .duration_since(std::time::UNIX_EPOCH)
7619                    .map(|d| d.as_millis() as u64)
7620                    .unwrap_or(0);
7621                match self.execute_query_expr((*view.query).clone()) {
7622                    Ok(inner_result) => {
7623                        // Issue #595 slice 9c — atomically replace the
7624                        // backing collection's contents under a single
7625                        // WAL group. Concurrent SELECT from the view
7626                        // sees either the prior or new contents, never
7627                        // partial. A crash before the WAL commit lands
7628                        // leaves the prior contents intact on recovery.
7629                        let entities =
7630                            view_records_to_entities(&q.name, &inner_result.result.records);
7631                        let row_count = entities.len() as u64;
7632                        let store = self.inner.db.store();
7633                        let serialized_records = match store.refresh_collection(&q.name, entities) {
7634                            Ok(records) => records,
7635                            Err(err) => {
7636                                let duration_ms = started.elapsed().as_millis() as u64;
7637                                let msg = err.to_string();
7638                                self.inner
7639                                    .materialized_views
7640                                    .write()
7641                                    .record_refresh_failure(
7642                                        &q.name,
7643                                        msg.clone(),
7644                                        duration_ms,
7645                                        now_ms,
7646                                    );
7647                                return Err(RedDBError::Internal(format!(
7648                                    "REFRESH MATERIALIZED VIEW {}: {msg}",
7649                                    q.name
7650                                )));
7651                            }
7652                        };
7653
7654                        // Issue #596 slice 9d — emit a Refresh
7655                        // ChangeRecord into the logical-WAL spool so
7656                        // replicas deterministically replay the same
7657                        // backing-collection contents via
7658                        // `LogicalChangeApplier::apply_record`.
7659                        if let Some(ref primary) = self.inner.db.replication {
7660                            let lsn = self.inner.cdc.emit(
7661                                crate::replication::cdc::ChangeOperation::Refresh,
7662                                &q.name,
7663                                0,
7664                                "refresh",
7665                            );
7666                            self.invalidate_result_cache_for_table(&q.name);
7667                            let timestamp = std::time::SystemTime::now()
7668                                .duration_since(std::time::UNIX_EPOCH)
7669                                .unwrap_or_default()
7670                                .as_millis() as u64;
7671                            let record = ChangeRecord::for_refresh(
7672                                lsn,
7673                                timestamp,
7674                                q.name.clone(),
7675                                serialized_records,
7676                            )
7677                            .with_term(self.current_replication_term());
7678                            let encoded = record.encode();
7679                            primary.append_logical_record(record.lsn, encoded);
7680                        }
7681
7682                        let duration_ms = started.elapsed().as_millis() as u64;
7683                        let serialized = format!("{:?}", inner_result.result);
7684                        self.inner
7685                            .materialized_views
7686                            .write()
7687                            .record_refresh_success(
7688                                &q.name,
7689                                serialized.into_bytes(),
7690                                row_count,
7691                                duration_ms,
7692                                now_ms,
7693                            );
7694                        // SELECT FROM v now reads through the rewriter
7695                        // skip into the backing collection — drop the
7696                        // result cache so prior empty-backing reads
7697                        // don't shadow the new contents.
7698                        self.invalidate_result_cache();
7699                        Ok(RuntimeQueryResult::ok_message(
7700                            query.to_string(),
7701                            &format!("materialized view {} refreshed", q.name),
7702                            "refresh_materialized_view",
7703                        ))
7704                    }
7705                    Err(err) => {
7706                        let duration_ms = started.elapsed().as_millis() as u64;
7707                        let msg = err.to_string();
7708                        self.inner
7709                            .materialized_views
7710                            .write()
7711                            .record_refresh_failure(&q.name, msg.clone(), duration_ms, now_ms);
7712                        Err(err)
7713                    }
7714                }
7715            }
7716            // Row Level Security (Phase 2.5 PG parity).
7717            //
7718            // Policies live in an in-memory registry keyed by (table, name).
7719            // Enforcement (AND-ing the policy's USING clause into every
7720            // query's WHERE for the table) arrives in Phase 2.5.2 via the
7721            // filter compiler; this dispatch only manages the catalog.
7722            QueryExpr::CreatePolicy(ref q) => {
7723                let key = (q.table.clone(), q.name.clone());
7724                self.inner
7725                    .rls_policies
7726                    .write()
7727                    .insert(key, Arc::new(q.clone()));
7728                self.invalidate_plan_cache();
7729                // Issue #120 — surface policy names in the
7730                // schema-vocabulary so AskPipeline (#121) can resolve
7731                // a policy reference back to its table.
7732                self.schema_vocabulary_apply(
7733                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
7734                        collection: q.table.clone(),
7735                        policy: q.name.clone(),
7736                    },
7737                );
7738                Ok(RuntimeQueryResult::ok_message(
7739                    query.to_string(),
7740                    &format!("policy {} on {} created", q.name, q.table),
7741                    "create_policy",
7742                ))
7743            }
7744            QueryExpr::DropPolicy(ref q) => {
7745                let removed = self
7746                    .inner
7747                    .rls_policies
7748                    .write()
7749                    .remove(&(q.table.clone(), q.name.clone()))
7750                    .is_some();
7751                if !removed && !q.if_exists {
7752                    return Err(RedDBError::Internal(format!(
7753                        "policy {} on {} does not exist",
7754                        q.name, q.table
7755                    )));
7756                }
7757                self.invalidate_plan_cache();
7758                // Issue #120 — keep the schema-vocabulary policy
7759                // entry in sync.
7760                self.schema_vocabulary_apply(
7761                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
7762                        collection: q.table.clone(),
7763                        policy: q.name.clone(),
7764                    },
7765                );
7766                Ok(RuntimeQueryResult::ok_message(
7767                    query.to_string(),
7768                    &format!("policy {} on {} dropped", q.name, q.table),
7769                    "drop_policy",
7770                ))
7771            }
7772            // Foreign Data Wrappers (Phase 3.2 PG parity).
7773            //
7774            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
7775            // `ForeignTableRegistry`. The read path consults that registry
7776            // before dispatching a SELECT — when the table name matches a
7777            // registered foreign table, we forward the scan to the wrapper
7778            // and skip the normal collection lookup.
7779            //
7780            // Phase 3.2 is in-memory only; persistence across restarts is a
7781            // 3.2.2 follow-up that mirrors the view registry pattern.
7782            QueryExpr::CreateServer(ref q) => {
7783                use crate::storage::fdw::FdwOptions;
7784                let registry = Arc::clone(&self.inner.foreign_tables);
7785                if registry.server(&q.name).is_some() {
7786                    if q.if_not_exists {
7787                        return Ok(RuntimeQueryResult::ok_message(
7788                            query.to_string(),
7789                            &format!("server {} already exists — skipped", q.name),
7790                            "create_server",
7791                        ));
7792                    }
7793                    return Err(RedDBError::Internal(format!(
7794                        "server {} already exists",
7795                        q.name
7796                    )));
7797                }
7798                let mut opts = FdwOptions::new();
7799                for (k, v) in &q.options {
7800                    opts.values.insert(k.clone(), v.clone());
7801                }
7802                registry
7803                    .create_server(&q.name, &q.wrapper, opts)
7804                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
7805                Ok(RuntimeQueryResult::ok_message(
7806                    query.to_string(),
7807                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
7808                    "create_server",
7809                ))
7810            }
7811            QueryExpr::DropServer(ref q) => {
7812                let existed = self.inner.foreign_tables.drop_server(&q.name);
7813                if !existed && !q.if_exists {
7814                    return Err(RedDBError::Internal(format!(
7815                        "server {} does not exist",
7816                        q.name
7817                    )));
7818                }
7819                Ok(RuntimeQueryResult::ok_message(
7820                    query.to_string(),
7821                    &format!(
7822                        "server {} dropped{}",
7823                        q.name,
7824                        if q.cascade { " (cascade)" } else { "" }
7825                    ),
7826                    "drop_server",
7827                ))
7828            }
7829            QueryExpr::CreateForeignTable(ref q) => {
7830                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
7831                let registry = Arc::clone(&self.inner.foreign_tables);
7832                if registry.foreign_table(&q.name).is_some() {
7833                    if q.if_not_exists {
7834                        return Ok(RuntimeQueryResult::ok_message(
7835                            query.to_string(),
7836                            &format!("foreign table {} already exists — skipped", q.name),
7837                            "create_foreign_table",
7838                        ));
7839                    }
7840                    return Err(RedDBError::Internal(format!(
7841                        "foreign table {} already exists",
7842                        q.name
7843                    )));
7844                }
7845                let mut opts = FdwOptions::new();
7846                for (k, v) in &q.options {
7847                    opts.values.insert(k.clone(), v.clone());
7848                }
7849                let columns: Vec<ForeignColumn> = q
7850                    .columns
7851                    .iter()
7852                    .map(|c| ForeignColumn {
7853                        name: c.name.clone(),
7854                        data_type: c.data_type.clone(),
7855                        not_null: c.not_null,
7856                    })
7857                    .collect();
7858                registry
7859                    .create_foreign_table(ForeignTable {
7860                        name: q.name.clone(),
7861                        server_name: q.server.clone(),
7862                        columns,
7863                        options: opts,
7864                    })
7865                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
7866                self.invalidate_plan_cache();
7867                Ok(RuntimeQueryResult::ok_message(
7868                    query.to_string(),
7869                    &format!("foreign table {} created (server {})", q.name, q.server),
7870                    "create_foreign_table",
7871                ))
7872            }
7873            QueryExpr::DropForeignTable(ref q) => {
7874                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
7875                if !existed && !q.if_exists {
7876                    return Err(RedDBError::Internal(format!(
7877                        "foreign table {} does not exist",
7878                        q.name
7879                    )));
7880                }
7881                self.invalidate_plan_cache();
7882                Ok(RuntimeQueryResult::ok_message(
7883                    query.to_string(),
7884                    &format!("foreign table {} dropped", q.name),
7885                    "drop_foreign_table",
7886                ))
7887            }
7888            // COPY table FROM 'path' (Phase 1.5 PG parity).
7889            //
7890            // Stream CSV rows through the shared `CsvImporter`. The collection
7891            // is auto-created on first insert (via `insert_auto`-style path);
7892            // VACUUM/ANALYZE afterwards is up to the caller.
7893            QueryExpr::CopyFrom(ref q) => {
7894                use crate::storage::import::{CsvConfig, CsvImporter};
7895                let store = self.inner.db.store();
7896                let cfg = CsvConfig {
7897                    collection: q.table.clone(),
7898                    has_header: q.has_header,
7899                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
7900                    ..CsvConfig::default()
7901                };
7902                let importer = CsvImporter::new(cfg);
7903                let stats = importer
7904                    .import_file(&q.path, store.as_ref())
7905                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
7906                // Tables are written → invalidate cached plans / result cache.
7907                self.note_table_write(&q.table);
7908                Ok(RuntimeQueryResult::ok_message(
7909                    query.to_string(),
7910                    &format!(
7911                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
7912                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
7913                    ),
7914                    "copy_from",
7915                ))
7916            }
7917            // Maintenance commands (Phase 1.2 PG parity).
7918            //
7919            // - VACUUM [FULL] [table]: refreshes planner stats for the target
7920            //   collection(s) and — when FULL — triggers a full pager persist
7921            //   (flushes dirty pages + fsync). Also invalidates the result cache
7922            //   so subsequent reads re-execute against the freshly compacted
7923            //   storage. RedDB's segment/btree GC runs continuously via the
7924            //   background lifecycle; explicit space reclamation for sealed
7925            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
7926            // - ANALYZE [table]: reruns `analyze_collection` +
7927            //   `persist_table_stats` via `refresh_table_planner_stats` so the
7928            //   planner has fresh histograms, distinct estimates, null counts.
7929            //
7930            // Both commands accept an optional target; omitting the target
7931            // iterates every collection in the store.
7932            QueryExpr::MaintenanceCommand(ref cmd) => {
7933                use crate::storage::query::ast::MaintenanceCommand as Mc;
7934                let store = self.inner.db.store();
7935                let (kind, msg) = match cmd {
7936                    Mc::Analyze { target } => {
7937                        let targets: Vec<String> = match target {
7938                            Some(t) => vec![t.clone()],
7939                            None => store.list_collections(),
7940                        };
7941                        for t in &targets {
7942                            self.refresh_table_planner_stats(t);
7943                        }
7944                        (
7945                            "analyze",
7946                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
7947                        )
7948                    }
7949                    Mc::Vacuum { target, full } => {
7950                        let targets: Vec<String> = match target {
7951                            Some(t) => vec![t.clone()],
7952                            None => store.list_collections(),
7953                        };
7954                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
7955                        let mut vacuum_stats =
7956                            crate::storage::unified::store::MvccVacuumStats::default();
7957                        for t in &targets {
7958                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
7959                                RedDBError::Internal(format!(
7960                                    "VACUUM MVCC history failed for {t}: {e}"
7961                                ))
7962                            })?;
7963                            if stats.reclaimed_versions > 0 {
7964                                self.rebuild_runtime_indexes_for_table(t)?;
7965                            }
7966                            vacuum_stats.add(&stats);
7967                        }
7968                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
7969                        // Stats refresh covers every target (same as ANALYZE).
7970                        for t in &targets {
7971                            self.refresh_table_planner_stats(t);
7972                        }
7973                        // FULL forces a pager persist (dirty-page flush + fsync).
7974                        // Regular VACUUM relies on the background writer / segment
7975                        // lifecycle so the command is non-blocking.
7976                        let persisted = if *full {
7977                            match store.persist() {
7978                                Ok(()) => true,
7979                                Err(e) => {
7980                                    return Err(RedDBError::Internal(format!(
7981                                        "VACUUM FULL persist failed: {e:?}"
7982                                    )));
7983                                }
7984                            }
7985                        } else {
7986                            false
7987                        };
7988                        // Result cache depended on pre-vacuum state.
7989                        self.invalidate_result_cache();
7990                        (
7991                            "vacuum",
7992                            format!(
7993                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
7994                                if *full { " FULL" } else { "" },
7995                                targets.len(),
7996                                vacuum_stats.scanned_versions,
7997                                vacuum_stats.retained_versions,
7998                                vacuum_stats.reclaimed_versions,
7999                                vacuum_stats.retained_history_versions,
8000                                vacuum_stats.reclaimed_history_versions,
8001                                vacuum_stats.retained_tombstones,
8002                                vacuum_stats.reclaimed_tombstones,
8003                                if persisted {
8004                                    " (pages flushed to disk)"
8005                                } else {
8006                                    ""
8007                                }
8008                            ),
8009                        )
8010                    }
8011                };
8012                Ok(RuntimeQueryResult::ok_message(
8013                    query.to_string(),
8014                    &msg,
8015                    kind,
8016                ))
8017            }
8018            // GRANT / REVOKE / ALTER USER (RBAC milestone).
8019            //
8020            // These hit the AuthStore directly. The privilege-check
8021            // gate at the top of `execute_query_expr` already decided
8022            // whether the caller may even run the statement; here we
8023            // just translate the AST into AuthStore calls.
8024            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
8025            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
8026            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
8027            QueryExpr::CreateIamPolicy { ref id, ref json } => {
8028                self.execute_create_iam_policy(query, id, json)
8029            }
8030            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
8031            QueryExpr::AttachPolicy {
8032                ref policy_id,
8033                ref principal,
8034            } => self.execute_attach_policy(query, policy_id, principal),
8035            QueryExpr::DetachPolicy {
8036                ref policy_id,
8037                ref principal,
8038            } => self.execute_detach_policy(query, policy_id, principal),
8039            QueryExpr::ShowPolicies { ref filter } => {
8040                self.execute_show_policies(query, filter.as_ref())
8041            }
8042            QueryExpr::ShowEffectivePermissions {
8043                ref user,
8044                ref resource,
8045            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
8046            QueryExpr::SimulatePolicy {
8047                ref user,
8048                ref action,
8049                ref resource,
8050            } => self.execute_simulate_policy(query, user, action, resource),
8051            QueryExpr::LintPolicy { ref source } => self.execute_lint_policy(query, source),
8052            QueryExpr::MigratePolicyMode {
8053                ref target,
8054                dry_run,
8055            } => self.execute_migrate_policy_mode(query, target, dry_run),
8056            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
8057            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
8058            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
8059            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
8060        };
8061
8062        if !control_event_specs.is_empty() {
8063            let (outcome, reason) = match &query_result {
8064                Ok(_) => (crate::runtime::control_events::Outcome::Allowed, None),
8065                Err(err) => (control_event_outcome_for_error(err), Some(err.to_string())),
8066            };
8067            for spec in &control_event_specs {
8068                self.emit_control_event(
8069                    spec.kind,
8070                    outcome,
8071                    spec.action,
8072                    spec.resource.clone(),
8073                    reason.clone(),
8074                    spec.fields.clone(),
8075                )?;
8076            }
8077        }
8078
8079        if let (Some(plan), Ok(result)) = (&query_audit_plan, &query_result) {
8080            self.emit_query_audit(
8081                query,
8082                plan,
8083                query_audit_started.elapsed().as_millis() as u64,
8084                result,
8085            );
8086        }
8087
8088        // Decrypt Value::Secret columns in-place before caching, so
8089        // cached results match the post-decrypt shape and repeat
8090        // queries skip the per-row AES-GCM pass.
8091        let mut query_result = query_result;
8092        if let Ok(ref mut result) = query_result {
8093            if result.statement_type == "select" {
8094                self.apply_secret_decryption(result);
8095            }
8096        }
8097
8098        // Cache SELECT results for 30s.
8099        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
8100        // Large multi-row results (range scans, filtered scans) are rarely
8101        // repeated with the same literal values so the cache hit rate is near
8102        // zero while the clone cost (100 records × ~16 fields each) is high.
8103        // Aggregations (1 row) and point lookups (1 row) still benefit.
8104        if let Ok(ref result) = query_result {
8105            frame.write_result_cache(self, result, result_cache_scopes);
8106        }
8107
8108        query_result
8109    }
8110
8111    /// Snapshot of every registered materialized view's runtime
8112    /// state — feeds the `red.materialized_views` virtual table.
8113    /// Issue #583 slice 10.
8114    pub fn materialized_view_metadata(
8115        &self,
8116    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
8117        // Issue #595 slice 9c — `current_row_count` is now scraped
8118        // live from the backing collection rather than read from the
8119        // cache slot. Mirrors the slice-10 invariant on
8120        // `queue_pending_gauge` in #527: the live store is the source
8121        // of truth, the cache slot only carries last-refresh telemetry
8122        // (timing, error, refresh cadence).
8123        let store = self.inner.db.store();
8124        let mut entries = self.inner.materialized_views.read().metadata();
8125        for entry in &mut entries {
8126            if let Some(manager) = store.get_collection(&entry.name) {
8127                entry.current_row_count = manager.count() as u64;
8128            }
8129        }
8130        entries
8131    }
8132
8133    /// Drive scheduled refreshes for materialized views with a
8134    /// `REFRESH EVERY <duration>` clause. Called from the background
8135    /// scheduler thread (and from unit tests with a fake clock via
8136    /// `claim_due_at`). Each invocation atomically claims the set of
8137    /// due views (so two concurrent ticks never double-fire the same
8138    /// view) and runs each refresh through the standard execution
8139    /// path — failures are captured in `last_error` and the prior
8140    /// content stays intact. Issue #583 slice 10.
8141    /// Snapshot of every tracked retention sweeper state — feeds the
8142    /// three extra columns on `red.retention`. Issue #584 slice 12.
8143    pub(crate) fn retention_sweeper_snapshot(
8144        &self,
8145    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
8146        self.inner.retention_sweeper.read().snapshot()
8147    }
8148
8149    /// Drive one tick of the retention sweeper. Iterates collections
8150    /// with a retention policy set, physically deletes at most
8151    /// `batch_size` expired rows per collection, and records the
8152    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
8153    /// `red.retention` exposes. Called from the background sweeper
8154    /// thread; safe to invoke directly from tests with a small batch
8155    /// size to drain rows deterministically. Issue #584 slice 12.
8156    ///
8157    /// Deletes are issued as `DELETE FROM <collection> WHERE
8158    /// <ts_column> < <cutoff>` through the standard `execute_query`
8159    /// chokepoint so WAL participation and snapshot guards apply
8160    /// exactly as for a user-issued DELETE — replicas replay the
8161    /// sweeper's deletes via the same WAL stream with no special
8162    /// handling on the replication side.
8163    ///
8164    /// Batching is enforced by tightening the cutoff: if more than
8165    /// `batch_size` rows are expired, the cutoff is dropped to the
8166    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
8167    /// matches roughly `batch_size` rows; the remainder is reported
8168    /// as `current_rows_pending_sweep_estimate` and drained on the
8169    /// next tick.
8170    pub fn sweep_retention_tick(&self, batch_size: usize) {
8171        if batch_size == 0 {
8172            return;
8173        }
8174        let now_ms = std::time::SystemTime::now()
8175            .duration_since(std::time::UNIX_EPOCH)
8176            .map(|d| d.as_millis() as u64)
8177            .unwrap_or(0);
8178
8179        let store = self.inner.db.store();
8180        let collections = store.list_collections();
8181        for name in collections {
8182            let Some(contract) = self.inner.db.collection_contract(&name) else {
8183                continue;
8184            };
8185            let Some(retention_ms) = contract.retention_duration_ms else {
8186                continue;
8187            };
8188            let Some(ts_column) =
8189                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
8190            else {
8191                continue;
8192            };
8193            let Some(manager) = store.get_collection(&name) else {
8194                continue;
8195            };
8196            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
8197
8198            // Single pass: collect expired timestamps. We keep the
8199            // full Vec rather than a bounded heap because the partial
8200            // sort below is the simplest correct way to find the
8201            // batch-th oldest; for the slice's "1000-row default
8202            // batch" target this is bounded enough for production
8203            // operation, and the alternative (in-place heap of size
8204            // batch+1) is a follow-up optimisation.
8205            let mut expired_ts: Vec<i64> = Vec::new();
8206            manager.for_each_entity(|entity| {
8207                let ts = match ts_column.as_str() {
8208                    "created_at" => Some(entity.created_at as i64),
8209                    "updated_at" => Some(entity.updated_at as i64),
8210                    other => entity
8211                        .data
8212                        .as_row()
8213                        .and_then(|row| row.get_field(other))
8214                        .and_then(|v| match v {
8215                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
8216                            crate::storage::schema::Value::Timestamp(t) => {
8217                                Some(t.saturating_mul(1_000))
8218                            }
8219                            crate::storage::schema::Value::BigInt(t) => Some(*t),
8220                            crate::storage::schema::Value::UnsignedInteger(t) => {
8221                                i64::try_from(*t).ok()
8222                            }
8223                            crate::storage::schema::Value::Integer(t) => Some(*t),
8224                            _ => None,
8225                        }),
8226                };
8227                if let Some(t) = ts {
8228                    if t < cutoff {
8229                        expired_ts.push(t);
8230                    }
8231                }
8232                true
8233            });
8234
8235            let total_expired = expired_ts.len() as u64;
8236            if total_expired == 0 {
8237                self.inner
8238                    .retention_sweeper
8239                    .write()
8240                    .record_tick(&name, 0, 0, now_ms);
8241                continue;
8242            }
8243
8244            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
8245                (cutoff, 0u64)
8246            } else {
8247                // Tighten the cutoff to the (batch_size)-th oldest
8248                // expired timestamp + 1 so DELETE matches roughly
8249                // `batch_size` rows.
8250                expired_ts.sort_unstable();
8251                let nth = expired_ts[batch_size - 1];
8252                (
8253                    nth.saturating_add(1),
8254                    total_expired.saturating_sub(batch_size as u64),
8255                )
8256            };
8257
8258            let stmt = format!(
8259                "DELETE FROM {} WHERE {} < {}",
8260                name, ts_column, effective_cutoff
8261            );
8262            let deleted = match self.execute_query(&stmt) {
8263                Ok(r) => r.affected_rows,
8264                Err(_) => 0,
8265            };
8266
8267            self.inner
8268                .retention_sweeper
8269                .write()
8270                .record_tick(&name, deleted, pending, now_ms);
8271        }
8272    }
8273
8274    pub fn refresh_due_materialized_views(&self) {
8275        let due = {
8276            let mut cache = self.inner.materialized_views.write();
8277            cache.claim_due_at(std::time::Instant::now())
8278        };
8279        for name in due {
8280            // Round-trip through `execute_query` (rather than the
8281            // prepared-statement `execute_query_expr` fast path, which
8282            // explicitly rejects DDL/maintenance statements). Failures
8283            // are captured inside the RefreshMaterializedView handler
8284            // via `record_refresh_failure`; the scheduler ignores the
8285            // Result so one bad view doesn't halt the loop.
8286            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
8287            let _ = self.execute_query(&stmt);
8288        }
8289    }
8290
8291    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
8292    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
8293    /// calls pay zero parse + cache overhead.
8294    ///
8295    /// Applies secret decryption on SELECT results, identical to `execute_query`.
8296    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
8297        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
8298        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
8299        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
8300        // whose `tq.table` matches a registered view with the view's
8301        // underlying query. Safe to call even when no views are registered.
8302        let expr = self.rewrite_view_refs(expr);
8303
8304        self.validate_model_operations_before_auth(&expr)?;
8305        // Granular RBAC privilege check. Runs before dispatch so a
8306        // denied caller never reaches storage. Fail-closed: any error
8307        // resolving the action / resource produces PermissionDenied.
8308        if let Err(err) = self.check_query_privilege(&expr) {
8309            return Err(RedDBError::Query(format!("permission denied: {err}")));
8310        }
8311
8312        let statement = query_expr_name(&expr);
8313        let mode = detect_mode(statement);
8314        let query_str = statement;
8315
8316        let result = self.dispatch_expr(expr, query_str, mode)?;
8317        let mut r = result;
8318        if r.statement_type == "select" {
8319            self.apply_secret_decryption(&mut r);
8320        }
8321        Ok(r)
8322    }
8323
8324    pub(super) fn validate_model_operations_before_auth(
8325        &self,
8326        expr: &QueryExpr,
8327    ) -> RedDBResult<()> {
8328        use crate::catalog::CollectionModel;
8329        use crate::runtime::ddl::polymorphic_resolver;
8330        use crate::storage::query::ast::KvCommand;
8331
8332        let system_schema_target = match expr {
8333            QueryExpr::DropTable(q) => Some(q.name.as_str()),
8334            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
8335            QueryExpr::DropVector(q) => Some(q.name.as_str()),
8336            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
8337            QueryExpr::DropKv(q) => Some(q.name.as_str()),
8338            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
8339            QueryExpr::Truncate(q) => Some(q.name.as_str()),
8340            _ => None,
8341        };
8342        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
8343            return Err(RedDBError::Query("system schema is read-only".to_string()));
8344        }
8345
8346        let expected = match expr {
8347            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
8348            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
8349            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
8350            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
8351            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
8352            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
8353            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
8354            QueryExpr::KvCommand(cmd) => {
8355                let (collection, model) = match cmd {
8356                    KvCommand::Put {
8357                        collection, model, ..
8358                    }
8359                    | KvCommand::Get {
8360                        collection, model, ..
8361                    }
8362                    | KvCommand::Incr {
8363                        collection, model, ..
8364                    }
8365                    | KvCommand::Cas {
8366                        collection, model, ..
8367                    }
8368                    | KvCommand::Delete {
8369                        collection, model, ..
8370                    } => (collection.as_str(), *model),
8371                    KvCommand::Rotate { collection, .. }
8372                    | KvCommand::History { collection, .. }
8373                    | KvCommand::List { collection, .. }
8374                    | KvCommand::Purge { collection, .. } => {
8375                        (collection.as_str(), CollectionModel::Vault)
8376                    }
8377                    KvCommand::InvalidateTags { collection, .. } => {
8378                        (collection.as_str(), CollectionModel::Kv)
8379                    }
8380                    KvCommand::Watch {
8381                        collection, model, ..
8382                    } => (collection.as_str(), *model),
8383                    KvCommand::Unseal { collection, .. } => {
8384                        (collection.as_str(), CollectionModel::Vault)
8385                    }
8386                };
8387                Some((collection, model))
8388            }
8389            QueryExpr::ConfigCommand(cmd) => {
8390                self.validate_config_command_before_auth(cmd)?;
8391                None
8392            }
8393            _ => None,
8394        };
8395
8396        let Some((name, expected_model)) = expected else {
8397            return Ok(());
8398        };
8399        let snapshot = self.inner.db.catalog_model_snapshot();
8400        let Some(actual_model) = snapshot
8401            .collections
8402            .iter()
8403            .find(|collection| collection.name == name)
8404            .map(|collection| collection.declared_model.unwrap_or(collection.model))
8405        else {
8406            return Ok(());
8407        };
8408        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
8409    }
8410
8411    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
8412    /// `tq.table` matches a registered view name with the view's stored
8413    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
8414    /// resolves correctly. Pure operation — no side effects.
8415    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
8416        // Fast path: no views registered → return original expression.
8417        if self.inner.views.read().is_empty() {
8418            return expr;
8419        }
8420        self.rewrite_view_refs_inner(expr)
8421    }
8422
8423    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
8424        use crate::storage::query::ast::{Filter, TableSource};
8425        match expr {
8426            QueryExpr::Table(mut tq) => {
8427                // 1. If the TableSource is a subquery, recurse into it so
8428                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
8429                //    The legacy `table` field (set to a synthetic
8430                //    "__subq_NNNN" sentinel) stays as-is so callers that
8431                //    read it keep compiling.
8432                if let Some(TableSource::Subquery(body)) = tq.source.take() {
8433                    tq.source = Some(TableSource::Subquery(Box::new(
8434                        self.rewrite_view_refs_inner(*body),
8435                    )));
8436                    return QueryExpr::Table(tq);
8437                }
8438
8439                // 2. Restore the source field (took it above for match).
8440                // When the source was `None` or `TableSource::Name(_)`, the
8441                // real lookup key is `tq.table` — check the view registry.
8442                let maybe_view = {
8443                    let views = self.inner.views.read();
8444                    views.get(&tq.table).cloned()
8445                };
8446                let Some(view) = maybe_view else {
8447                    return QueryExpr::Table(tq);
8448                };
8449
8450                // Issue #594 slice 9b — materialized views are read
8451                // from their backing collection, not by substituting
8452                // the body. Returning the TableQuery as-is lets the
8453                // normal table-read path resolve `SELECT FROM v`
8454                // against the collection provisioned at CREATE time.
8455                if view.materialized {
8456                    return QueryExpr::Table(tq);
8457                }
8458
8459                // Recurse into the view body — views may reference other
8460                // views. The recursion yields the final QueryExpr we need
8461                // to merge the outer's filter / limit / offset into.
8462                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
8463
8464                // Phase 5: when the body is a Table we merge the outer
8465                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
8466                // views filter recursively. Non-table bodies (Search,
8467                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
8468                // with an outer Table query today — return the body
8469                // verbatim; outer predicates are lost. Full projection
8470                // merge lands in Phase 5.2.
8471                match inner_expr {
8472                    QueryExpr::Table(mut inner_tq) => {
8473                        if let Some(outer_filter) = tq.filter.take() {
8474                            inner_tq.filter = Some(match inner_tq.filter.take() {
8475                                Some(existing) => {
8476                                    Filter::And(Box::new(existing), Box::new(outer_filter))
8477                                }
8478                                None => outer_filter,
8479                            });
8480                            // Keep the `Expr` form in lock-step with the
8481                            // merged `Filter`. The executor prefers
8482                            // `where_expr` and nulls `filter` when it is
8483                            // present (see `execute_query_inner`), so a
8484                            // stacked view whose outer predicate was only
8485                            // merged into `filter` would silently drop that
8486                            // predicate at eval time (#635).
8487                            inner_tq.where_expr = inner_tq
8488                                .filter
8489                                .as_ref()
8490                                .map(crate::storage::query::sql_lowering::filter_to_expr);
8491                        }
8492                        if let Some(outer_limit) = tq.limit {
8493                            inner_tq.limit = Some(match inner_tq.limit {
8494                                Some(existing) => existing.min(outer_limit),
8495                                None => outer_limit,
8496                            });
8497                        }
8498                        if let Some(outer_offset) = tq.offset {
8499                            inner_tq.offset = Some(match inner_tq.offset {
8500                                Some(existing) => existing + outer_offset,
8501                                None => outer_offset,
8502                            });
8503                        }
8504                        QueryExpr::Table(inner_tq)
8505                    }
8506                    other => other,
8507                }
8508            }
8509            QueryExpr::Join(mut jq) => {
8510                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
8511                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
8512                QueryExpr::Join(jq)
8513            }
8514            // Other variants don't carry nested QueryExpr that can reference
8515            // a view by table name. Return as-is.
8516            other => other,
8517        }
8518    }
8519
8520    /// Internal dispatch: route a `QueryExpr` to the appropriate executor.
8521    /// Shared by `execute_query` (after parse/cache) and `execute_query_expr`
8522    /// (direct call from prepared-statement handler).
8523    fn authorize_relational_table_select(
8524        &self,
8525        mut table: TableQuery,
8526        frame: &dyn super::statement_frame::ReadFrame,
8527    ) -> RedDBResult<Option<TableQuery>> {
8528        if let Some(TableSource::Subquery(inner)) = table.source.take() {
8529            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
8530            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
8531            return Ok(Some(table));
8532        }
8533
8534        self.check_table_column_projection_authz(&table, frame)?;
8535
8536        if self.inner.rls_enabled_tables.read().contains(&table.table) {
8537            return Ok(inject_rls_filters(self, frame, table));
8538        }
8539
8540        Ok(Some(table))
8541    }
8542
8543    fn authorize_relational_join_select(
8544        &self,
8545        mut join: JoinQuery,
8546        frame: &dyn super::statement_frame::ReadFrame,
8547    ) -> RedDBResult<Option<JoinQuery>> {
8548        self.check_join_column_projection_authz(&join, frame)?;
8549        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
8550        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
8551        Ok(inject_rls_into_join(self, frame, join))
8552    }
8553
8554    fn authorize_relational_join_child(
8555        &self,
8556        expr: QueryExpr,
8557        frame: &dyn super::statement_frame::ReadFrame,
8558    ) -> RedDBResult<QueryExpr> {
8559        match expr {
8560            QueryExpr::Table(mut table) => {
8561                if let Some(TableSource::Subquery(inner)) = table.source.take() {
8562                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
8563                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
8564                }
8565                Ok(QueryExpr::Table(table))
8566            }
8567            QueryExpr::Join(join) => self
8568                .authorize_relational_join_select(join, frame)?
8569                .map(QueryExpr::Join)
8570                .ok_or_else(|| {
8571                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
8572                }),
8573            other => Ok(other),
8574        }
8575    }
8576
8577    fn authorize_relational_select_expr(
8578        &self,
8579        expr: QueryExpr,
8580        frame: &dyn super::statement_frame::ReadFrame,
8581    ) -> RedDBResult<QueryExpr> {
8582        match expr {
8583            QueryExpr::Table(table) => self
8584                .authorize_relational_table_select(table, frame)?
8585                .map(QueryExpr::Table)
8586                .ok_or_else(|| {
8587                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
8588                }),
8589            QueryExpr::Join(join) => self
8590                .authorize_relational_join_select(join, frame)?
8591                .map(QueryExpr::Join)
8592                .ok_or_else(|| {
8593                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
8594                }),
8595            other => Ok(other),
8596        }
8597    }
8598
8599    fn check_table_column_projection_authz(
8600        &self,
8601        table: &TableQuery,
8602        frame: &dyn super::statement_frame::ReadFrame,
8603    ) -> RedDBResult<()> {
8604        let Some((username, role)) = frame.identity() else {
8605            return Ok(());
8606        };
8607        let Some(auth_store) = self.inner.auth_store.read().clone() else {
8608            return Ok(());
8609        };
8610
8611        let columns = self.resolved_table_projection_columns(table)?;
8612        let request = ColumnAccessRequest::select(table.table.clone(), columns);
8613        let principal = UserId::from_parts(frame.effective_scope(), username);
8614        let ctx = runtime_iam_context(
8615            role,
8616            frame.effective_scope(),
8617            auth_store.principal_is_system_owned(&principal),
8618        );
8619        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
8620        if outcome.allowed() {
8621            return Ok(());
8622        }
8623
8624        if let Some(denied) = outcome.first_denied_column() {
8625            return Err(RedDBError::Query(format!(
8626                "permission denied: principal=`{username}` cannot select column `{}`",
8627                denied.resource.name
8628            )));
8629        }
8630        Err(RedDBError::Query(format!(
8631            "permission denied: principal=`{username}` cannot select table `{}`",
8632            table.table
8633        )))
8634    }
8635
8636    fn check_join_column_projection_authz(
8637        &self,
8638        join: &JoinQuery,
8639        frame: &dyn super::statement_frame::ReadFrame,
8640    ) -> RedDBResult<()> {
8641        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
8642        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
8643        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
8644
8645        for (table, columns) in by_table {
8646            let query = TableQuery {
8647                table,
8648                source: None,
8649                alias: None,
8650                select_items: Vec::new(),
8651                columns: columns.into_iter().map(Projection::Column).collect(),
8652                where_expr: None,
8653                filter: None,
8654                group_by_exprs: Vec::new(),
8655                group_by: Vec::new(),
8656                having_expr: None,
8657                having: None,
8658                order_by: Vec::new(),
8659                limit: None,
8660                limit_param: None,
8661                offset: None,
8662                offset_param: None,
8663                expand: None,
8664                as_of: None,
8665                sessionize: None,
8666            };
8667            self.check_table_column_projection_authz(&query, frame)?;
8668        }
8669        Ok(())
8670    }
8671
8672    fn collect_join_projection_columns(
8673        &self,
8674        join: &JoinQuery,
8675        projections: &[Projection],
8676        out: &mut HashMap<String, BTreeSet<String>>,
8677    ) -> RedDBResult<()> {
8678        let left = table_side_context(join.left.as_ref());
8679        let right = table_side_context(join.right.as_ref());
8680
8681        if projections
8682            .iter()
8683            .any(|projection| matches!(projection, Projection::All))
8684        {
8685            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
8686                out.entry(side.table.clone())
8687                    .or_default()
8688                    .extend(self.table_all_projection_columns(&side.table)?);
8689            }
8690            return Ok(());
8691        }
8692
8693        for projection in projections {
8694            collect_projection_columns_for_join_side(
8695                projection,
8696                left.as_ref(),
8697                right.as_ref(),
8698                out,
8699            )?;
8700        }
8701        Ok(())
8702    }
8703
8704    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
8705        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
8706        if projections
8707            .iter()
8708            .any(|projection| matches!(projection, Projection::All))
8709        {
8710            return self.table_all_projection_columns(&table.table);
8711        }
8712
8713        let mut columns = BTreeSet::new();
8714        for projection in &projections {
8715            collect_projection_columns_for_table(
8716                projection,
8717                &table.table,
8718                table.alias.as_deref(),
8719                &mut columns,
8720            );
8721        }
8722        Ok(columns.into_iter().collect())
8723    }
8724
8725    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
8726        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
8727            let columns: Vec<String> = contract
8728                .declared_columns
8729                .iter()
8730                .map(|column| column.name.clone())
8731                .collect();
8732            if !columns.is_empty() {
8733                return Ok(columns);
8734            }
8735        }
8736
8737        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
8738        Ok(records
8739            .first()
8740            .map(|record| {
8741                record
8742                    .column_names()
8743                    .into_iter()
8744                    .map(|column| column.to_string())
8745                    .collect()
8746            })
8747            .unwrap_or_default())
8748    }
8749
8750    fn resolve_table_expr_subqueries(
8751        &self,
8752        mut table: TableQuery,
8753        frame: &dyn super::statement_frame::ReadFrame,
8754    ) -> RedDBResult<TableQuery> {
8755        // Only a `Subquery` source needs recursive resolution. `.take()`
8756        // would otherwise drop a `Name` / `Function` source on the floor
8757        // (the `if let` skips the body but the take already cleared it),
8758        // which silently broke `SELECT * FROM components(g)` — the TVF
8759        // dispatch downstream keys off `TableSource::Function` and never
8760        // fired. Restore any non-subquery source unchanged (issue #795).
8761        match table.source.take() {
8762            Some(TableSource::Subquery(inner)) => {
8763                let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
8764                table.source = Some(TableSource::Subquery(Box::new(inner)));
8765            }
8766            other => table.source = other,
8767        }
8768
8769        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
8770        for item in &mut table.select_items {
8771            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
8772                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
8773            }
8774        }
8775        if let Some(where_expr) = table.where_expr.take() {
8776            table.where_expr =
8777                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
8778            table.filter = None;
8779        }
8780        if let Some(having_expr) = table.having_expr.take() {
8781            table.having_expr =
8782                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
8783            table.having = None;
8784        }
8785        for expr in &mut table.group_by_exprs {
8786            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
8787        }
8788        for clause in &mut table.order_by {
8789            if let Some(expr) = clause.expr.take() {
8790                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
8791            }
8792        }
8793        Ok(table)
8794    }
8795
8796    fn resolve_select_expr_subqueries(
8797        &self,
8798        expr: QueryExpr,
8799        frame: &dyn super::statement_frame::ReadFrame,
8800    ) -> RedDBResult<QueryExpr> {
8801        match expr {
8802            QueryExpr::Table(table) => self
8803                .resolve_table_expr_subqueries(table, frame)
8804                .map(QueryExpr::Table),
8805            QueryExpr::Join(mut join) => {
8806                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
8807                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
8808                Ok(QueryExpr::Join(join))
8809            }
8810            other => Ok(other),
8811        }
8812    }
8813
8814    fn resolve_expr_subqueries(
8815        &self,
8816        expr: crate::storage::query::ast::Expr,
8817        outer_scopes: &[String],
8818        frame: &dyn super::statement_frame::ReadFrame,
8819    ) -> RedDBResult<crate::storage::query::ast::Expr> {
8820        use crate::storage::query::ast::Expr;
8821
8822        match expr {
8823            Expr::Subquery { query, span } => {
8824                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
8825                if values.len() > 1 {
8826                    return Err(RedDBError::Query(
8827                        "scalar subquery returned more than one row".to_string(),
8828                    ));
8829                }
8830                Ok(Expr::Literal {
8831                    value: values.into_iter().next().unwrap_or(Value::Null),
8832                    span,
8833                })
8834            }
8835            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
8836                op,
8837                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
8838                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
8839                span,
8840            }),
8841            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
8842                op,
8843                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
8844                span,
8845            }),
8846            Expr::Cast {
8847                inner,
8848                target,
8849                span,
8850            } => Ok(Expr::Cast {
8851                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
8852                target,
8853                span,
8854            }),
8855            Expr::FunctionCall { name, args, span } => {
8856                let args = args
8857                    .into_iter()
8858                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
8859                    .collect::<RedDBResult<Vec<_>>>()?;
8860                Ok(Expr::FunctionCall { name, args, span })
8861            }
8862            Expr::Case {
8863                branches,
8864                else_,
8865                span,
8866            } => {
8867                let branches = branches
8868                    .into_iter()
8869                    .map(|(cond, value)| {
8870                        Ok((
8871                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
8872                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
8873                        ))
8874                    })
8875                    .collect::<RedDBResult<Vec<_>>>()?;
8876                let else_ = else_
8877                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
8878                    .transpose()?
8879                    .map(Box::new);
8880                Ok(Expr::Case {
8881                    branches,
8882                    else_,
8883                    span,
8884                })
8885            }
8886            Expr::IsNull {
8887                operand,
8888                negated,
8889                span,
8890            } => Ok(Expr::IsNull {
8891                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
8892                negated,
8893                span,
8894            }),
8895            Expr::InList {
8896                target,
8897                values,
8898                negated,
8899                span,
8900            } => {
8901                let target =
8902                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
8903                let mut resolved = Vec::new();
8904                for value in values {
8905                    if let Expr::Subquery { query, .. } = value {
8906                        resolved.extend(
8907                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
8908                                .into_iter()
8909                                .map(Expr::lit),
8910                        );
8911                    } else {
8912                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
8913                    }
8914                }
8915                Ok(Expr::InList {
8916                    target,
8917                    values: resolved,
8918                    negated,
8919                    span,
8920                })
8921            }
8922            Expr::Between {
8923                target,
8924                low,
8925                high,
8926                negated,
8927                span,
8928            } => Ok(Expr::Between {
8929                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
8930                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
8931                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
8932                negated,
8933                span,
8934            }),
8935            other => Ok(other),
8936        }
8937    }
8938
8939    fn execute_expr_subquery_values(
8940        &self,
8941        subquery: crate::storage::query::ast::ExprSubquery,
8942        outer_scopes: &[String],
8943        frame: &dyn super::statement_frame::ReadFrame,
8944    ) -> RedDBResult<Vec<Value>> {
8945        let query = *subquery.query;
8946        if query_references_outer_scope(&query, outer_scopes) {
8947            return Err(RedDBError::Query(
8948                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
8949            ));
8950        }
8951        let query = self.rewrite_view_refs(query);
8952        let query = self.resolve_select_expr_subqueries(query, frame)?;
8953        let query = self.authorize_relational_select_expr(query, frame)?;
8954        let result = match query {
8955            QueryExpr::Table(table) => {
8956                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
8957            }
8958            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
8959            other => {
8960                return Err(RedDBError::Query(format!(
8961                    "expression subquery must be a SELECT query, got {}",
8962                    query_expr_name(&other)
8963                )))
8964            }
8965        };
8966        first_column_values(result)
8967    }
8968
    /// Routes an already-parsed `QueryExpr` to the executor that handles it and
    /// wraps the outcome in a `RuntimeQueryResult`.
    ///
    /// `query_str` is copied into the result's `query` field (and forwarded to
    /// the DML / search / ask executors); `mode` is copied into the result's
    /// `mode` field. The `statement` label is computed once from the expression
    /// via `query_expr_name` before the expression is consumed by the `match`.
    ///
    /// Routing summary:
    /// * `Graph` / `Path` — rejected with a query error.
    /// * `Table` — expression subqueries are resolved first; then, in order:
    ///   table-valued function source, inline-graph function source, virtual
    ///   table, analytics view, and finally the authorized table scan.
    /// * `Join` — authorized, then executed as a join.
    /// * `Vector` / `Hybrid` — executed directly.
    /// * `Insert` / `Update` / `Delete` — rejected with `READ_ONLY_ERROR` when
    ///   the target is a virtual table, otherwise executed inside
    ///   `with_deferred_store_wal_for_dml`.
    /// * `SearchCommand` / `Ask` — forwarded to their executors.
    /// * Anything else — rejected with a query error naming the statement.
    ///
    /// # Errors
    ///
    /// Returns the rejection errors listed above and propagates any error from
    /// subquery resolution, authorization, or the selected executor.
    fn dispatch_expr(
        &self,
        expr: QueryExpr,
        query_str: &str,
        mode: QueryMode,
    ) -> RedDBResult<RuntimeQueryResult> {
        // Capture the statement label before `expr` is moved into the match.
        let statement = query_expr_name(&expr);
        match expr {
            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
                // Graph queries are not cacheable as prepared statements.
                Err(RedDBError::Query(
                    "graph queries cannot be used as prepared statements".to_string(),
                ))
            }
            QueryExpr::Table(table) => {
                // The scope doubles as the `ReadFrame` passed to every helper
                // in this arm.
                let scope = self.ai_scope();
                let table = self.resolve_table_expr_subqueries(
                    table,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?;
                // Table-valued functions (e.g. components(g)) dispatch to a
                // read-only executor before any catalog/virtual-table routing
                // (issue #795).
                if let Some(TableSource::Function {
                    name,
                    args,
                    named_args,
                }) = table.source.clone()
                {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-graph-tvf",
                        result: self.execute_table_function(&name, &args, &named_args)?,
                        affected_rows: 0,
                        statement_type: "select",
                        bookmark: None,
                    });
                }
                // Inline-graph TVF (issue #799) on the prepared-statement /
                // direct-expr path. Result caching is wired on the
                // `execute_query_inner` path; here we just compute and return.
                if let Some(TableSource::InlineGraphFunction {
                    name,
                    nodes,
                    edges,
                    named_args,
                }) = table.source.clone()
                {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-graph-tvf-inline",
                        result: self.execute_inline_graph_function(
                            &name,
                            &nodes,
                            &edges,
                            &named_args,
                        )?,
                        affected_rows: 0,
                        statement_type: "select",
                        bookmark: None,
                    });
                }
                // Virtual tables are answered by `red_schema::red_query`
                // instead of a table scan.
                if super::red_schema::is_virtual_table(&table.table) {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-red-schema",
                        result: super::red_schema::red_query(
                            self,
                            &table.table,
                            &table,
                            &scope as &dyn super::statement_frame::ReadFrame,
                        )?,
                        affected_rows: 0,
                        statement_type: "select",
                        bookmark: None,
                    });
                }
                // `<graph>.<output>` analytics virtual view (issue #800).
                if let Some(view_result) = self.try_resolve_analytics_view(
                    &table,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )? {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-graph-analytics-view",
                        result: view_result,
                        affected_rows: 0,
                        statement_type: "select",
                        bookmark: None,
                    });
                }
                // Authorization may hand back a (possibly rewritten) query to
                // run, or `None`, in which case an empty result is returned
                // without touching the executor.
                let Some(table_with_rls) = self.authorize_relational_table_select(
                    table,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?
                else {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-table-rls",
                        result: crate::storage::query::unified::UnifiedResult::empty(),
                        affected_rows: 0,
                        statement_type: "select",
                        bookmark: None,
                    });
                };
                Ok(RuntimeQueryResult {
                    query: query_str.to_string(),
                    mode,
                    statement,
                    engine: "runtime-table",
                    result: execute_runtime_table_query(
                        &self.inner.db,
                        &table_with_rls,
                        Some(&self.inner.index_store),
                    )?,
                    affected_rows: 0,
                    statement_type: "select",
                    bookmark: None,
                })
            }
            QueryExpr::Join(join) => {
                // NOTE(review): unlike the Table arm, this arm does not resolve
                // expression subqueries before authorization — confirm whether
                // that is handled elsewhere for joins.
                let scope = self.ai_scope();
                // Same `None` => empty-result contract as the Table arm.
                let Some(join_with_rls) = self.authorize_relational_join_select(
                    join,
                    &scope as &dyn super::statement_frame::ReadFrame,
                )?
                else {
                    return Ok(RuntimeQueryResult {
                        query: query_str.to_string(),
                        mode,
                        statement,
                        engine: "runtime-join-rls",
                        result: crate::storage::query::unified::UnifiedResult::empty(),
                        affected_rows: 0,
                        statement_type: "select",
                        bookmark: None,
                    });
                };
                Ok(RuntimeQueryResult {
                    query: query_str.to_string(),
                    mode,
                    statement,
                    engine: "runtime-join",
                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
                    affected_rows: 0,
                    statement_type: "select",
                    bookmark: None,
                })
            }
            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
                query: query_str.to_string(),
                mode,
                statement,
                engine: "runtime-vector",
                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
                affected_rows: 0,
                statement_type: "select",
                bookmark: None,
            }),
            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
                query: query_str.to_string(),
                mode,
                statement,
                engine: "runtime-hybrid",
                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
                affected_rows: 0,
                statement_type: "select",
                bookmark: None,
            }),
            // The guarded DML arms must stay ahead of the unguarded ones so
            // writes against virtual tables are rejected before execution.
            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
                Err(RedDBError::Query(
                    super::red_schema::READ_ONLY_ERROR.to_string(),
                ))
            }
            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
                Err(RedDBError::Query(
                    super::red_schema::READ_ONLY_ERROR.to_string(),
                ))
            }
            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
                Err(RedDBError::Query(
                    super::red_schema::READ_ONLY_ERROR.to_string(),
                ))
            }
            // Regular DML runs inside `with_deferred_store_wal_for_dml`; the
            // first argument is the matching `*_may_emit_events` answer.
            QueryExpr::Insert(ref insert) => self
                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
                    self.execute_insert(query_str, insert)
                }),
            QueryExpr::Update(ref update) => self
                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
                    self.execute_update(query_str, update)
                }),
            QueryExpr::Delete(ref delete) => self
                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
                    self.execute_delete(query_str, delete)
                }),
            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
            // Every remaining statement kind is unsupported on this path.
            _ => Err(RedDBError::Query(format!(
                "prepared-statement execution does not support {statement} statements"
            ))),
        }
    }
9182
9183    /// Dispatch a graph-collection table-valued function call in FROM
9184    /// position (e.g. `SELECT * FROM components(g)`).
9185    ///
9186    /// Validates the function name and arity here, materializes the whole
9187    /// active graph read-only, then runs the algorithm via the shared
9188    /// `dispatch_graph_algorithm` path. Never mutates the catalog or store.
9189    fn execute_table_function(
9190        &self,
9191        name: &str,
9192        args: &[String],
9193        named_args: &[(String, f64)],
9194    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
9195        if !is_graph_tvf_name(name) {
9196            return Err(RedDBError::Query(format!("unknown table function: {name}")));
9197        }
9198        // Every graph-collection TVF takes exactly one graph argument.
9199        if args.len() != 1 {
9200            return Err(RedDBError::Query(format!(
9201                "table function '{name}' takes exactly 1 graph argument, got {}",
9202                args.len()
9203            )));
9204        }
9205
9206        // Read-only materialization of the full active graph. Passing `None`
9207        // for the projection uses the full graph store. Like #795/#796, the
9208        // v0 form runs over the whole graph store regardless of the collection
9209        // argument value. Materialization never mutates any store.
9210        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
9211        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
9212    }
9213
9214    /// Dispatch an inline-graph table-valued function call in FROM position
9215    /// (e.g. `SELECT * FROM components(nodes => (…), edges => (…))`, issue
9216    /// #799).
9217    ///
9218    /// Materializes the two subqueries through the normal read path (so RLS,
9219    /// column authz, and MVCC visibility all apply), constructs the abstract
9220    /// graph — the first column of `nodes` is the node id; the first two-or-
9221    /// three columns of `edges` are `(source, target [, weight])` — then runs
9222    /// the same algorithm path used by the graph-collection form. Read-only.
9223    fn execute_inline_graph_function(
9224        &self,
9225        name: &str,
9226        nodes_query: &QueryExpr,
9227        edges_query: &QueryExpr,
9228        named_args: &[(String, f64)],
9229    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
9230        if !is_graph_tvf_name(name) {
9231            return Err(RedDBError::Query(format!("unknown table function: {name}")));
9232        }
9233
9234        let node_result = self.execute_query_expr(nodes_query.clone())?.result;
9235        let nodes = inline_node_ids(name, &node_result)?;
9236
9237        let edge_result = self.execute_query_expr(edges_query.clone())?.result;
9238        let edges = inline_edges(name, &edge_result)?;
9239
9240        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
9241    }
9242
9243    /// Materialize the whole active graph read-only into the abstract
9244    /// `(nodes, edges)` inputs the pure graph algorithms consume.
9245    fn materialize_whole_graph_abstract(
9246        &self,
9247    ) -> RedDBResult<(
9248        Vec<String>,
9249        Vec<(
9250            String,
9251            String,
9252            crate::storage::engine::graph_algorithms::Weight,
9253        )>,
9254    )> {
9255        use crate::storage::engine::graph_algorithms;
9256
9257        let graph = super::graph_dsl::materialize_graph_with_projection(
9258            self.inner.db.store().as_ref(),
9259            None,
9260        )?;
9261        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
9262        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
9263            .iter_all_edges()
9264            .into_iter()
9265            .map(|e| (e.source_id, e.target_id, e.weight))
9266            .collect();
9267        Ok((nodes, edges))
9268    }
9269
9270    /// Resolve a `<graph>.<output>` analytics virtual view (issue #800).
9271    ///
9272    /// Returns `Ok(None)` when `table` is not an analytics view — either the
9273    /// name is not dotted, a real collection of that exact name exists (a real
9274    /// collection always wins; no shadowing), the suffix is not a recognised
9275    /// analytics output, or the parent is not a graph. Returns `Ok(Some(_))`
9276    /// with the freshly computed result when it does resolve, and an error when
9277    /// the parent graph exists but the output is not enabled, a declared
9278    /// algorithm is unsupported, or the parent collection's policy denies the
9279    /// read.
9280    ///
9281    /// The view is recomputed on every call (no result-cache write) so it
9282    /// always reflects the current graph data, satisfying the on-demand
9283    /// recompute contract for this slice.
9284    fn try_resolve_analytics_view(
9285        &self,
9286        table: &TableQuery,
9287        frame: &dyn super::statement_frame::ReadFrame,
9288    ) -> RedDBResult<Option<crate::storage::query::unified::UnifiedResult>> {
9289        let full = table.table.as_str();
9290        let Some(dot) = full.rfind('.') else {
9291            return Ok(None);
9292        };
9293        // A real collection literally named `g.communities` always wins.
9294        if self.inner.db.store().get_collection(full).is_some() {
9295            return Ok(None);
9296        }
9297        let graph_name = &full[..dot];
9298        let output_name = &full[dot + 1..];
9299        let Some(output) = crate::catalog::AnalyticsOutput::from_str(output_name) else {
9300            return Ok(None);
9301        };
9302
9303        let contracts = self.inner.db.collection_contracts();
9304        let Some(contract) = contracts.iter().find(|c| c.name == graph_name) else {
9305            return Ok(None);
9306        };
9307        if contract.declared_model != crate::catalog::CollectionModel::Graph {
9308            return Ok(None);
9309        }
9310        let Some(view) = contract
9311            .analytics_config
9312            .iter()
9313            .find(|view| view.output == output)
9314        else {
9315            // The parent graph exists but this output was not declared — a
9316            // clear error beats the misleading "collection not found".
9317            return Err(RedDBError::Query(format!(
9318                "analytics output '{output_name}' is not enabled on graph '{graph_name}'; declare it with WITH ANALYTICS (...)"
9319            )));
9320        };
9321
9322        // Policy inheritance (AC5): route through the parent graph collection's
9323        // read authorization. A policy or RLS rule that denies the parent
9324        // denies its analytics views transitively.
9325        let parent_query = TableQuery::new(graph_name);
9326        if self
9327            .authorize_relational_table_select(parent_query, frame)?
9328            .is_none()
9329        {
9330            return Err(RedDBError::Query(format!(
9331                "permission denied: policy on graph '{graph_name}' denies analytics view '{output_name}'"
9332            )));
9333        }
9334
9335        let (algorithm, named_args) = analytics_view_algorithm(graph_name, view)?;
9336        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
9337        let result = self.dispatch_graph_algorithm(&algorithm, nodes, edges, &named_args)?;
9338        Ok(Some(result))
9339    }
9340
9341    /// Shared algorithm dispatch over abstract `(nodes, edges)` inputs.
9342    ///
9343    /// Both the graph-collection form and the inline-graph form route here so
9344    /// named-argument validation and the projected row shape stay identical
9345    /// across the two signatures (issue #799). Projects each algorithm's
9346    /// native output shape.
9347    fn dispatch_graph_algorithm(
9348        &self,
9349        name: &str,
9350        nodes: Vec<String>,
9351        edges: Vec<(
9352            String,
9353            String,
9354            crate::storage::engine::graph_algorithms::Weight,
9355        )>,
9356        named_args: &[(String, f64)],
9357    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
9358        use crate::storage::engine::graph_algorithms;
9359        use crate::storage::query::unified::UnifiedResult;
9360        use crate::storage::schema::Value;
9361
9362        if name.eq_ignore_ascii_case("components") {
9363            reject_named_args(name, named_args)?;
9364            let assignment = graph_algorithms::connected_components(&nodes, &edges);
9365            let mut result =
9366                UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
9367            for (node_id, island_id) in assignment {
9368                let mut record = UnifiedRecord::new();
9369                record.set("node_id", Value::text(node_id));
9370                record.set("island_id", Value::Integer(island_id as i64));
9371                result.push(record);
9372            }
9373            return Ok(result);
9374        }
9375
9376        if name.eq_ignore_ascii_case("louvain") {
9377            // The only supported named argument is `resolution` (γ). It
9378            // defaults to 1.0 (classic modularity) and must be a finite,
9379            // strictly positive number — a non-positive (or NaN/inf)
9380            // resolution has no sensible meaning.
9381            let resolution = louvain_resolution(named_args)?;
9382            let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
9383            let mut result =
9384                UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
9385            for (node_id, community_id) in assignment {
9386                let mut record = UnifiedRecord::new();
9387                record.set("node_id", Value::text(node_id));
9388                record.set("community_id", Value::Integer(community_id as i64));
9389                result.push(record);
9390            }
9391            return Ok(result);
9392        }
9393
9394        if name.eq_ignore_ascii_case("degree_centrality") {
9395            reject_named_args(name, named_args)?;
9396            let assignment = abstract_degree_centrality(&nodes, &edges);
9397            let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "degree".into()]);
9398            for (node_id, degree) in assignment {
9399                let mut record = UnifiedRecord::new();
9400                record.set("node_id", Value::text(node_id));
9401                record.set("degree", Value::Integer(degree as i64));
9402                result.push(record);
9403            }
9404            return Ok(result);
9405        }
9406
9407        if name.eq_ignore_ascii_case("shortest_path") {
9408            // Scalar named arguments: `src` and `dst` are required node ids,
9409            // `max_hops` is an optional non-negative edge-count cap. Node ids
9410            // in the graph store are integer entity ids rendered as strings, so
9411            // each id arg must be a non-negative whole number; reject anything
9412            // else (fractional, negative, NaN/inf) with a clear message.
9413            let mut src: Option<String> = None;
9414            let mut dst: Option<String> = None;
9415            let mut max_hops: Option<usize> = None;
9416            let as_node_id = |key: &str, value: f64| -> RedDBResult<String> {
9417                if !value.is_finite() || value < 0.0 || value.fract() != 0.0 {
9418                    return Err(RedDBError::Query(format!(
9419                        "table function 'shortest_path' argument '{key}' must be a non-negative integer node id, got {value}"
9420                    )));
9421                }
9422                Ok((value as i64).to_string())
9423            };
9424            for (key, value) in named_args {
9425                if key.eq_ignore_ascii_case("src") {
9426                    src = Some(as_node_id("src", *value)?);
9427                } else if key.eq_ignore_ascii_case("dst") {
9428                    dst = Some(as_node_id("dst", *value)?);
9429                } else if key.eq_ignore_ascii_case("max_hops") {
9430                    if !value.is_finite() || *value < 0.0 || value.fract() != 0.0 {
9431                        return Err(RedDBError::Query(format!(
9432                            "table function 'shortest_path' max_hops must be a non-negative integer, got {value}"
9433                        )));
9434                    }
9435                    max_hops = Some(*value as usize);
9436                } else {
9437                    return Err(RedDBError::Query(format!(
9438                        "table function 'shortest_path' has no named argument '{key}' (expected 'src', 'dst', 'max_hops')"
9439                    )));
9440                }
9441            }
9442            let src = src.ok_or_else(|| {
9443                RedDBError::Query(
9444                    "table function 'shortest_path' requires named argument 'src'".to_string(),
9445                )
9446            })?;
9447            let dst = dst.ok_or_else(|| {
9448                RedDBError::Query(
9449                    "table function 'shortest_path' requires named argument 'dst'".to_string(),
9450                )
9451            })?;
9452
9453            // Columns are always present; an unreachable pair (within the
9454            // optional `max_hops` budget) simply yields zero rows — never an
9455            // error. `hop` is the 0-based index from the source;
9456            // `cumulative_weight` is the running path weight (0 at the source,
9457            // the total at the destination). Edges are treated as undirected,
9458            // consistent with `components` / `louvain`.
9459            let mut result = UnifiedResult::with_columns(vec![
9460                "hop".into(),
9461                "node_id".into(),
9462                "cumulative_weight".into(),
9463            ]);
9464            if let Some(path) =
9465                graph_algorithms::shortest_path(&nodes, &edges, &src, &dst, max_hops)
9466            {
9467                for (hop, (node_id, cumulative_weight)) in path.into_iter().enumerate() {
9468                    let mut record = UnifiedRecord::new();
9469                    record.set("hop", Value::Integer(hop as i64));
9470                    record.set("node_id", Value::text(node_id));
9471                    record.set("cumulative_weight", Value::Float(cumulative_weight));
9472                    result.push(record);
9473                }
9474            }
9475            return Ok(result);
9476        }
9477        // ── Centrality family (issue #797): each returns rows `(node_id,
9478        // score)` over the abstract `(nodes, edges)` graph. Like the other
9479        // graph TVFs the graph is treated as undirected and scores are
9480        // deterministic; the inline-graph form shares this dispatch. ──
9481        if name.eq_ignore_ascii_case("betweenness") {
9482            reject_named_args(name, named_args)?;
9483            return Ok(Self::centrality_result(graph_algorithms::betweenness(
9484                &nodes, &edges,
9485            )));
9486        }
9487        if name.eq_ignore_ascii_case("eigenvector") {
9488            // Optional `max_iterations` (positive integer, default 100) and
9489            // `tolerance` (finite, strictly positive, default 1e-6).
9490            let mut max_iterations = 100_usize;
9491            let mut tolerance = 1e-6_f64;
9492            for (key, value) in named_args {
9493                if key.eq_ignore_ascii_case("max_iterations") {
9494                    max_iterations = parse_positive_iterations("eigenvector", value)?;
9495                } else if key.eq_ignore_ascii_case("tolerance") {
9496                    if !value.is_finite() || *value <= 0.0 {
9497                        return Err(RedDBError::Query(format!(
9498                            "table function 'eigenvector' tolerance must be > 0, got {value}"
9499                        )));
9500                    }
9501                    tolerance = *value;
9502                } else {
9503                    return Err(RedDBError::Query(format!(
9504                        "table function 'eigenvector' has no named argument '{key}' (expected 'max_iterations' or 'tolerance')"
9505                    )));
9506                }
9507            }
9508            return Ok(Self::centrality_result(graph_algorithms::eigenvector(
9509                &nodes,
9510                &edges,
9511                max_iterations,
9512                tolerance,
9513            )));
9514        }
9515        if name.eq_ignore_ascii_case("pagerank") {
9516            // Optional `damping` (in (0, 1), default 0.85) and `max_iterations`
9517            // (positive integer, default 100).
9518            let mut damping = 0.85_f64;
9519            let mut max_iterations = 100_usize;
9520            for (key, value) in named_args {
9521                if key.eq_ignore_ascii_case("damping") {
9522                    if !value.is_finite() || *value <= 0.0 || *value >= 1.0 {
9523                        return Err(RedDBError::Query(format!(
9524                            "table function 'pagerank' damping must be in (0, 1), got {value}"
9525                        )));
9526                    }
9527                    damping = *value;
9528                } else if key.eq_ignore_ascii_case("max_iterations") {
9529                    max_iterations = parse_positive_iterations("pagerank", value)?;
9530                } else {
9531                    return Err(RedDBError::Query(format!(
9532                        "table function 'pagerank' has no named argument '{key}' (expected 'damping' or 'max_iterations')"
9533                    )));
9534                }
9535            }
9536            return Ok(Self::centrality_result(graph_algorithms::pagerank(
9537                &nodes,
9538                &edges,
9539                damping,
9540                max_iterations,
9541            )));
9542        }
9543        Err(RedDBError::Query(format!("unknown table function: {name}")))
9544    }
9545
9546    /// `components(<graph_collection>)` — returns rows `(node_id, island_id)`.
9547    ///
9548    /// Materializes the active graph (nodes + weighted edges) read-only and
9549    /// runs the pure `graph_algorithms::connected_components`. Edges are
9550    /// treated as undirected; island ids are deterministic (ascending order of
9551    /// each component's smallest node).
9552    fn execute_components_tvf(
9553        &self,
9554        _collection: &str,
9555    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
9556        use crate::storage::engine::graph_algorithms;
9557        use crate::storage::query::unified::UnifiedResult;
9558        use crate::storage::schema::Value;
9559
9560        // Read-only materialization of the full active graph. The named
9561        // collection identifies the active graph scope; passing `None` for the
9562        // projection uses the full graph store (the same result
9563        // `active_graph_projection` yields when no projection is registered).
9564        // Materialization never mutates any store.
9565        let graph = super::graph_dsl::materialize_graph_with_projection(
9566            self.inner.db.store().as_ref(),
9567            None,
9568        )?;
9569
9570        // Materialize abstract inputs for the pure algorithm.
9571        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
9572        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
9573            .iter_all_edges()
9574            .into_iter()
9575            .map(|e| (e.source_id, e.target_id, e.weight))
9576            .collect();
9577
9578        let assignment = graph_algorithms::connected_components(&nodes, &edges);
9579
9580        // Project into a UnifiedResult with columns ["node_id", "island_id"].
9581        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
9582        for (node_id, island_id) in assignment {
9583            let mut record = UnifiedRecord::new();
9584            record.set("node_id", Value::text(node_id));
9585            record.set("island_id", Value::Integer(island_id as i64));
9586            result.push(record);
9587        }
9588        Ok(result)
9589    }
9590
9591    /// `louvain(<graph> [, resolution => <f64>])` — returns rows
9592    /// `(node_id, community_id)` (issue #796).
9593    ///
9594    /// Materializes the active graph (nodes + weighted edges) read-only and
9595    /// runs the pure, deterministic `graph_algorithms::louvain`. Edges are
9596    /// treated as undirected; community ids are assigned in ascending order of
9597    /// each community's smallest node, so identical input + resolution always
9598    /// yields identical rows. Like `components`, the v0 form runs over the
9599    /// whole graph store regardless of the collection argument value.
9600    fn execute_louvain_tvf(
9601        &self,
9602        _collection: &str,
9603        resolution: f64,
9604    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
9605        use crate::storage::engine::graph_algorithms;
9606        use crate::storage::query::unified::UnifiedResult;
9607        use crate::storage::schema::Value;
9608
9609        let graph = super::graph_dsl::materialize_graph_with_projection(
9610            self.inner.db.store().as_ref(),
9611            None,
9612        )?;
9613
9614        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
9615        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
9616            .iter_all_edges()
9617            .into_iter()
9618            .map(|e| (e.source_id, e.target_id, e.weight))
9619            .collect();
9620
9621        let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
9622
9623        // Project into a UnifiedResult with columns ["node_id", "community_id"].
9624        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
9625        for (node_id, community_id) in assignment {
9626            let mut record = UnifiedRecord::new();
9627            record.set("node_id", Value::text(node_id));
9628            record.set("community_id", Value::Integer(community_id as i64));
9629            result.push(record);
9630        }
9631        Ok(result)
9632    }
9633
9634    /// Project `(node_id, score)` centrality rows into a `UnifiedResult` with
9635    /// columns `["node_id", "score"]`; scores are `Value::Float`.
9636    fn centrality_result(
9637        rows: Vec<(String, f64)>,
9638    ) -> crate::storage::query::unified::UnifiedResult {
9639        use crate::storage::query::unified::UnifiedResult;
9640        use crate::storage::schema::Value;
9641        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "score".into()]);
9642        for (node_id, score) in rows {
9643            let mut record = UnifiedRecord::new();
9644            record.set("node_id", Value::text(node_id));
9645            record.set("score", Value::Float(score));
9646            result.push(record);
9647        }
9648        result
9649    }
9650
9651    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
9652    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
9653    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
9654        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
9655        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
9656        let q = query.trim();
9657        if !q.starts_with("SELECT") && !q.starts_with("select") {
9658            return None;
9659        }
9660
9661        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
9662        let where_pos = q
9663            .find("WHERE _entity_id")
9664            .or_else(|| q.find("where _entity_id"))?;
9665        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
9666        let after_eq = after_field.strip_prefix('=')?.trim_start();
9667
9668        // Parse the entity ID number
9669        let id_str = after_eq.trim();
9670        let entity_id: u64 = id_str.parse().ok()?;
9671
9672        // Extract table name: between "FROM " and " WHERE"
9673        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
9674        let table = q[from_pos..where_pos].trim();
9675        if table.is_empty()
9676            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
9677        {
9678            return None; // complex query, fall through
9679        }
9680        let table_name = table.split_whitespace().next()?;
9681
9682        // Direct entity lookup — skips SQL parse, plan cache, result
9683        // cache, view rewriter, RLS gate. Safe because the gating in
9684        // `execute_query` guarantees no scope override / no
9685        // transaction context is active. MVCC visibility is still
9686        // honoured against the current snapshot.
9687        let store = self.inner.db.store();
9688        let entity = store
9689            .get(
9690                table_name,
9691                crate::storage::unified::EntityId::new(entity_id),
9692            )
9693            .filter(entity_visible_under_current_snapshot)
9694            .filter(|entity| {
9695                self.inner
9696                    .db
9697                    .replica_allows_entity_at_read(table_name, entity)
9698            });
9699
9700        let count = if entity.is_some() { 1u64 } else { 0 };
9701
9702        // Materialize a record so downstream consumers that walk
9703        // `result.records` (embedded runtime API, decrypt pass, CLI)
9704        // see the row. Previously only `pre_serialized_json` was
9705        // filled, which caused those consumers to see zero rows and
9706        // skewed benchmarks.
9707        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
9708            .as_ref()
9709            .and_then(|e| runtime_table_record_from_entity(e.clone()))
9710            .into_iter()
9711            .collect();
9712
9713        let json = match entity {
9714            Some(ref e) => execute_runtime_serialize_single_entity(e),
9715            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
9716                .to_string(),
9717        };
9718
9719        Some(Ok(RuntimeQueryResult {
9720            query: query.to_string(),
9721            mode: crate::storage::query::modes::QueryMode::Sql,
9722            statement: "select",
9723            engine: "fast-entity-lookup",
9724            result: crate::storage::query::unified::UnifiedResult {
9725                columns: Vec::new(),
9726                records,
9727                stats: crate::storage::query::unified::QueryStats {
9728                    rows_scanned: count,
9729                    ..Default::default()
9730                },
9731                pre_serialized_json: Some(json),
9732            },
9733            affected_rows: 0,
9734            statement_type: "select",
9735            bookmark: None,
9736        }))
9737    }
9738
9739    fn result_cache_backend(&self) -> RuntimeResultCacheBackend {
9740        match self
9741            .config_string(RESULT_CACHE_BACKEND_KEY, RESULT_CACHE_DEFAULT_BACKEND)
9742            .as_str()
9743        {
9744            "blob_cache" => RuntimeResultCacheBackend::BlobCache,
9745            "shadow" => RuntimeResultCacheBackend::Shadow,
9746            _ => RuntimeResultCacheBackend::Legacy,
9747        }
9748    }
9749
9750    /// Result-cache kill-switch (issue #802). When `false`, reads and
9751    /// writes are short-circuited so every query recomputes — used for
9752    /// debugging and to bound staleness without a restart.
9753    fn result_cache_enabled(&self) -> bool {
9754        self.config_bool(RESULT_CACHE_ENABLED_KEY, true)
9755    }
9756
9757    /// Configurable per-entry TTL in seconds (issue #802), defaulting to
9758    /// the former `RESULT_CACHE_TTL_SECS` constant.
9759    fn result_cache_ttl_secs(&self) -> u64 {
9760        self.config_u64(RESULT_CACHE_TTL_KEY, RESULT_CACHE_TTL_SECS)
9761    }
9762
9763    /// Configurable LRU capacity in entries (issue #802), defaulting to
9764    /// the former `RESULT_CACHE_MAX_ENTRIES` constant. Zero is treated as
9765    /// "no cached entries retained".
9766    fn result_cache_capacity(&self) -> usize {
9767        self.config_u64(RESULT_CACHE_CAPACITY_KEY, RESULT_CACHE_MAX_ENTRIES as u64) as usize
9768    }
9769
9770    /// Snapshot of the result-cache observability counters (issue #802):
9771    /// `(hits, misses, evictions)`. Surfaced under `red.metrics`.
9772    pub fn result_cache_metrics(&self) -> (u64, u64, u64) {
9773        use std::sync::atomic::Ordering::Relaxed;
9774        (
9775            self.inner.result_cache_hits.load(Relaxed),
9776            self.inner.result_cache_misses.load(Relaxed),
9777            self.inner.result_cache_evictions.load(Relaxed),
9778        )
9779    }
9780
9781    fn record_result_cache_evictions(&self, evicted: u64) {
9782        if evicted > 0 {
9783            self.inner
9784                .result_cache_evictions
9785                .fetch_add(evicted, std::sync::atomic::Ordering::Relaxed);
9786        }
9787    }
9788
9789    pub(super) fn get_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
9790        if !self.result_cache_enabled() {
9791            return None;
9792        }
9793        let hit = self.get_result_cache_entry_inner(key);
9794        let counter = if hit.is_some() {
9795            &self.inner.result_cache_hits
9796        } else {
9797            &self.inner.result_cache_misses
9798        };
9799        counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
9800        hit
9801    }
9802
9803    fn get_result_cache_entry_inner(&self, key: &str) -> Option<RuntimeQueryResult> {
9804        match self.result_cache_backend() {
9805            RuntimeResultCacheBackend::Legacy => self.get_legacy_result_cache_entry(key),
9806            RuntimeResultCacheBackend::BlobCache => self.get_blob_result_cache_entry(key),
9807            RuntimeResultCacheBackend::Shadow => {
9808                let legacy = self.get_legacy_result_cache_entry(key);
9809                let blob = self.get_blob_result_cache_entry(key);
9810                if let (Some(ref legacy), Some(ref blob)) = (&legacy, &blob) {
9811                    if result_cache_fingerprint(legacy) != result_cache_fingerprint(blob) {
9812                        self.inner
9813                            .result_cache_shadow_divergences
9814                            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
9815                        tracing::warn!(
9816                            key,
9817                            metric = crate::runtime::METRIC_CACHE_SHADOW_DIVERGENCE_TOTAL,
9818                            "result cache shadow backend diverged from legacy"
9819                        );
9820                    }
9821                }
9822                legacy
9823            }
9824        }
9825    }
9826
9827    fn get_legacy_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
9828        let ttl = self.result_cache_ttl_secs();
9829        let cache = self.inner.result_cache.read();
9830        cache.0.get(key).and_then(|entry| {
9831            if entry.cached_at.elapsed().as_secs() < ttl {
9832                Some(entry.result.clone())
9833            } else {
9834                None
9835            }
9836        })
9837    }
9838
9839    fn get_blob_result_cache_entry(&self, key: &str) -> Option<RuntimeQueryResult> {
9840        let hit = self
9841            .inner
9842            .result_blob_cache
9843            .get(RESULT_CACHE_BLOB_NAMESPACE, key)?;
9844        {
9845            let cache = self.inner.result_blob_entries.read();
9846            if let Some(entry) = cache.0.get(key) {
9847                return Some(entry.result.clone());
9848            }
9849        }
9850
9851        let (result, scopes) = decode_result_cache_payload(hit.value())?;
9852        let mut cache = self.inner.result_blob_entries.write();
9853        let (ref mut map, ref mut order) = *cache;
9854        if !map.contains_key(key) {
9855            order.push_back(key.to_string());
9856        }
9857        map.insert(
9858            key.to_string(),
9859            RuntimeResultCacheEntry {
9860                result: result.clone(),
9861                cached_at: std::time::Instant::now(),
9862                scopes,
9863            },
9864        );
9865        let evicted = trim_result_cache(map, order, self.result_cache_capacity());
9866        drop(cache);
9867        self.record_result_cache_evictions(evicted);
9868        Some(result)
9869    }
9870
9871    pub(super) fn put_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
9872        if !self.result_cache_enabled() {
9873            return;
9874        }
9875        match self.result_cache_backend() {
9876            RuntimeResultCacheBackend::Legacy => self.put_legacy_result_cache_entry(key, entry),
9877            RuntimeResultCacheBackend::BlobCache => self.put_blob_result_cache_entry(key, entry),
9878            RuntimeResultCacheBackend::Shadow => {
9879                self.put_legacy_result_cache_entry(key, entry.clone());
9880                self.put_blob_result_cache_entry(key, entry);
9881            }
9882        }
9883    }
9884
9885    fn put_legacy_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
9886        let capacity = self.result_cache_capacity();
9887        let mut cache = self.inner.result_cache.write();
9888        let (ref mut map, ref mut order) = *cache;
9889        if !map.contains_key(key) {
9890            order.push_back(key.to_string());
9891        }
9892        map.insert(key.to_string(), entry);
9893        let evicted = trim_result_cache(map, order, capacity);
9894        drop(cache);
9895        self.record_result_cache_evictions(evicted);
9896    }
9897
9898    fn put_blob_result_cache_entry(&self, key: &str, entry: RuntimeResultCacheEntry) {
9899        let policy = crate::storage::cache::BlobCachePolicy::default()
9900            .ttl_ms(self.result_cache_ttl_secs() * 1000)
9901            .priority(200);
9902        let dependencies = entry.scopes.iter().cloned().collect::<Vec<_>>();
9903        let bytes = encode_result_cache_payload(&entry)
9904            .unwrap_or_else(|| result_cache_fingerprint(&entry.result).into_bytes());
9905        let put = crate::storage::cache::BlobCachePut::new(bytes)
9906            .with_dependencies(dependencies)
9907            .with_policy(policy);
9908        if self
9909            .inner
9910            .result_blob_cache
9911            .put(RESULT_CACHE_BLOB_NAMESPACE, key, put)
9912            .is_err()
9913        {
9914            return;
9915        }
9916
9917        let capacity = self.result_cache_capacity();
9918        let mut cache = self.inner.result_blob_entries.write();
9919        let (ref mut map, ref mut order) = *cache;
9920        if !map.contains_key(key) {
9921            order.push_back(key.to_string());
9922        }
9923        map.insert(key.to_string(), entry);
9924        let evicted = trim_result_cache(map, order, capacity);
9925        drop(cache);
9926        self.record_result_cache_evictions(evicted);
9927    }
9928
9929    pub fn result_cache_shadow_divergences(&self) -> u64 {
9930        self.inner
9931            .result_cache_shadow_divergences
9932            .load(std::sync::atomic::Ordering::Relaxed)
9933    }
9934
9935    /// Invalidate the result cache (call after any write operation).
9936    /// Full clear — use for DDL (DROP TABLE, schema changes) or when table is unknown.
9937    pub fn invalidate_result_cache(&self) {
9938        let mut cache = self.inner.result_cache.write();
9939        cache.0.clear();
9940        cache.1.clear();
9941        let mut blob_entries = self.inner.result_blob_entries.write();
9942        blob_entries.0.clear();
9943        blob_entries.1.clear();
9944        self.inner
9945            .result_blob_cache
9946            .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
9947        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
9948        ask_entries.0.clear();
9949        ask_entries.1.clear();
9950        self.inner
9951            .result_blob_cache
9952            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
9953    }
9954
9955    /// Invalidate only result cache entries that declared a dependency on `table`.
9956    /// Cheaper than a full clear: unrelated tables keep their cached results.
9957    pub(crate) fn invalidate_result_cache_for_table(&self, table: &str) {
9958        // Hot-path probe both backends before taking write locks. The blob
9959        // backend is node-local, same as the legacy result cache.
9960        let legacy_has_match = {
9961            let cache = self.inner.result_cache.read();
9962            let (ref map, _) = *cache;
9963            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
9964        };
9965        let blob_has_match = {
9966            let cache = self.inner.result_blob_entries.read();
9967            let (ref map, _) = *cache;
9968            !map.is_empty() && map.values().any(|entry| entry.scopes.contains(table))
9969        };
9970        if legacy_has_match {
9971            let mut cache = self.inner.result_cache.write();
9972            let (ref mut map, ref mut order) = *cache;
9973            map.retain(|_, entry| !entry.scopes.contains(table));
9974            order.retain(|key| map.contains_key(key));
9975        }
9976
9977        if matches!(
9978            self.result_cache_backend(),
9979            RuntimeResultCacheBackend::BlobCache | RuntimeResultCacheBackend::Shadow
9980        ) {
9981            let mut blob_entries = self.inner.result_blob_entries.write();
9982            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
9983            blob_map.clear();
9984            blob_order.clear();
9985            self.inner
9986                .result_blob_cache
9987                .invalidate_namespace(RESULT_CACHE_BLOB_NAMESPACE);
9988        } else if blob_has_match {
9989            let mut blob_entries = self.inner.result_blob_entries.write();
9990            let (ref mut blob_map, ref mut blob_order) = *blob_entries;
9991            blob_map.retain(|_, entry| !entry.scopes.contains(table));
9992            blob_order.retain(|key| blob_map.contains_key(key));
9993        }
9994        let mut ask_entries = self.inner.ask_answer_cache_entries.write();
9995        ask_entries.0.clear();
9996        ask_entries.1.clear();
9997        self.inner
9998            .result_blob_cache
9999            .invalidate_namespace(ASK_ANSWER_CACHE_NAMESPACE);
10000    }
10001
10002    pub(crate) fn invalidate_plan_cache(&self) {
10003        self.inner.query_cache.write().clear();
10004        self.inner
10005            .ddl_epoch
10006            .fetch_add(1, std::sync::atomic::Ordering::Release);
10007    }
10008
10009    /// Read the monotonic DDL epoch counter. Bumped by every
10010    /// `invalidate_plan_cache` call so prepared-statement holders can
10011    /// detect schema drift between PREPARE and EXECUTE.
10012    pub fn ddl_epoch(&self) -> u64 {
10013        self.inner
10014            .ddl_epoch
10015            .load(std::sync::atomic::Ordering::Acquire)
10016    }
10017
10018    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
10019        let store = self.inner.db.store();
10020        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
10021        self.invalidate_plan_cache();
10022    }
10023
10024    /// Replay `tenant_tables.*.column` keys from red_config at boot so
10025    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
10026    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
10027    /// collection, picks the keys matching the tenant-marker shape,
10028    /// and calls `register_tenant_table` for each.
10029    ///
10030    /// Safe no-op when `red_config` doesn't exist (first boot on a
10031    /// fresh datadir).
10032    pub(crate) fn rehydrate_tenant_tables(&self) {
10033        let store = self.inner.db.store();
10034        let Some(manager) = store.get_collection("red_config") else {
10035            return;
10036        };
10037        // Replay in insertion order (SegmentManager iteration). Multiple
10038        // toggles on the same table leave several rows behind — the
10039        // last one processed wins because each register/unregister
10040        // call overwrites the in-memory state.
10041        for entity in manager.query_all(|_| true) {
10042            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
10043                continue;
10044            };
10045            let Some(named) = &row.named else { continue };
10046            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
10047                continue;
10048            };
10049            // Shape: tenant_tables.{table}.column
10050            let Some(rest) = key.strip_prefix("tenant_tables.") else {
10051                continue;
10052            };
10053            let Some((table, suffix)) = rest.rsplit_once('.') else {
10054                // Issue #205 — a `tenant_tables.*` row that doesn't
10055                // split cleanly is a schema-shape regression: the
10056                // metadata writer must always emit the `.column`
10057                // suffix, so reaching this branch means an upgrade
10058                // with incompatible state or external tampering.
10059                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
10060                    collection: "red_config".to_string(),
10061                    detail: format!("malformed tenant_tables key: {key}"),
10062                }
10063                .emit_global();
10064                continue;
10065            };
10066            if suffix != "column" {
10067                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
10068                    collection: "red_config".to_string(),
10069                    detail: format!("unexpected tenant_tables suffix: {key}"),
10070                }
10071                .emit_global();
10072                continue;
10073            }
10074            match named.get("value") {
10075                Some(crate::storage::schema::Value::Text(column)) => {
10076                    self.register_tenant_table(table, column);
10077                }
10078                // Null / missing value = DISABLE TENANCY marker.
10079                Some(crate::storage::schema::Value::Null) | None => {
10080                    self.unregister_tenant_table(table);
10081                }
10082                _ => {}
10083            }
10084        }
10085    }
10086
10087    /// Replay every persisted `MaterializedViewDescriptor` from the
10088    /// `red_materialized_view_defs` system collection (issue #593
10089    /// slice 9a). For each descriptor, re-parse the original SQL,
10090    /// extract the `QueryExpr::CreateView` it produced, and populate
10091    /// the in-memory registries (`inner.views` and
10092    /// `inner.materialized_views`) directly — no write paths run, so
10093    /// rehydrate does not re-persist what it just read.
10094    ///
10095    /// Malformed rows (missing `name`/`source_sql`, parse errors) are
10096    /// skipped with a `SchemaCorruption` operator event so a single
10097    /// bad entry does not block startup.
10098    pub(crate) fn rehydrate_materialized_view_descriptors(&self) {
10099        let store = self.inner.db.store();
10100        let descriptors = crate::runtime::continuous_materialized_view::load_all(store.as_ref());
10101        for descriptor in descriptors {
10102            let parsed = match crate::storage::query::parser::parse(&descriptor.source_sql) {
10103                Ok(qc) => qc,
10104                Err(err) => {
10105                    crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
10106                        collection:
10107                            crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
10108                                .to_string(),
10109                        detail: format!(
10110                            "failed to re-parse materialized-view source for {}: {err}",
10111                            descriptor.name
10112                        ),
10113                    }
10114                    .emit_global();
10115                    continue;
10116                }
10117            };
10118            let crate::storage::query::ast::QueryExpr::CreateView(create) = parsed.query else {
10119                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
10120                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
10121                        .to_string(),
10122                    detail: format!(
10123                        "materialized-view source for {} did not re-parse as CREATE VIEW",
10124                        descriptor.name
10125                    ),
10126                }
10127                .emit_global();
10128                continue;
10129            };
10130            // Populate in-memory view registry.
10131            let view_name = create.name.clone();
10132            self.inner
10133                .views
10134                .write()
10135                .insert(view_name.clone(), Arc::new(create));
10136            // Materialized cache slot (data empty until next REFRESH).
10137            use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
10138            let refresh = match descriptor.refresh_every_ms {
10139                Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
10140                None => RefreshPolicy::Manual,
10141            };
10142            let def = MaterializedViewDef {
10143                name: view_name.clone(),
10144                query: format!("<parsed view {}>", view_name),
10145                dependencies: descriptor.source_collections.clone(),
10146                refresh,
10147                retention_duration_ms: descriptor.retention_duration_ms,
10148            };
10149            self.inner.materialized_views.write().register(def);
10150        }
10151        // A rehydrated view shape may differ from any plans the cache
10152        // bootstrapped before this method ran — flush to be safe.
10153        self.invalidate_plan_cache();
10154    }
10155
10156    pub(crate) fn rehydrate_declared_column_schemas(&self) {
10157        let store = self.inner.db.store();
10158        for contract in self.inner.db.collection_contracts() {
10159            let columns: Vec<String> = contract
10160                .declared_columns
10161                .iter()
10162                .map(|column| column.name.clone())
10163                .collect();
10164            let Some(manager) = store.get_collection(&contract.name) else {
10165                continue;
10166            };
10167            manager.set_column_schema_if_empty(columns);
10168        }
10169    }
10170
10171    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
10172    /// in-memory column mapping, the implicit RLS policy, and enables
10173    /// row-level security on the table. Idempotent — re-registering
10174    /// the same `(table, column)` replaces the prior auto-policy.
10175    pub fn register_tenant_table(&self, table: &str, column: &str) {
10176        use crate::storage::query::ast::{
10177            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
10178        };
10179        self.inner
10180            .tenant_tables
10181            .write()
10182            .insert(table.to_string(), column.to_string());
10183
10184        // Build the policy: col = CURRENT_TENANT()
10185        // Uses CompareExpr so the comparison happens at runtime against
10186        // the thread-local tenant value read by the CURRENT_TENANT
10187        // scalar. Spans are synthetic — there's no source location for
10188        // an auto-generated policy.
10189        let lhs = Expr::Column {
10190            field: FieldRef::TableColumn {
10191                table: table.to_string(),
10192                column: column.to_string(),
10193            },
10194            span: Span::synthetic(),
10195        };
10196        let rhs = Expr::FunctionCall {
10197            name: "CURRENT_TENANT".to_string(),
10198            args: Vec::new(),
10199            span: Span::synthetic(),
10200        };
10201        let policy_filter = Filter::CompareExpr {
10202            lhs,
10203            op: CompareOp::Eq,
10204            rhs,
10205        };
10206
10207        let policy = CreatePolicyQuery {
10208            name: "__tenant_iso".to_string(),
10209            table: table.to_string(),
10210            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
10211            role: None,   // None = every role
10212            using: Box::new(policy_filter),
10213            // Auto-tenancy defaults to Table targets. Collections of
10214            // other kinds (graph / vector / queue / timeseries) that
10215            // opt in via `ALTER ... ENABLE TENANCY` should use the
10216            // matching kind — but for now we keep the auto-policy
10217            // kind-agnostic so the evaluator can apply it to any
10218            // entity living in the collection.
10219            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
10220        };
10221
10222        // Replace any prior auto-policy for this table (column rename).
10223        self.inner.rls_policies.write().insert(
10224            (table.to_string(), "__tenant_iso".to_string()),
10225            Arc::new(policy),
10226        );
10227        self.inner
10228            .rls_enabled_tables
10229            .write()
10230            .insert(table.to_string());
10231
10232        // Auto-build a hash index on the tenant column. Every read/write
10233        // against a tenant-scoped table carries an implicit
10234        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
10235        // index on that column is on the hot path of every query. Without
10236        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
10237        self.ensure_tenant_index(table, column);
10238    }
10239
10240    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
10241    /// Skipped when:
10242    ///   * the column is dotted (nested path — flat secondary indices
10243    ///     don't cover those today; RLS still works via the policy)
10244    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
10245    ///   * the user already registered an index whose first column matches
10246    ///     (avoids redundant duplicates of a user-defined composite)
10247    fn ensure_tenant_index(&self, table: &str, column: &str) {
10248        if column.contains('.') {
10249            return;
10250        }
10251        let index_name = format!("__tenant_idx_{table}");
10252        let registry = self.inner.index_store.list_indices(table);
10253        if registry.iter().any(|idx| idx.name == index_name) {
10254            return;
10255        }
10256        if registry
10257            .iter()
10258            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
10259        {
10260            return;
10261        }
10262
10263        let store = self.inner.db.store();
10264        let Some(manager) = store.get_collection(table) else {
10265            return;
10266        };
10267        let entities = manager.query_all(|_| true);
10268        let entity_fields: Vec<(
10269            crate::storage::unified::EntityId,
10270            Vec<(String, crate::storage::schema::Value)>,
10271        )> = entities
10272            .iter()
10273            .map(|e| {
10274                let fields = match &e.data {
10275                    crate::storage::EntityData::Row(row) => {
10276                        if let Some(ref named) = row.named {
10277                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
10278                        } else if let Some(ref schema) = row.schema {
10279                            schema
10280                                .iter()
10281                                .zip(row.columns.iter())
10282                                .map(|(k, v)| (k.clone(), v.clone()))
10283                                .collect()
10284                        } else {
10285                            Vec::new()
10286                        }
10287                    }
10288                    crate::storage::EntityData::Node(node) => node
10289                        .properties
10290                        .iter()
10291                        .map(|(k, v)| (k.clone(), v.clone()))
10292                        .collect(),
10293                    _ => Vec::new(),
10294                };
10295                (e.id, fields)
10296            })
10297            .collect();
10298
10299        let columns = vec![column.to_string()];
10300        if self
10301            .inner
10302            .index_store
10303            .create_index(
10304                &index_name,
10305                table,
10306                &columns,
10307                super::index_store::IndexMethodKind::Hash,
10308                false,
10309                &entity_fields,
10310            )
10311            .is_err()
10312        {
10313            return;
10314        }
10315        self.inner
10316            .index_store
10317            .register(super::index_store::RegisteredIndex {
10318                name: index_name,
10319                collection: table.to_string(),
10320                columns,
10321                method: super::index_store::IndexMethodKind::Hash,
10322                unique: false,
10323            });
10324        self.invalidate_plan_cache();
10325    }
10326
10327    /// Drop the auto-generated tenant index, if one exists. Called from
10328    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
10329    fn drop_tenant_index(&self, table: &str) {
10330        let index_name = format!("__tenant_idx_{table}");
10331        self.inner.index_store.drop_index(&index_name, table);
10332    }
10333
10334    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
10335    /// Used by the INSERT auto-fill path to know which column to
10336    /// populate with `current_tenant()` when the user didn't name it.
10337    pub fn tenant_column(&self, table: &str) -> Option<String> {
10338        self.inner.tenant_tables.read().get(table).cloned()
10339    }
10340
10341    /// Remove a table's tenant registration (Phase 2.5.4). Called by
10342    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
10343    /// but leaves any user-installed explicit policies intact.
10344    pub fn unregister_tenant_table(&self, table: &str) {
10345        self.inner.tenant_tables.write().remove(table);
10346        self.inner
10347            .rls_policies
10348            .write()
10349            .remove(&(table.to_string(), "__tenant_iso".to_string()));
10350        self.drop_tenant_index(table);
10351        // Only clear RLS enablement if no other policies remain.
10352        let has_other_policies = self
10353            .inner
10354            .rls_policies
10355            .read()
10356            .keys()
10357            .any(|(t, _)| t == table);
10358        if !has_other_policies {
10359            self.inner.rls_enabled_tables.write().remove(table);
10360        }
10361    }
10362
10363    /// Record that the running transaction has marked `id` in `collection`
10364    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
10365    /// xid that was written into `xmax` — either the parent txn xid or
10366    /// the innermost savepoint sub-xid. Savepoint rollback filters by
10367    /// this xid to revive only its own tombstones.
10368    pub(crate) fn record_pending_tombstone(
10369        &self,
10370        conn_id: u64,
10371        collection: &str,
10372        id: crate::storage::unified::entity::EntityId,
10373        stamper_xid: crate::storage::transaction::snapshot::Xid,
10374        previous_xmax: crate::storage::transaction::snapshot::Xid,
10375    ) {
10376        self.inner
10377            .pending_tombstones
10378            .write()
10379            .entry(conn_id)
10380            .or_default()
10381            .push((collection.to_string(), id, stamper_xid, previous_xmax));
10382    }
10383
10384    pub(crate) fn record_pending_versioned_update(
10385        &self,
10386        conn_id: u64,
10387        collection: &str,
10388        old_id: crate::storage::unified::entity::EntityId,
10389        new_id: crate::storage::unified::entity::EntityId,
10390        stamper_xid: crate::storage::transaction::snapshot::Xid,
10391        previous_xmax: crate::storage::transaction::snapshot::Xid,
10392    ) {
10393        self.inner
10394            .pending_versioned_updates
10395            .write()
10396            .entry(conn_id)
10397            .or_default()
10398            .push((
10399                collection.to_string(),
10400                old_id,
10401                new_id,
10402                stamper_xid,
10403                previous_xmax,
10404            ));
10405    }
10406
10407    fn with_deferred_store_wal_if_transaction<T>(
10408        &self,
10409        f: impl FnOnce() -> RedDBResult<T>,
10410    ) -> RedDBResult<T> {
10411        let conn_id = current_connection_id();
10412        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
10413            return f();
10414        }
10415
10416        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
10417        let result = f();
10418        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
10419        match result {
10420            Ok(value) => {
10421                self.record_pending_store_wal_actions(conn_id, captured);
10422                Ok(value)
10423            }
10424            Err(err) => Err(err),
10425        }
10426    }
10427
10428    fn with_deferred_store_wal_for_dml<T>(
10429        &self,
10430        capture_autocommit_events: bool,
10431        f: impl FnOnce() -> RedDBResult<T>,
10432    ) -> RedDBResult<T> {
10433        let conn_id = current_connection_id();
10434        if self.inner.tx_contexts.read().contains_key(&conn_id) {
10435            return self.with_deferred_store_wal_if_transaction(f);
10436        }
10437        if !capture_autocommit_events {
10438            return f();
10439        }
10440
10441        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
10442        let result = f();
10443        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
10444        self.inner
10445            .db
10446            .store()
10447            .append_deferred_store_wal_actions(captured)
10448            .map_err(|err| RedDBError::Internal(err.to_string()))?;
10449        result
10450    }
10451
10452    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
10453        !query.suppress_events
10454            && self.collection_has_event_subscriptions_for_operation(
10455                &query.table,
10456                crate::catalog::SubscriptionOperation::Insert,
10457            )
10458    }
10459
10460    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
10461        !query.suppress_events
10462            && self.collection_has_event_subscriptions_for_operation(
10463                &query.table,
10464                crate::catalog::SubscriptionOperation::Update,
10465            )
10466    }
10467
10468    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
10469        !query.suppress_events
10470            && self.collection_has_event_subscriptions_for_operation(
10471                &query.table,
10472                crate::catalog::SubscriptionOperation::Delete,
10473            )
10474    }
10475
10476    fn collection_has_event_subscriptions_for_operation(
10477        &self,
10478        collection: &str,
10479        operation: crate::catalog::SubscriptionOperation,
10480    ) -> bool {
10481        let Some(contract) = self.db().collection_contract_arc(collection) else {
10482            return false;
10483        };
10484        contract.subscriptions.iter().any(|subscription| {
10485            subscription.enabled
10486                && (subscription.ops_filter.is_empty()
10487                    || subscription.ops_filter.contains(&operation))
10488        })
10489    }
10490
10491    fn record_pending_store_wal_actions(
10492        &self,
10493        conn_id: u64,
10494        actions: crate::storage::unified::DeferredStoreWalActions,
10495    ) {
10496        if actions.is_empty() {
10497            return;
10498        }
10499        let mut guard = self.inner.pending_store_wal_actions.write();
10500        guard.entry(conn_id).or_default().extend(actions);
10501    }
10502
10503    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
10504        let Some(actions) = self
10505            .inner
10506            .pending_store_wal_actions
10507            .write()
10508            .remove(&conn_id)
10509        else {
10510            return Ok(());
10511        };
10512        self.inner
10513            .db
10514            .store()
10515            .append_deferred_store_wal_actions(actions)
10516            .map_err(|err| RedDBError::Internal(err.to_string()))
10517    }
10518
10519    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
10520        self.inner
10521            .pending_store_wal_actions
10522            .write()
10523            .remove(&conn_id);
10524    }
10525
10526    fn xid_conflicts_with_snapshot(
10527        &self,
10528        xid: crate::storage::transaction::snapshot::Xid,
10529        snapshot: &crate::storage::transaction::snapshot::Snapshot,
10530        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
10531    ) -> bool {
10532        xid != 0
10533            && !own_xids.contains(&xid)
10534            && !self.inner.snapshot_manager.is_aborted(xid)
10535            && !self.inner.snapshot_manager.is_active(xid)
10536            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
10537    }
10538
10539    fn conflict_error(
10540        collection: &str,
10541        logical_id: crate::storage::unified::entity::EntityId,
10542        xid: crate::storage::transaction::snapshot::Xid,
10543    ) -> RedDBError {
10544        RedDBError::Query(format!(
10545            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
10546            logical_id.raw()
10547        ))
10548    }
10549
10550    fn check_logical_row_conflict(
10551        &self,
10552        collection: &str,
10553        logical_id: crate::storage::unified::entity::EntityId,
10554        excluded_ids: &[crate::storage::unified::entity::EntityId],
10555        snapshot: &crate::storage::transaction::snapshot::Snapshot,
10556        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
10557    ) -> RedDBResult<()> {
10558        let store = self.inner.db.store();
10559        let Some(manager) = store.get_collection(collection) else {
10560            return Ok(());
10561        };
10562
10563        for candidate in manager.query_all(|_| true) {
10564            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
10565                continue;
10566            }
10567            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
10568                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
10569            }
10570            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
10571                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
10572            }
10573        }
10574        Ok(())
10575    }
10576
10577    pub(crate) fn check_table_row_write_conflicts(
10578        &self,
10579        conn_id: u64,
10580        snapshot: &crate::storage::transaction::snapshot::Snapshot,
10581        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
10582    ) -> RedDBResult<()> {
10583        let versioned_updates = self
10584            .inner
10585            .pending_versioned_updates
10586            .read()
10587            .get(&conn_id)
10588            .cloned()
10589            .unwrap_or_default();
10590        let tombstones = self
10591            .inner
10592            .pending_tombstones
10593            .read()
10594            .get(&conn_id)
10595            .cloned()
10596            .unwrap_or_default();
10597
10598        let store = self.inner.db.store();
10599        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
10600            let Some(manager) = store.get_collection(&collection) else {
10601                continue;
10602            };
10603            let Some(old) = manager.get(old_id) else {
10604                continue;
10605            };
10606            let logical_id = old.logical_id();
10607            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
10608                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
10609            }
10610            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
10611                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
10612            }
10613            self.check_logical_row_conflict(
10614                &collection,
10615                logical_id,
10616                &[old_id, new_id],
10617                snapshot,
10618                own_xids,
10619            )?;
10620        }
10621
10622        for (collection, id, xid, previous_xmax) in tombstones {
10623            let Some(manager) = store.get_collection(&collection) else {
10624                continue;
10625            };
10626            let Some(entity) = manager.get(id) else {
10627                continue;
10628            };
10629            let logical_id = entity.logical_id();
10630            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
10631                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
10632            }
10633            if entity.xmax != xid
10634                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
10635            {
10636                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
10637            }
10638            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
10639        }
10640
10641        Ok(())
10642    }
10643
10644    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
10645        let versioned_updates = self
10646            .inner
10647            .pending_versioned_updates
10648            .read()
10649            .get(&conn_id)
10650            .cloned()
10651            .unwrap_or_default();
10652        let tombstones = self
10653            .inner
10654            .pending_tombstones
10655            .read()
10656            .get(&conn_id)
10657            .cloned()
10658            .unwrap_or_default();
10659
10660        let store = self.inner.db.store();
10661        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
10662            if let Some(manager) = store.get_collection(&collection) {
10663                if let Some(mut entity) = manager.get(old_id) {
10664                    entity.set_xmax(xid);
10665                    let _ = manager.update(entity);
10666                }
10667            }
10668        }
10669        for (collection, id, xid, _previous_xmax) in tombstones {
10670            if let Some(manager) = store.get_collection(&collection) {
10671                if let Some(mut entity) = manager.get(id) {
10672                    entity.set_xmax(xid);
10673                    let _ = manager.update(entity);
10674                }
10675            }
10676        }
10677    }
10678
10679    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
10680        self.inner
10681            .pending_versioned_updates
10682            .write()
10683            .remove(&conn_id);
10684    }
10685
10686    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
10687        let Some(pending) = self
10688            .inner
10689            .pending_versioned_updates
10690            .write()
10691            .remove(&conn_id)
10692        else {
10693            return;
10694        };
10695
10696        let store = self.inner.db.store();
10697        for (collection, old_id, new_id, xid, previous_xmax) in pending {
10698            if let Some(manager) = store.get_collection(&collection) {
10699                if let Some(mut old) = manager.get(old_id) {
10700                    if old.xmax == xid {
10701                        old.set_xmax(previous_xmax);
10702                        let _ = manager.update(old);
10703                    }
10704                }
10705            }
10706            let _ = store.delete_batch(&collection, &[new_id]);
10707        }
10708    }
10709
10710    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
10711        let mut guard = self.inner.pending_versioned_updates.write();
10712        let Some(pending) = guard.get_mut(&conn_id) else {
10713            return 0;
10714        };
10715
10716        let store = self.inner.db.store();
10717        let mut reverted = 0usize;
10718        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
10719            if *xid < stamper_xid {
10720                return true;
10721            }
10722            if let Some(manager) = store.get_collection(collection) {
10723                if let Some(mut old) = manager.get(*old_id) {
10724                    if old.xmax == *xid {
10725                        old.set_xmax(*previous_xmax);
10726                        let _ = manager.update(old);
10727                    }
10728                }
10729            }
10730            let _ = store.delete_batch(collection, &[*new_id]);
10731            reverted += 1;
10732            false
10733        });
10734        if pending.is_empty() {
10735            guard.remove(&conn_id);
10736        }
10737        reverted
10738    }
10739
10740    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
10741    /// delete marker; commit only drops the rollback journal and emits
10742    /// side effects. Physical reclamation is left for VACUUM so old
10743    /// snapshots can still resolve the pre-delete row version.
10744    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
10745        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
10746            return;
10747        };
10748        if pending.is_empty() {
10749            return;
10750        }
10751
10752        let store = self.inner.db.store();
10753        for (collection, id, _xid, _previous_xmax) in pending {
10754            store.context_index().remove_entity(id);
10755            self.cdc_emit(
10756                crate::replication::cdc::ChangeOperation::Delete,
10757                &collection,
10758                id.raw(),
10759                "entity",
10760            );
10761        }
10762    }
10763
10764    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
10765    /// become visible again to future snapshots. Best-effort: a row
10766    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
10767    /// never reclaims tuples whose xmax is still referenced by any
10768    /// active snapshot, so this case is only reachable via external
10769    /// storage corruption.
10770    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
10771        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
10772            return;
10773        };
10774
10775        let store = self.inner.db.store();
10776        for (collection, id, xid, previous_xmax) in pending {
10777            let Some(manager) = store.get_collection(&collection) else {
10778                continue;
10779            };
10780            if let Some(mut entity) = manager.get(id) {
10781                if entity.xmax == xid {
10782                    entity.set_xmax(previous_xmax);
10783                    let _ = manager.update(entity);
10784                }
10785            }
10786        }
10787    }
10788
10789    /// Slice C of PRD #718 — accessor for the local wait registry.
10790    pub fn queue_wait_registry(
10791        &self,
10792    ) -> std::sync::Arc<crate::runtime::queue_wait_registry::QueueWaitRegistry> {
10793        self.inner.queue_wait_registry.clone()
10794    }
10795
10796    /// Buffer a `(scope, queue)` wake on the current connection so it
10797    /// fires post-COMMIT, or notify immediately if no transaction is
10798    /// open (autocommit path). The wait registry only ever observes
10799    /// notifies for committed work — rollback drops the buffer.
10800    pub(crate) fn record_queue_wake(&self, scope: &str, queue: &str) {
10801        if self.current_xid().is_some() {
10802            let conn_id = current_connection_id();
10803            self.inner
10804                .pending_queue_wakes
10805                .write()
10806                .entry(conn_id)
10807                .or_default()
10808                .push((scope.to_string(), queue.to_string()));
10809            return;
10810        }
10811        self.inner.queue_wait_registry.notify(scope, queue);
10812    }
10813
10814    pub(crate) fn finalize_pending_queue_wakes(&self, conn_id: u64) {
10815        let Some(pending) = self.inner.pending_queue_wakes.write().remove(&conn_id) else {
10816            return;
10817        };
10818        for (scope, queue) in pending {
10819            self.inner.queue_wait_registry.notify(&scope, &queue);
10820        }
10821    }
10822
10823    pub(crate) fn discard_pending_queue_wakes(&self, conn_id: u64) {
10824        self.inner.pending_queue_wakes.write().remove(&conn_id);
10825    }
10826
10827    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
10828        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
10829            return;
10830        };
10831        for event in pending {
10832            self.cdc_emit_kv(
10833                event.op,
10834                &event.collection,
10835                &event.key,
10836                0,
10837                event.before,
10838                event.after,
10839            );
10840        }
10841    }
10842
10843    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
10844        self.inner.pending_kv_watch_events.write().remove(&conn_id);
10845    }
10846
10847    /// Materialise the entire graph store while applying MVCC visibility
10848    /// AND per-collection RLS to each candidate node and edge. Mirrors
10849    /// `materialize_graph` but routes every entity through the same
10850    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
10851    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
10852    /// edges). Returns the filtered `GraphStore` plus the
10853    /// `node_id → properties` map the executor needs for `RETURN n.*`
10854    /// projections.
10855    fn materialize_graph_with_rls(
10856        &self,
10857    ) -> RedDBResult<(
10858        crate::storage::engine::GraphStore,
10859        std::collections::HashMap<
10860            String,
10861            std::collections::HashMap<String, crate::storage::schema::Value>,
10862        >,
10863        crate::storage::query::unified::EdgeProperties,
10864    )> {
10865        use crate::storage::engine::GraphStore;
10866        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
10867        use crate::storage::unified::entity::{EntityData, EntityKind};
10868        use std::collections::{HashMap, HashSet};
10869
10870        let store = self.inner.db.store();
10871        let snap_ctx = capture_current_snapshot();
10872        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
10873
10874        let graph = GraphStore::new();
10875        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
10876            HashMap::new();
10877        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
10878        let mut allowed_nodes: HashSet<String> = HashSet::new();
10879
10880        // Per-collection cached compiled filters — Nodes-kind for
10881        // first pass, Edges-kind for the second. None entries mean
10882        // "RLS enabled, zero matching policy → deny all of this kind".
10883        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
10884            HashMap::new();
10885        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
10886            HashMap::new();
10887
10888        let collections = store.list_collections();
10889
10890        // First pass — gather nodes.
10891        for collection in &collections {
10892            let Some(manager) = store.get_collection(collection) else {
10893                continue;
10894            };
10895            let entities = manager.query_all(|_| true);
10896            for entity in entities {
10897                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
10898                    continue;
10899                }
10900                let EntityKind::GraphNode(ref node) = entity.kind else {
10901                    continue;
10902                };
10903                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
10904                    continue;
10905                }
10906                let id_str = entity.id.raw().to_string();
10907                graph
10908                    .add_node_with_label(
10909                        &id_str,
10910                        &node.label,
10911                        &super::graph_node_label(&node.node_type),
10912                    )
10913                    .map_err(|err| RedDBError::Query(err.to_string()))?;
10914                allowed_nodes.insert(id_str.clone());
10915                if let EntityData::Node(node_data) = &entity.data {
10916                    node_properties.insert(id_str, node_data.properties.clone());
10917                }
10918            }
10919        }
10920
10921        // Second pass — gather edges. An edge appears only when both
10922        // endpoint nodes survived the RLS pass AND the edge itself
10923        // passes its own RLS gate.
10924        for collection in &collections {
10925            let Some(manager) = store.get_collection(collection) else {
10926                continue;
10927            };
10928            let entities = manager.query_all(|_| true);
10929            for entity in entities {
10930                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
10931                    continue;
10932                }
10933                let EntityKind::GraphEdge(ref edge) = entity.kind else {
10934                    continue;
10935                };
10936                if !allowed_nodes.contains(&edge.from_node)
10937                    || !allowed_nodes.contains(&edge.to_node)
10938                {
10939                    continue;
10940                }
10941                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
10942                    continue;
10943                }
10944                let weight = match &entity.data {
10945                    EntityData::Edge(e) => e.weight,
10946                    _ => edge.weight as f32 / 1000.0,
10947                };
10948                let edge_label = super::graph_edge_label(&edge.label);
10949                graph
10950                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
10951                    .map_err(|err| RedDBError::Query(err.to_string()))?;
10952                if let EntityData::Edge(edge_data) = &entity.data {
10953                    edge_properties.insert(
10954                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
10955                        edge_data.properties.clone(),
10956                    );
10957                }
10958            }
10959        }
10960
10961        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
10962        // are used inside the helper closures via the per-kind helpers
10963        // declared at the bottom of this file.
10964        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
10965
10966        Ok((graph, node_properties, edge_properties))
10967    }
10968
10969    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
10970    /// freshly-inserted entity when the current connection holds an
10971    /// open transaction. Used by graph / vector / queue / timeseries
10972    /// write paths that go through the DevX builder API (`db.node(...)
10973    /// .save()` and friends) — those live in the storage crate and
10974    /// can't reach `current_xid()` without crossing layers, so the
10975    /// application layer calls this helper right after `save()` to
10976    /// finalise the MVCC stamp.
10977    ///
10978    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
10979    /// write, so the non-transactional hot path stays untouched.
10980    ///
10981    /// Best-effort: if the collection or entity disappears between
10982    /// the save and the stamp (concurrent DROP), we silently skip.
10983    pub(crate) fn stamp_xmin_if_in_txn(
10984        &self,
10985        collection: &str,
10986        id: crate::storage::unified::entity::EntityId,
10987    ) {
10988        let Some(xid) = self.current_xid() else {
10989            return;
10990        };
10991        let store = self.inner.db.store();
10992        let Some(manager) = store.get_collection(collection) else {
10993            return;
10994        };
10995        if let Some(mut entity) = manager.get(id) {
10996            entity.set_xmin(xid);
10997            let _ = manager.update(entity);
10998        }
10999    }
11000
11001    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
11002    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
11003    /// pending entries with `xid < stamper_xid` stay queued because
11004    /// they belong to the enclosing scope — they'll either flush on
11005    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
11006    ///
11007    /// Returns the number of tuples whose `xmax` was wiped back to 0.
11008    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
11009        let mut guard = self.inner.pending_tombstones.write();
11010        let Some(pending) = guard.get_mut(&conn_id) else {
11011            return 0;
11012        };
11013
11014        let store = self.inner.db.store();
11015        let mut revived = 0usize;
11016        pending.retain(|(collection, id, xid, previous_xmax)| {
11017            if *xid < stamper_xid {
11018                // Stamped before the savepoint — keep in queue.
11019                return true;
11020            }
11021            if let Some(manager) = store.get_collection(collection) {
11022                if let Some(mut entity) = manager.get(*id) {
11023                    if entity.xmax == *xid {
11024                        entity.set_xmax(*previous_xmax);
11025                        let _ = manager.update(entity);
11026                        revived += 1;
11027                    }
11028                }
11029            }
11030            false
11031        });
11032        if pending.is_empty() {
11033            guard.remove(&conn_id);
11034        }
11035        revived
11036    }
11037
11038    /// Return the snapshot the current connection should use for visibility
11039    /// checks (Phase 2.3 PG parity).
11040    ///
11041    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
11042    ///   the snapshot stored in its `TxnContext`.
11043    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
11044    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
11045    ///   visible so this degrades to "see everything committed".
11046    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
11047        let conn_id = current_connection_id();
11048        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
11049            return ctx.snapshot;
11050        }
11051        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
11052        // every already-committed xid (which is strictly less) passes the
11053        // `xmin <= snap.xid` gate, while concurrently-active xids land in
11054        // the `in_progress` set and stay hidden until they commit. Using
11055        // xid=0 would incorrectly hide every MVCC-stamped tuple.
11056        let high_water = self.inner.snapshot_manager.peek_next_xid();
11057        self.inner.snapshot_manager.snapshot(high_water)
11058    }
11059
11060    /// Xid of the current connection's active transaction, or `None` when
11061    /// running outside a BEGIN/COMMIT block. Write paths call this to
11062    /// decide whether to stamp `xmin`/`xmax` on tuples.
11063    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
11064    /// sub-xid so new writes can be selectively rolled back. Otherwise
11065    /// the parent txn's xid is returned, matching pre-savepoint
11066    /// behaviour. Callers that need the enclosing *transaction* xid
11067    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
11068    /// directly.
11069    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
11070        let conn_id = current_connection_id();
11071        self.inner
11072            .tx_contexts
11073            .read()
11074            .get(&conn_id)
11075            .map(|ctx| ctx.writer_xid())
11076    }
11077
11078    /// `true` when the given connection id has an open `BEGIN`. Issue
11079    /// #760 — `OpenStream` consults this to refuse output streams that
11080    /// would otherwise collide with an interactive transaction (see
11081    /// ADR 0029 "Transaction interaction"). HTTP requests pre-dating the
11082    /// connection-id plumbing run with id `0`, which never carries a
11083    /// transaction context, so this returns `false` on those paths.
11084    pub fn connection_in_transaction(&self, conn_id: u64) -> bool {
11085        self.inner.tx_contexts.read().contains_key(&conn_id)
11086    }
11087
11088    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
11089    /// the oldest-active xid when reclaiming dead tuples.
11090    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
11091        Arc::clone(&self.inner.snapshot_manager)
11092    }
11093
11094    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
11095        let manager = &self.inner.snapshot_manager;
11096        let next_xid = manager.peek_next_xid();
11097        let mut cutoff = next_xid;
11098        if let Some(oldest_active) = manager.oldest_active_xid() {
11099            cutoff = cutoff.min(oldest_active);
11100        }
11101        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
11102            cutoff = cutoff.min(oldest_pinned);
11103        }
11104        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
11105        if retention_xids > 0 {
11106            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
11107        }
11108        cutoff
11109    }
11110
11111    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
11112        let registered = self.inner.index_store.list_indices(table);
11113        if registered.is_empty() {
11114            return Ok(());
11115        }
11116        let store = self.inner.db.store();
11117        let Some(manager) = store.get_collection(table) else {
11118            return Ok(());
11119        };
11120        let entity_fields = manager
11121            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
11122            .into_iter()
11123            .map(|entity| (entity.id, table_row_index_fields(&entity)))
11124            .collect::<Vec<_>>();
11125
11126        for index in registered {
11127            self.inner.index_store.drop_index(&index.name, table);
11128            self.inner
11129                .index_store
11130                .create_index(
11131                    &index.name,
11132                    table,
11133                    &index.columns,
11134                    index.method,
11135                    index.unique,
11136                    &entity_fields,
11137                )
11138                .map_err(RedDBError::Internal)?;
11139            self.inner.index_store.register(index);
11140        }
11141        self.invalidate_plan_cache();
11142        Ok(())
11143    }
11144
11145    /// Own-tx xids (parent + open/released savepoints) for the current
11146    /// connection. Transports + tests that build a `SnapshotContext`
11147    /// manually (outside the `execute_query` scope) need this set so
11148    /// the writer's own uncommitted tuples stay visible to self.
11149    pub fn current_txn_own_xids(
11150        &self,
11151    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
11152        let mut set = std::collections::HashSet::new();
11153        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
11154            set.insert(ctx.xid);
11155            for (_, sub) in &ctx.savepoints {
11156                set.insert(*sub);
11157            }
11158            for sub in &ctx.released_sub_xids {
11159                set.insert(*sub);
11160            }
11161        }
11162        set
11163    }
11164
11165    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
11166    ///
11167    /// Callers use this to check whether a table name is a registered
11168    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
11169    /// scan it (`registry.scan(name)`). The read-path rewriter consults
11170    /// this before dispatching into native-collection lookup.
11171    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
11172        Arc::clone(&self.inner.foreign_tables)
11173    }
11174
11175    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
11176    pub fn is_rls_enabled(&self, table: &str) -> bool {
11177        self.inner.rls_enabled_tables.read().contains(table)
11178    }
11179
11180    /// Collect the USING predicates that apply to this `(table, role, action)`.
11181    ///
11182    /// Returned filters should be OR-combined (a row passes RLS when *any*
11183    /// matching policy accepts it) and then AND-ed into the query's WHERE.
11184    /// When the table has RLS disabled this returns an empty Vec — callers
11185    /// can fast-path back to the unfiltered read.
11186    pub fn matching_rls_policies(
11187        &self,
11188        table: &str,
11189        role: Option<&str>,
11190        action: crate::storage::query::ast::PolicyAction,
11191    ) -> Vec<crate::storage::query::ast::Filter> {
11192        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
11193        // callers that don't name a kind only see Table-scoped
11194        // policies (which is what execute SELECT / UPDATE / DELETE
11195        // expect).
11196        self.matching_rls_policies_for_kind(
11197            table,
11198            role,
11199            action,
11200            crate::storage::query::ast::PolicyTargetKind::Table,
11201        )
11202    }
11203
11204    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
11205    ///
11206    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
11207    /// `Vectors`, queue consumers request `Messages`, and timeseries
11208    /// range scans request `Points`. Policies tagged with a
11209    /// different kind are skipped so a graph-scoped policy doesn't
11210    /// accidentally gate a table SELECT on the same collection.
11211    pub fn matching_rls_policies_for_kind(
11212        &self,
11213        table: &str,
11214        role: Option<&str>,
11215        action: crate::storage::query::ast::PolicyAction,
11216        kind: crate::storage::query::ast::PolicyTargetKind,
11217    ) -> Vec<crate::storage::query::ast::Filter> {
11218        if !self.is_rls_enabled(table) {
11219            return Vec::new();
11220        }
11221        let policies = self.inner.rls_policies.read();
11222        policies
11223            .iter()
11224            .filter_map(|((t, _), p)| {
11225                if t != table {
11226                    return None;
11227                }
11228                // Kind gate — Table policies also apply to every
11229                // other kind *iff* the policy predicate evaluates
11230                // against entity fields that exist uniformly; the
11231                // caller's kind filter is the stricter check, so
11232                // match literally. Auto-tenancy policies stamp
11233                // Table and the caller passes the concrete kind —
11234                // we allow Table policies to apply cross-kind for
11235                // backwards compat.
11236                if p.target_kind != kind
11237                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
11238                {
11239                    return None;
11240                }
11241                // Action gate — `None` means "ALL" actions.
11242                if let Some(a) = p.action {
11243                    if a != action {
11244                        return None;
11245                    }
11246                }
11247                // Role gate — `None` means "any role".
11248                if let Some(p_role) = p.role.as_deref() {
11249                    match role {
11250                        Some(r) if r == p_role => {}
11251                        _ => return None,
11252                    }
11253                }
11254                Some((*p.using).clone())
11255            })
11256            .collect()
11257    }
11258
11259    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
11260        let store = self.inner.db.store();
11261        if let Some(stats) =
11262            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
11263        {
11264            crate::storage::query::planner::stats_catalog::persist_table_stats(
11265                store.as_ref(),
11266                &stats,
11267            );
11268        } else {
11269            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
11270        }
11271        self.invalidate_plan_cache();
11272    }
11273
11274    pub(crate) fn note_table_write(&self, table: &str) {
11275        // Skip the write lock when the table is already marked
11276        // dirty. With single-row UPDATEs in a loop this used to
11277        // grab the planner_dirty_tables write lock N times even
11278        // though the first call already flipped the flag.
11279        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
11280        if !already_dirty {
11281            self.inner
11282                .planner_dirty_tables
11283                .write()
11284                .insert(table.to_string());
11285        }
11286        self.invalidate_result_cache_for_table(table);
11287    }
11288
11289    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
11290    /// `RuntimeQueryResult` so callers over the SQL interface see the
11291    /// plan tree in the same shape a SELECT produces.
11292    ///
11293    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
11294    /// Nodes are walked depth-first; `depth` counts from 0 at the
11295    /// root so a text renderer can indent without re-walking.
11296    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
11297        let explain = self.explain_query(inner_sql)?;
11298
11299        let columns = vec![
11300            "op".to_string(),
11301            "source".to_string(),
11302            "est_rows".to_string(),
11303            "est_cost".to_string(),
11304            "depth".to_string(),
11305        ];
11306
11307        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
11308
11309        // Prepend `CteScan` markers when the query carried a leading
11310        // WITH clause. The CTE bodies are already inlined into the
11311        // main plan tree, but operators reading EXPLAIN need to see
11312        // which named CTEs were resolved — without this row the plan
11313        // would look indistinguishable from a hand-inlined query.
11314        for name in &explain.cte_materializations {
11315            use std::sync::Arc;
11316            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
11317            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
11318            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
11319            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
11320            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
11321            rec.set_arc(Arc::from("depth"), Value::Integer(0));
11322            records.push(rec);
11323        }
11324
11325        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
11326
11327        let result = crate::storage::query::unified::UnifiedResult {
11328            columns,
11329            records,
11330            stats: Default::default(),
11331            pre_serialized_json: None,
11332        };
11333
11334        Ok(RuntimeQueryResult {
11335            query: raw_query.to_string(),
11336            mode: explain.mode,
11337            statement: "explain",
11338            engine: "runtime-explain",
11339            result,
11340            affected_rows: 0,
11341            statement_type: "select",
11342            bookmark: None,
11343        })
11344    }
11345
11346    // -----------------------------------------------------------------
11347    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
11348    // -----------------------------------------------------------------
11349
11350    /// Project a `QueryExpr` to the (action, resource) pair the
11351    /// privilege engine cares about. Returns `Ok(())` for statements
11352    /// that don't touch user data (transaction control, SHOW, SET, etc.).
11353    pub(crate) fn check_query_privilege(
11354        &self,
11355        expr: &crate::storage::query::ast::QueryExpr,
11356    ) -> Result<(), String> {
11357        use crate::auth::privileges::{Action, AuthzContext, Resource};
11358        use crate::auth::UserId;
11359        use crate::storage::query::ast::QueryExpr;
11360
11361        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
11362        // The bootstrap path itself goes through `execute_query` so this
11363        // is the only sensible default; once auth is wired, the gate
11364        // becomes active.
11365        let auth_store = match self.inner.auth_store.read().clone() {
11366            Some(s) => s,
11367            None => return Ok(()),
11368        };
11369
11370        // Resolve principal + role from the thread-local identity.
11371        // Anonymous (no identity) is allowed to read the bootstrap path
11372        // only when auth_store says so; we treat missing identity as
11373        // platform-admin-equivalent here so embedded test harnesses
11374        // continue to work without setting an identity.
11375        let (username, role) = match current_auth_identity() {
11376            Some(p) => p,
11377            None => return Ok(()),
11378        };
11379        let tenant = current_tenant();
11380
11381        let ctx = AuthzContext {
11382            principal: &username,
11383            effective_role: role,
11384            tenant: tenant.as_deref(),
11385        };
11386        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
11387
11388        // Map QueryExpr → (Action, Resource).
11389        let (action, resource) = match expr {
11390            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
11391            QueryExpr::QueueSelect(q) => {
11392                return self.check_queue_op_privilege(
11393                    &auth_store,
11394                    &principal_id,
11395                    role,
11396                    tenant.as_deref(),
11397                    "queue:peek",
11398                    &q.queue,
11399                );
11400            }
11401            QueryExpr::QueueCommand(cmd) => {
11402                use crate::storage::query::ast::QueueCommand;
11403                let (queue, action_verb) = match cmd {
11404                    QueueCommand::Push { queue, .. } => (queue.as_str(), "queue:enqueue"),
11405                    QueueCommand::Pop { queue, .. }
11406                    | QueueCommand::GroupRead { queue, .. }
11407                    | QueueCommand::Claim { queue, .. } => (queue.as_str(), "queue:read"),
11408                    QueueCommand::Peek { queue, .. }
11409                    | QueueCommand::Len { queue }
11410                    | QueueCommand::Pending { queue, .. } => (queue.as_str(), "queue:peek"),
11411                    QueueCommand::Ack { queue, .. } => (queue.as_str(), "queue:ack"),
11412                    QueueCommand::Nack {
11413                        queue, delay_ms, ..
11414                    } => {
11415                        // Per-failure retry overrides re-shape retry
11416                        // behaviour for everyone draining the queue and
11417                        // gate on the dedicated `queue:retry` verb so
11418                        // operators can grant base NACK without granting
11419                        // the override capability.
11420                        let verb = if delay_ms.is_some() {
11421                            "queue:retry"
11422                        } else {
11423                            "queue:nack"
11424                        };
11425                        (queue.as_str(), verb)
11426                    }
11427                    QueueCommand::Purge { queue } => (queue.as_str(), "queue:purge"),
11428                    // `GroupCreate` is part of the consumer-setup
11429                    // surface — read-side, never destructive.
11430                    QueueCommand::GroupCreate { queue, .. } => (queue.as_str(), "queue:read"),
11431                    QueueCommand::Move { source, .. } => (source.as_str(), "queue:dlq:move"),
11432                };
11433                return self.check_queue_op_privilege(
11434                    &auth_store,
11435                    &principal_id,
11436                    role,
11437                    tenant.as_deref(),
11438                    action_verb,
11439                    queue,
11440                );
11441            }
11442            QueryExpr::Graph(g) => {
11443                // MATCH … RETURN is the explorer's pattern-traversal
11444                // surface — gate on `graph:traverse` (#757).
11445                self.check_graph_op_privilege(
11446                    &auth_store,
11447                    &principal_id,
11448                    role,
11449                    tenant.as_deref(),
11450                    "graph:traverse",
11451                )?;
11452                if auth_store.iam_authorization_enabled() {
11453                    self.check_graph_property_projection_privilege(
11454                        &auth_store,
11455                        &principal_id,
11456                        role,
11457                        tenant.as_deref(),
11458                        g,
11459                    )?;
11460                    return Ok(());
11461                }
11462                return Ok(());
11463            }
11464            QueryExpr::Path(_) => {
11465                // PATH FROM … TO … is a path-traversal query — gates
11466                // on `graph:traverse` like neighborhood/shortest-path
11467                // (#757).
11468                return self.check_graph_op_privilege(
11469                    &auth_store,
11470                    &principal_id,
11471                    role,
11472                    tenant.as_deref(),
11473                    "graph:traverse",
11474                );
11475            }
11476            QueryExpr::GraphCommand(cmd) => {
11477                use crate::storage::query::ast::GraphCommand;
11478                let action_verb = match cmd {
11479                    // Metadata / property reads.
11480                    GraphCommand::Properties { .. } => "graph:read",
11481                    // Traversal / pattern-walk surface.
11482                    GraphCommand::Neighborhood { .. }
11483                    | GraphCommand::Traverse { .. }
11484                    | GraphCommand::ShortestPath { .. } => "graph:traverse",
11485                    // Analytics algorithms — expensive enough that Red
11486                    // UI needs to gate the runner independently of
11487                    // ordinary traversal.
11488                    GraphCommand::Centrality { .. }
11489                    | GraphCommand::Community { .. }
11490                    | GraphCommand::Components { .. }
11491                    | GraphCommand::Cycles { .. }
11492                    | GraphCommand::Clustering
11493                    | GraphCommand::TopologicalSort => "graph:algorithm:run",
11494                };
11495                return self.check_graph_op_privilege(
11496                    &auth_store,
11497                    &principal_id,
11498                    role,
11499                    tenant.as_deref(),
11500                    action_verb,
11501                );
11502            }
11503            QueryExpr::Vector(v) => {
11504                if auth_store.iam_authorization_enabled() {
11505                    self.check_vector_op_privilege(
11506                        &auth_store,
11507                        &principal_id,
11508                        role,
11509                        tenant.as_deref(),
11510                        "vector:search",
11511                        &v.collection,
11512                    )?;
11513                    self.check_table_like_column_projection_privilege(
11514                        &auth_store,
11515                        &principal_id,
11516                        role,
11517                        tenant.as_deref(),
11518                        &v.collection,
11519                        &["content".to_string()],
11520                    )?;
11521                    return Ok(());
11522                }
11523                return Ok(());
11524            }
11525            QueryExpr::SearchCommand(cmd) => {
11526                use crate::storage::query::ast::SearchCommand;
11527                if auth_store.iam_authorization_enabled() {
11528                    // `SEARCH SIMILAR [..] COLLECTION <c>` and `SEARCH
11529                    // HYBRID ... COLLECTION <c>` are the same UI
11530                    // affordances as `VECTOR SEARCH` / hybrid joins —
11531                    // Red UI must see the same `vector:search` envelope
11532                    // so a single toolbar grant is sufficient.
11533                    let collection = match cmd {
11534                        SearchCommand::Similar { collection, .. }
11535                        | SearchCommand::Hybrid { collection, .. } => Some(collection.as_str()),
11536                        _ => None,
11537                    };
11538                    if let Some(c) = collection {
11539                        self.check_vector_op_privilege(
11540                            &auth_store,
11541                            &principal_id,
11542                            role,
11543                            tenant.as_deref(),
11544                            "vector:search",
11545                            c,
11546                        )?;
11547                        return Ok(());
11548                    }
11549                }
11550                return Ok(());
11551            }
11552            QueryExpr::Hybrid(h) => {
11553                if auth_store.iam_authorization_enabled() {
11554                    // The vector half of a hybrid search is gated under
11555                    // the same `vector:search` verb as a standalone
11556                    // VECTOR SEARCH — Red UI's hybrid-search toolbar
11557                    // must surface the same UI-safe denial envelope
11558                    // when the principal lacks the grant. The
11559                    // structured half is dispatched to its own gate via
11560                    // the inner query during execution.
11561                    self.check_vector_op_privilege(
11562                        &auth_store,
11563                        &principal_id,
11564                        role,
11565                        tenant.as_deref(),
11566                        "vector:search",
11567                        &h.vector.collection,
11568                    )?;
11569                    return Ok(());
11570                }
11571                return Ok(());
11572            }
11573            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
11574            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
11575            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
11576            // Joins inherit the read privilege from any constituent
11577            // table — for now we emit a single Select on the database
11578            // (admins bypass; non-admins need a Database/Schema grant).
11579            QueryExpr::Join(_) => (Action::Select, Resource::Database),
11580            // GRANT / REVOKE / ALTER USER are authority statements;
11581            // require Admin (the helper methods enforce).
11582            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
11583                return if role == crate::auth::Role::Admin {
11584                    Ok(())
11585                } else {
11586                    Err(format!(
11587                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
11588                        username, role
11589                    ))
11590                };
11591            }
11592            QueryExpr::CreateIamPolicy { id, .. } => {
11593                return self.check_policy_management_privilege(
11594                    &auth_store,
11595                    &principal_id,
11596                    role,
11597                    tenant.as_deref(),
11598                    "policy:put",
11599                    "policy",
11600                    id,
11601                );
11602            }
11603            QueryExpr::DropIamPolicy { id } => {
11604                return self.check_policy_management_privilege(
11605                    &auth_store,
11606                    &principal_id,
11607                    role,
11608                    tenant.as_deref(),
11609                    "policy:drop",
11610                    "policy",
11611                    id,
11612                );
11613            }
11614            QueryExpr::AttachPolicy { policy_id, .. } => {
11615                return self.check_policy_management_privilege(
11616                    &auth_store,
11617                    &principal_id,
11618                    role,
11619                    tenant.as_deref(),
11620                    "policy:attach",
11621                    "policy",
11622                    policy_id,
11623                );
11624            }
11625            QueryExpr::DetachPolicy { policy_id, .. } => {
11626                return self.check_policy_management_privilege(
11627                    &auth_store,
11628                    &principal_id,
11629                    role,
11630                    tenant.as_deref(),
11631                    "policy:detach",
11632                    "policy",
11633                    policy_id,
11634                );
11635            }
11636            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
11637                return Ok(());
11638            }
11639            QueryExpr::SimulatePolicy { .. } => {
11640                return self.check_policy_management_privilege(
11641                    &auth_store,
11642                    &principal_id,
11643                    role,
11644                    tenant.as_deref(),
11645                    "policy:simulate",
11646                    "policy",
11647                    "*",
11648                );
11649            }
11650            QueryExpr::LintPolicy { .. } => {
11651                // Linting is a read-only inspection — gate it like
11652                // simulate (policy management role).
11653                return self.check_policy_management_privilege(
11654                    &auth_store,
11655                    &principal_id,
11656                    role,
11657                    tenant.as_deref(),
11658                    "policy:simulate",
11659                    "policy",
11660                    "*",
11661                );
11662            }
11663            QueryExpr::MigratePolicyMode { dry_run, .. } => {
11664                // DRY RUN is a pre-flight inspection (policy:simulate).
11665                // The actual mode flip is a privileged mutation under
11666                // the policy:put action (it persists a new enforcement
11667                // mode to the vault KV through `set_enforcement_mode`).
11668                let action = if *dry_run {
11669                    "policy:simulate"
11670                } else {
11671                    "policy:put"
11672                };
11673                return self.check_policy_management_privilege(
11674                    &auth_store,
11675                    &principal_id,
11676                    role,
11677                    tenant.as_deref(),
11678                    action,
11679                    "policy",
11680                    "*",
11681                );
11682            }
11683            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
11684            // when IAM mode is active. Other DDL stays role-only for now.
11685            QueryExpr::DropTable(q) => {
11686                return self.check_ddl_collection_privilege(
11687                    &auth_store,
11688                    &principal_id,
11689                    role,
11690                    tenant.as_deref(),
11691                    &username,
11692                    "drop",
11693                    &q.name,
11694                );
11695            }
11696            QueryExpr::DropGraph(q) => {
11697                return self.check_ddl_collection_privilege(
11698                    &auth_store,
11699                    &principal_id,
11700                    role,
11701                    tenant.as_deref(),
11702                    &username,
11703                    "drop",
11704                    &q.name,
11705                );
11706            }
11707            QueryExpr::DropVector(q) => {
11708                return self.check_ddl_collection_privilege(
11709                    &auth_store,
11710                    &principal_id,
11711                    role,
11712                    tenant.as_deref(),
11713                    &username,
11714                    "drop",
11715                    &q.name,
11716                );
11717            }
11718            QueryExpr::DropDocument(q) => {
11719                return self.check_ddl_collection_privilege(
11720                    &auth_store,
11721                    &principal_id,
11722                    role,
11723                    tenant.as_deref(),
11724                    &username,
11725                    "drop",
11726                    &q.name,
11727                );
11728            }
11729            QueryExpr::DropKv(q) => {
11730                return self.check_ddl_collection_privilege(
11731                    &auth_store,
11732                    &principal_id,
11733                    role,
11734                    tenant.as_deref(),
11735                    &username,
11736                    "drop",
11737                    &q.name,
11738                );
11739            }
11740            QueryExpr::DropCollection(q) => {
11741                return self.check_ddl_collection_privilege(
11742                    &auth_store,
11743                    &principal_id,
11744                    role,
11745                    tenant.as_deref(),
11746                    &username,
11747                    "drop",
11748                    &q.name,
11749                );
11750            }
11751            QueryExpr::Truncate(q) => {
11752                return self.check_ddl_collection_privilege(
11753                    &auth_store,
11754                    &principal_id,
11755                    role,
11756                    tenant.as_deref(),
11757                    &username,
11758                    "truncate",
11759                    &q.name,
11760                );
11761            }
11762            // Remaining DDL (#753) — hybrid policy-aware gate. Specific
11763            // create/alter/drop verbs gate operations with a clear
11764            // per-collection target so Red UI can author fine-grained
11765            // policies (`create on collection:users`). Namespace-level
11766            // and grouped DDL fall back to broader `schema:admin` /
11767            // `schema:write` verbs against a `schema:<name>` resource.
11768            // All branches share the [`check_ddl_object_privilege`]
11769            // helper so allows / denies produce the same structured
11770            // "principal=… action=… resource=<kind>:<name> denied by
11771            // IAM policy" reason the Red UI security read contracts
11772            // (#740) already render.
11773            QueryExpr::CreateTable(q) => {
11774                return self.check_ddl_object_privilege(
11775                    &auth_store,
11776                    &principal_id,
11777                    role,
11778                    tenant.as_deref(),
11779                    &username,
11780                    "create",
11781                    "collection",
11782                    &q.name,
11783                    crate::auth::Role::Write,
11784                );
11785            }
11786            QueryExpr::CreateCollection(q) => {
11787                return self.check_ddl_object_privilege(
11788                    &auth_store,
11789                    &principal_id,
11790                    role,
11791                    tenant.as_deref(),
11792                    &username,
11793                    "create",
11794                    "collection",
11795                    &q.name,
11796                    crate::auth::Role::Write,
11797                );
11798            }
11799            QueryExpr::CreateVector(q) => {
11800                return self.check_ddl_object_privilege(
11801                    &auth_store,
11802                    &principal_id,
11803                    role,
11804                    tenant.as_deref(),
11805                    &username,
11806                    "create",
11807                    "collection",
11808                    &q.name,
11809                    crate::auth::Role::Write,
11810                );
11811            }
11812            QueryExpr::AlterTable(q) => {
11813                return self.check_ddl_object_privilege(
11814                    &auth_store,
11815                    &principal_id,
11816                    role,
11817                    tenant.as_deref(),
11818                    &username,
11819                    "alter",
11820                    "collection",
11821                    &q.name,
11822                    crate::auth::Role::Write,
11823                );
11824            }
11825            QueryExpr::CreateIndex(q) => {
11826                return self.check_ddl_object_privilege(
11827                    &auth_store,
11828                    &principal_id,
11829                    role,
11830                    tenant.as_deref(),
11831                    &username,
11832                    "create",
11833                    "collection",
11834                    &q.table,
11835                    crate::auth::Role::Write,
11836                );
11837            }
11838            QueryExpr::DropIndex(q) => {
11839                return self.check_ddl_object_privilege(
11840                    &auth_store,
11841                    &principal_id,
11842                    role,
11843                    tenant.as_deref(),
11844                    &username,
11845                    "drop",
11846                    "collection",
11847                    &q.table,
11848                    crate::auth::Role::Write,
11849                );
11850            }
11851            QueryExpr::CreateSchema(q) => {
11852                return self.check_ddl_object_privilege(
11853                    &auth_store,
11854                    &principal_id,
11855                    role,
11856                    tenant.as_deref(),
11857                    &username,
11858                    "schema:admin",
11859                    "schema",
11860                    &q.name,
11861                    crate::auth::Role::Admin,
11862                );
11863            }
11864            QueryExpr::DropSchema(q) => {
11865                return self.check_ddl_object_privilege(
11866                    &auth_store,
11867                    &principal_id,
11868                    role,
11869                    tenant.as_deref(),
11870                    &username,
11871                    "schema:admin",
11872                    "schema",
11873                    &q.name,
11874                    crate::auth::Role::Admin,
11875                );
11876            }
11877            QueryExpr::CreateSequence(q) => {
11878                return self.check_ddl_object_privilege(
11879                    &auth_store,
11880                    &principal_id,
11881                    role,
11882                    tenant.as_deref(),
11883                    &username,
11884                    "create",
11885                    "collection",
11886                    &q.name,
11887                    crate::auth::Role::Write,
11888                );
11889            }
11890            QueryExpr::DropSequence(q) => {
11891                return self.check_ddl_object_privilege(
11892                    &auth_store,
11893                    &principal_id,
11894                    role,
11895                    tenant.as_deref(),
11896                    &username,
11897                    "drop",
11898                    "collection",
11899                    &q.name,
11900                    crate::auth::Role::Write,
11901                );
11902            }
11903            QueryExpr::CreateView(q) => {
11904                return self.check_ddl_object_privilege(
11905                    &auth_store,
11906                    &principal_id,
11907                    role,
11908                    tenant.as_deref(),
11909                    &username,
11910                    "create",
11911                    "collection",
11912                    &q.name,
11913                    crate::auth::Role::Write,
11914                );
11915            }
11916            QueryExpr::DropView(q) => {
11917                return self.check_ddl_object_privilege(
11918                    &auth_store,
11919                    &principal_id,
11920                    role,
11921                    tenant.as_deref(),
11922                    &username,
11923                    "drop",
11924                    "collection",
11925                    &q.name,
11926                    crate::auth::Role::Write,
11927                );
11928            }
11929            QueryExpr::RefreshMaterializedView(q) => {
11930                return self.check_ddl_object_privilege(
11931                    &auth_store,
11932                    &principal_id,
11933                    role,
11934                    tenant.as_deref(),
11935                    &username,
11936                    "alter",
11937                    "collection",
11938                    &q.name,
11939                    crate::auth::Role::Write,
11940                );
11941            }
11942            QueryExpr::CreatePolicy(q) => {
11943                return self.check_ddl_object_privilege(
11944                    &auth_store,
11945                    &principal_id,
11946                    role,
11947                    tenant.as_deref(),
11948                    &username,
11949                    "create",
11950                    "collection",
11951                    &q.table,
11952                    crate::auth::Role::Write,
11953                );
11954            }
11955            QueryExpr::DropPolicy(q) => {
11956                return self.check_ddl_object_privilege(
11957                    &auth_store,
11958                    &principal_id,
11959                    role,
11960                    tenant.as_deref(),
11961                    &username,
11962                    "drop",
11963                    "collection",
11964                    &q.table,
11965                    crate::auth::Role::Write,
11966                );
11967            }
11968            QueryExpr::CreateServer(q) => {
11969                return self.check_ddl_object_privilege(
11970                    &auth_store,
11971                    &principal_id,
11972                    role,
11973                    tenant.as_deref(),
11974                    &username,
11975                    "schema:admin",
11976                    "schema",
11977                    &q.name,
11978                    crate::auth::Role::Admin,
11979                );
11980            }
11981            QueryExpr::DropServer(q) => {
11982                return self.check_ddl_object_privilege(
11983                    &auth_store,
11984                    &principal_id,
11985                    role,
11986                    tenant.as_deref(),
11987                    &username,
11988                    "schema:admin",
11989                    "schema",
11990                    &q.name,
11991                    crate::auth::Role::Admin,
11992                );
11993            }
11994            QueryExpr::CreateForeignTable(q) => {
11995                return self.check_ddl_object_privilege(
11996                    &auth_store,
11997                    &principal_id,
11998                    role,
11999                    tenant.as_deref(),
12000                    &username,
12001                    "schema:write",
12002                    "schema",
12003                    &q.name,
12004                    crate::auth::Role::Write,
12005                );
12006            }
12007            QueryExpr::DropForeignTable(q) => {
12008                return self.check_ddl_object_privilege(
12009                    &auth_store,
12010                    &principal_id,
12011                    role,
12012                    tenant.as_deref(),
12013                    &username,
12014                    "schema:write",
12015                    "schema",
12016                    &q.name,
12017                    crate::auth::Role::Write,
12018                );
12019            }
12020            QueryExpr::CreateTimeSeries(q) => {
12021                return self.check_ddl_object_privilege(
12022                    &auth_store,
12023                    &principal_id,
12024                    role,
12025                    tenant.as_deref(),
12026                    &username,
12027                    "create",
12028                    "collection",
12029                    &q.name,
12030                    crate::auth::Role::Write,
12031                );
12032            }
12033            QueryExpr::CreateMetric(q) => {
12034                return self.check_ddl_object_privilege(
12035                    &auth_store,
12036                    &principal_id,
12037                    role,
12038                    tenant.as_deref(),
12039                    &username,
12040                    "create",
12041                    "collection",
12042                    &q.path,
12043                    crate::auth::Role::Write,
12044                );
12045            }
12046            QueryExpr::AlterMetric(q) => {
12047                return self.check_ddl_object_privilege(
12048                    &auth_store,
12049                    &principal_id,
12050                    role,
12051                    tenant.as_deref(),
12052                    &username,
12053                    "alter",
12054                    "collection",
12055                    &q.path,
12056                    crate::auth::Role::Write,
12057                );
12058            }
12059            QueryExpr::CreateSlo(q) => {
12060                return self.check_ddl_object_privilege(
12061                    &auth_store,
12062                    &principal_id,
12063                    role,
12064                    tenant.as_deref(),
12065                    &username,
12066                    "create",
12067                    "collection",
12068                    &q.path,
12069                    crate::auth::Role::Write,
12070                );
12071            }
12072            QueryExpr::DropTimeSeries(q) => {
12073                return self.check_ddl_object_privilege(
12074                    &auth_store,
12075                    &principal_id,
12076                    role,
12077                    tenant.as_deref(),
12078                    &username,
12079                    "drop",
12080                    "collection",
12081                    &q.name,
12082                    crate::auth::Role::Write,
12083                );
12084            }
12085            QueryExpr::CreateQueue(q) => {
12086                return self.check_ddl_object_privilege(
12087                    &auth_store,
12088                    &principal_id,
12089                    role,
12090                    tenant.as_deref(),
12091                    &username,
12092                    "create",
12093                    "collection",
12094                    &q.name,
12095                    crate::auth::Role::Write,
12096                );
12097            }
12098            QueryExpr::AlterQueue(q) => {
12099                return self.check_ddl_object_privilege(
12100                    &auth_store,
12101                    &principal_id,
12102                    role,
12103                    tenant.as_deref(),
12104                    &username,
12105                    "alter",
12106                    "collection",
12107                    &q.name,
12108                    crate::auth::Role::Write,
12109                );
12110            }
12111            QueryExpr::DropQueue(q) => {
12112                return self.check_ddl_object_privilege(
12113                    &auth_store,
12114                    &principal_id,
12115                    role,
12116                    tenant.as_deref(),
12117                    &username,
12118                    "drop",
12119                    "collection",
12120                    &q.name,
12121                    crate::auth::Role::Write,
12122                );
12123            }
12124            QueryExpr::CreateTree(q) => {
12125                return self.check_ddl_object_privilege(
12126                    &auth_store,
12127                    &principal_id,
12128                    role,
12129                    tenant.as_deref(),
12130                    &username,
12131                    "create",
12132                    "collection",
12133                    &q.collection,
12134                    crate::auth::Role::Write,
12135                );
12136            }
12137            QueryExpr::DropTree(q) => {
12138                return self.check_ddl_object_privilege(
12139                    &auth_store,
12140                    &principal_id,
12141                    role,
12142                    tenant.as_deref(),
12143                    &username,
12144                    "drop",
12145                    "collection",
12146                    &q.collection,
12147                    crate::auth::Role::Write,
12148                );
12149            }
12150            // Migration DDL — CREATE MIGRATION is grouped DDL on the
12151            // schema namespace; uses the `schema:write` fallback verb
12152            // (no obvious per-collection target).
12153            QueryExpr::CreateMigration(q) => {
12154                return self.check_ddl_object_privilege(
12155                    &auth_store,
12156                    &principal_id,
12157                    role,
12158                    tenant.as_deref(),
12159                    &username,
12160                    "schema:write",
12161                    "schema",
12162                    &q.name,
12163                    crate::auth::Role::Write,
12164                );
12165            }
12166            // APPLY / ROLLBACK change data and schema — require Admin.
12167            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
12168                return if role == crate::auth::Role::Admin {
12169                    Ok(())
12170                } else {
12171                    Err(format!(
12172                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
12173                        username, role
12174                    ))
12175                };
12176            }
12177            // EXPLAIN MIGRATION is read-only — any authenticated principal.
12178            QueryExpr::ExplainMigration(_) => return Ok(()),
12179            // Everything else (SET, SHOW, transaction control, graph
12180            // commands, queue/tree commands, MaintenanceCommand …)
12181            // is allowed for any authenticated principal.
12182            _ => return Ok(()),
12183        };
12184
12185        if auth_store.iam_authorization_enabled() {
12186            let iam_action = legacy_action_to_iam(action);
12187            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
12188            let iam_ctx = runtime_iam_context(
12189                role,
12190                tenant.as_deref(),
12191                auth_store.principal_is_system_owned(&principal_id),
12192            );
12193            if !auth_store.check_policy_authz_with_role(
12194                &principal_id,
12195                iam_action,
12196                &iam_resource,
12197                &iam_ctx,
12198                role,
12199            ) {
12200                return Err(format!(
12201                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
12202                    username, iam_action, iam_resource.kind, iam_resource.name
12203                ));
12204            }
12205
12206            if let QueryExpr::Table(table) = expr {
12207                self.check_table_column_projection_privilege(
12208                    &auth_store,
12209                    &principal_id,
12210                    &iam_ctx,
12211                    table,
12212                )?;
12213            }
12214
12215            if let QueryExpr::Update(update) = expr {
12216                let columns = update_set_target_columns(update);
12217                if !columns.is_empty() {
12218                    let request = column_access_request_for_table_update(&update.table, columns);
12219                    let outcome =
12220                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
12221                    if let Some(denied) = outcome.first_denied_column() {
12222                        return Err(format!(
12223                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
12224                            username, iam_action, denied.resource.kind, denied.resource.name
12225                        ));
12226                    }
12227                    if !outcome.allowed() {
12228                        return Err(format!(
12229                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
12230                            username,
12231                            iam_action,
12232                            outcome.table_resource.kind,
12233                            outcome.table_resource.name
12234                        ));
12235                    }
12236                }
12237
12238                if let Some(columns) = update_returning_columns_for_policy(self, update) {
12239                    let request = column_access_request_for_table_select(&update.table, columns);
12240                    let outcome =
12241                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
12242                    if let Some(denied) = outcome.first_denied_column() {
12243                        return Err(format!(
12244                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
12245                            username, denied.resource.kind, denied.resource.name
12246                        ));
12247                    }
12248                    if !outcome.allowed() {
12249                        return Err(format!(
12250                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
12251                            username, outcome.table_resource.kind, outcome.table_resource.name
12252                        ));
12253                    }
12254                }
12255            }
12256
12257            Ok(())
12258        } else {
12259            auth_store
12260                .check_grant(&ctx, action, &resource)
12261                .map_err(|e| e.to_string())
12262        }
12263    }
12264
12265    fn check_table_column_projection_privilege(
12266        &self,
12267        auth_store: &Arc<crate::auth::store::AuthStore>,
12268        principal: &crate::auth::UserId,
12269        ctx: &crate::auth::policies::EvalContext,
12270        table: &crate::storage::query::ast::TableQuery,
12271    ) -> Result<(), String> {
12272        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
12273
12274        let columns = requested_table_columns_for_policy(table);
12275        if columns.is_empty() {
12276            return Ok(());
12277        }
12278
12279        let request = ColumnAccessRequest::select(table.table.clone(), columns);
12280        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
12281        if outcome.allowed() {
12282            return Ok(());
12283        }
12284
12285        if !matches!(
12286            outcome.table_decision,
12287            crate::auth::policies::Decision::Allow { .. }
12288                | crate::auth::policies::Decision::AdminBypass
12289        ) {
12290            return Err(format!(
12291                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
12292                principal, outcome.table_resource.kind, outcome.table_resource.name
12293            ));
12294        }
12295
12296        let denied = outcome
12297            .first_denied_column()
12298            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
12299        match denied {
12300            Some(decision) => Err(format!(
12301                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
12302                principal, decision.resource.kind, decision.resource.name
12303            )),
12304            None => Ok(()),
12305        }
12306    }
12307
12308    fn check_graph_property_projection_privilege(
12309        &self,
12310        auth_store: &Arc<crate::auth::store::AuthStore>,
12311        principal: &crate::auth::UserId,
12312        role: crate::auth::Role,
12313        tenant: Option<&str>,
12314        query: &crate::storage::query::ast::GraphQuery,
12315    ) -> Result<(), String> {
12316        let columns = explicit_graph_projection_properties(query);
12317        if columns.is_empty() {
12318            return Ok(());
12319        }
12320        self.check_table_like_column_projection_privilege(
12321            auth_store, principal, role, tenant, "graph", &columns,
12322        )
12323    }
12324
12325    fn check_table_like_column_projection_privilege(
12326        &self,
12327        auth_store: &Arc<crate::auth::store::AuthStore>,
12328        principal: &crate::auth::UserId,
12329        role: crate::auth::Role,
12330        tenant: Option<&str>,
12331        table: &str,
12332        columns: &[String],
12333    ) -> Result<(), String> {
12334        let iam_ctx = runtime_iam_context(
12335            role,
12336            tenant,
12337            auth_store.principal_is_system_owned(principal),
12338        );
12339        let request =
12340            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
12341        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
12342        if outcome.allowed() {
12343            return Ok(());
12344        }
12345        let denied = outcome
12346            .first_denied_column()
12347            .map(|d| d.resource.name.clone())
12348            .unwrap_or_else(|| format!("{table}.<unknown>"));
12349        Err(format!(
12350            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
12351            principal, denied
12352        ))
12353    }
12354
12355    fn check_policy_management_privilege(
12356        &self,
12357        auth_store: &Arc<crate::auth::store::AuthStore>,
12358        principal: &crate::auth::UserId,
12359        role: crate::auth::Role,
12360        tenant: Option<&str>,
12361        action: &str,
12362        resource_kind: &str,
12363        resource_name: &str,
12364    ) -> Result<(), String> {
12365        let ctx = runtime_iam_context(
12366            role,
12367            tenant,
12368            auth_store.principal_is_system_owned(principal),
12369        );
12370
12371        if !auth_store.iam_authorization_enabled() {
12372            return if role == crate::auth::Role::Admin {
12373                Ok(())
12374            } else {
12375                Err(format!(
12376                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
12377                    principal, role
12378                ))
12379            };
12380        }
12381
12382        let mut resource = crate::auth::policies::ResourceRef::new(
12383            resource_kind.to_string(),
12384            resource_name.to_string(),
12385        );
12386        if let Some(t) = tenant {
12387            resource = resource.with_tenant(t.to_string());
12388        }
12389        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12390            Ok(())
12391        } else {
12392            Err(format!(
12393                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
12394                principal, action, resource.kind, resource.name
12395            ))
12396        }
12397    }
12398
12399    fn check_managed_config_write_for_set_config(&self, key: &str) -> RedDBResult<()> {
12400        let Some(auth_store) = self.inner.auth_store.read().clone() else {
12401            return Ok(());
12402        };
12403        let (username, role) = current_auth_identity()
12404            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
12405        let tenant = current_tenant();
12406        let principal = crate::auth::UserId::from_parts(tenant.as_deref(), &username);
12407        let ctx = runtime_iam_context(
12408            role,
12409            tenant.as_deref(),
12410            auth_store.principal_is_system_owned(&principal),
12411        );
12412        let gate = crate::auth::managed_config::ManagedConfigGate::new(
12413            self.inner.config_registry.as_ref(),
12414        );
12415        match gate.check_write(&auth_store, &principal, &ctx, key) {
12416            crate::auth::managed_config::ManagedConfigDecision::PassThrough { .. }
12417            | crate::auth::managed_config::ManagedConfigDecision::Allow { .. } => Ok(()),
12418            crate::auth::managed_config::ManagedConfigDecision::Deny { reason, .. } => {
12419                Err(RedDBError::Query(format!(
12420                    "permission denied: managed config mutation blocked for `{key}`: {reason}"
12421                )))
12422            }
12423        }
12424    }
12425
12426    /// IAM privilege check for a granular queue operation (issue #755 /
12427    /// PRD #735).
12428    ///
12429    /// Each queue operation maps to a stable verb in
12430    /// [`crate::auth::action_catalog`] (`queue:enqueue`, `queue:read`,
12431    /// `queue:peek`, `queue:ack`, `queue:nack`, `queue:retry`,
12432    /// `queue:dlq:move`, `queue:purge`, `queue:presence:read`). The
12433    /// resource is `queue:<name>` scoped to the current tenant. In
12434    /// legacy mode (no IAM authorization configured) the check is a
12435    /// no-op — the role gates in `execute_queue_command` still apply
12436    /// and the legacy `select` / `write` grant table continues to
12437    /// govern queue access. In IAM-enabled mode a missing granular
12438    /// grant yields a structured, UI-safe error of the form
12439    /// `principal=… action=queue:… resource=queue:… denied by IAM
12440    /// policy` so Red UI can surface the failing toolbar action.
12441    fn check_queue_op_privilege(
12442        &self,
12443        auth_store: &Arc<crate::auth::store::AuthStore>,
12444        principal: &crate::auth::UserId,
12445        role: crate::auth::Role,
12446        tenant: Option<&str>,
12447        action: &str,
12448        queue: &str,
12449    ) -> Result<(), String> {
12450        if !auth_store.iam_authorization_enabled() {
12451            return Ok(());
12452        }
12453        let mut resource =
12454            crate::auth::policies::ResourceRef::new("queue".to_string(), queue.to_string());
12455        if let Some(t) = tenant {
12456            resource = resource.with_tenant(t.to_string());
12457        }
12458        let ctx = runtime_iam_context(
12459            role,
12460            tenant,
12461            auth_store.principal_is_system_owned(principal),
12462        );
12463        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12464            Ok(())
12465        } else {
12466            Err(format!(
12467                "principal=`{}` action=`{}` resource=`queue:{}` denied by IAM policy",
12468                principal, action, queue
12469            ))
12470        }
12471    }
12472
12473    /// IAM privilege check for a graph operation (issue #757 / PRD
12474    /// #735).
12475    ///
12476    /// Each graph operation maps to a stable verb in
12477    /// [`crate::auth::action_catalog`] — `graph:read` for
12478    /// metadata/property lookups, `graph:traverse` for MATCH / PATH /
12479    /// NEIGHBORHOOD / TRAVERSE / SHORTEST_PATH, and
12480    /// `graph:algorithm:run` for analytics algorithms (centrality,
12481    /// community, components, cycles, clustering, topological sort).
12482    /// The resource is `graph:*` scoped to the current tenant — the
12483    /// runtime today operates on a singleton graph store so the name
12484    /// has no concrete identifier; policies grant the explorer
12485    /// surface by writing `graph:*` as the resource pattern.
12486    ///
12487    /// In legacy mode (no IAM authorization configured) the check is
12488    /// a no-op so the existing role-based defaults continue to
12489    /// govern. In IAM-enabled mode a missing grant produces the
12490    /// UI-safe envelope `principal=… action=graph:… resource=graph:*
12491    /// denied by IAM policy` Red UI keys on.
12492    fn check_graph_op_privilege(
12493        &self,
12494        auth_store: &Arc<crate::auth::store::AuthStore>,
12495        principal: &crate::auth::UserId,
12496        role: crate::auth::Role,
12497        tenant: Option<&str>,
12498        action: &str,
12499    ) -> Result<(), String> {
12500        if !auth_store.iam_authorization_enabled() {
12501            return Ok(());
12502        }
12503        let mut resource =
12504            crate::auth::policies::ResourceRef::new("graph".to_string(), "*".to_string());
12505        if let Some(t) = tenant {
12506            resource = resource.with_tenant(t.to_string());
12507        }
12508        let ctx = runtime_iam_context(
12509            role,
12510            tenant,
12511            auth_store.principal_is_system_owned(principal),
12512        );
12513        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12514            Ok(())
12515        } else {
12516            Err(format!(
12517                "principal=`{}` action=`{}` resource=`graph:*` denied by IAM policy",
12518                principal, action
12519            ))
12520        }
12521    }
12522
12523    /// IAM privilege check for a granular vector operation (issue #756
12524    /// / PRD #735).
12525    ///
12526    /// Each vector operation maps to a stable verb in
12527    /// [`crate::auth::action_catalog`] (`vector:read`, `vector:search`,
12528    /// `vector:artifact:read`, `vector:artifact:rebuild`,
12529    /// `vector:admin`). The resource is `vector:<collection>` scoped to
12530    /// the current tenant. In legacy mode (no IAM authorization
12531    /// configured) the check is a no-op — the role gates and existing
12532    /// `select` / column-projection grants continue to govern access.
12533    /// In IAM-enabled mode a missing granular grant yields a
12534    /// structured, UI-safe error of the form `principal=…
12535    /// action=vector:… resource=vector:… denied by IAM policy` so Red
12536    /// UI can surface the failing toolbar action.
12537    fn check_vector_op_privilege(
12538        &self,
12539        auth_store: &Arc<crate::auth::store::AuthStore>,
12540        principal: &crate::auth::UserId,
12541        role: crate::auth::Role,
12542        tenant: Option<&str>,
12543        action: &str,
12544        collection: &str,
12545    ) -> Result<(), String> {
12546        if !auth_store.iam_authorization_enabled() {
12547            return Ok(());
12548        }
12549        let mut resource =
12550            crate::auth::policies::ResourceRef::new("vector".to_string(), collection.to_string());
12551        if let Some(t) = tenant {
12552            resource = resource.with_tenant(t.to_string());
12553        }
12554        let ctx = runtime_iam_context(
12555            role,
12556            tenant,
12557            auth_store.principal_is_system_owned(principal),
12558        );
12559        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12560            Ok(())
12561        } else {
12562            Err(format!(
12563                "principal=`{}` action=`{}` resource=`vector:{}` denied by IAM policy",
12564                principal, action, collection
12565            ))
12566        }
12567    }
12568
12569    /// IAM privilege check for DROP / TRUNCATE on a named collection.
12570    ///
12571    /// Delegates to [`check_ddl_object_privilege`] with `resource_kind =
12572    /// "collection"`. Kept as a thin wrapper so the existing DROP/TRUNCATE
12573    /// callsites stay readable.
12574    fn check_ddl_collection_privilege(
12575        &self,
12576        auth_store: &Arc<crate::auth::store::AuthStore>,
12577        principal: &crate::auth::UserId,
12578        role: crate::auth::Role,
12579        tenant: Option<&str>,
12580        username: &str,
12581        action: &str,
12582        collection: &str,
12583    ) -> Result<(), String> {
12584        self.check_ddl_object_privilege(
12585            auth_store,
12586            principal,
12587            role,
12588            tenant,
12589            username,
12590            action,
12591            "collection",
12592            collection,
12593            crate::auth::Role::Write,
12594        )
12595    }
12596
12597    /// Generalised IAM privilege check for DDL on a named object.
12598    ///
12599    /// `action` is the stable verb advertised through the action catalog
12600    /// (`create`, `alter`, `drop`, `truncate`, `schema:write`,
12601    /// `schema:admin`). `resource_kind` / `resource_name` form the policy
12602    /// resource (`collection:<name>`, `schema:<name>`). `min_role` is the
12603    /// legacy gate when IAM is not yet enabled.
12604    ///
12605    /// Behaviour:
12606    /// * Role below `min_role` → structured "principal=… role=… cannot
12607    ///   issue DDL" denial, audit recorded.
12608    /// * IAM disabled → audit-record success and allow (legacy path).
12609    /// * IAM enabled → call `check_policy_authz_with_role`. Explicit Deny
12610    ///   and DefaultDeny in PolicyOnly mode both produce a UI-safe
12611    ///   "principal=… action=… resource=<kind>:<name> denied by IAM
12612    ///   policy" string. Explicit Allow and the LegacyRbac fallback
12613    ///   allow the action.
12614    #[allow(clippy::too_many_arguments)]
12615    fn check_ddl_object_privilege(
12616        &self,
12617        auth_store: &Arc<crate::auth::store::AuthStore>,
12618        principal: &crate::auth::UserId,
12619        role: crate::auth::Role,
12620        tenant: Option<&str>,
12621        username: &str,
12622        action: &str,
12623        resource_kind: &str,
12624        resource_name: &str,
12625        min_role: crate::auth::Role,
12626    ) -> Result<(), String> {
12627        if role < min_role {
12628            let msg = format!(
12629                "principal=`{}` role=`{:?}` cannot issue DDL action=`{}` resource=`{}:{}`",
12630                username, role, action, resource_kind, resource_name
12631            );
12632            self.inner.audit_log.record(
12633                action,
12634                username,
12635                resource_name,
12636                "denied",
12637                crate::json::Value::Null,
12638            );
12639            return Err(msg);
12640        }
12641
12642        if !auth_store.iam_authorization_enabled() {
12643            self.inner.audit_log.record(
12644                action,
12645                username,
12646                resource_name,
12647                "ok",
12648                crate::json::Value::Null,
12649            );
12650            return Ok(());
12651        }
12652
12653        let mut resource = crate::auth::policies::ResourceRef::new(
12654            resource_kind.to_string(),
12655            resource_name.to_string(),
12656        );
12657        if let Some(t) = tenant {
12658            resource = resource.with_tenant(t.to_string());
12659        }
12660        let ctx = runtime_iam_context(
12661            role,
12662            tenant,
12663            auth_store.principal_is_system_owned(principal),
12664        );
12665        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
12666            self.inner.audit_log.record(
12667                action,
12668                username,
12669                resource_name,
12670                "ok",
12671                crate::json::Value::Null,
12672            );
12673            Ok(())
12674        } else {
12675            self.inner.audit_log.record(
12676                action,
12677                username,
12678                resource_name,
12679                "denied",
12680                crate::json::Value::Null,
12681            );
12682            Err(format!(
12683                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
12684                username, action, resource_kind, resource_name
12685            ))
12686        }
12687    }
12688
12689    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
12690    fn execute_grant_statement(
12691        &self,
12692        query: &str,
12693        stmt: &crate::storage::query::ast::GrantStmt,
12694    ) -> RedDBResult<RuntimeQueryResult> {
12695        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
12696        use crate::auth::UserId;
12697        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
12698
12699        let auth_store = self
12700            .inner
12701            .auth_store
12702            .read()
12703            .clone()
12704            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12705
12706        // Granter identity + role.
12707        let (gname, grole) = current_auth_identity().ok_or_else(|| {
12708            RedDBError::Query("GRANT requires an authenticated principal".to_string())
12709        })?;
12710        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
12711        let granter_role = grole;
12712
12713        // Build the action set.
12714        let mut actions: Vec<Action> = Vec::new();
12715        if stmt.all {
12716            actions.push(Action::All);
12717        } else {
12718            for kw in &stmt.actions {
12719                let a = Action::from_keyword(kw).ok_or_else(|| {
12720                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
12721                })?;
12722                actions.push(a);
12723            }
12724        }
12725
12726        // Audit emit (printed; structured emission is Agent #4's lane).
12727        let mut applied = 0usize;
12728        for obj in &stmt.objects {
12729            let resource = match stmt.object_kind {
12730                GrantObjectKind::Table => Resource::Table {
12731                    schema: obj.schema.clone(),
12732                    table: obj.name.clone(),
12733                },
12734                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
12735                GrantObjectKind::Database => Resource::Database,
12736                GrantObjectKind::Function => Resource::Function {
12737                    schema: obj.schema.clone(),
12738                    name: obj.name.clone(),
12739                },
12740            };
12741            for principal in &stmt.principals {
12742                let p = match principal {
12743                    GrantPrincipalRef::Public => GrantPrincipal::Public,
12744                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
12745                    GrantPrincipalRef::User { tenant, name } => {
12746                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
12747                    }
12748                };
12749                // Tenant of the grant follows the granter's tenant
12750                // (cross-tenant guard inside `AuthStore::grant`).
12751                let tenant = granter.tenant.clone();
12752                auth_store
12753                    .grant(
12754                        &granter,
12755                        granter_role,
12756                        p.clone(),
12757                        resource.clone(),
12758                        actions.clone(),
12759                        stmt.with_grant_option,
12760                        tenant.clone(),
12761                    )
12762                    .map_err(|e| RedDBError::Query(e.to_string()))?;
12763
12764                // IAM policy translation: every GRANT also lands as a
12765                // synthetic `_grant_<id>` policy attached to the
12766                // principal so the new evaluator sees it.
12767                if let Some(policy) =
12768                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
12769                {
12770                    let pid = policy.id.clone();
12771                    auth_store
12772                        .put_policy_internal(policy)
12773                        .map_err(|e| RedDBError::Query(e.to_string()))?;
12774                    let attachment = match &p {
12775                        GrantPrincipal::User(uid) => {
12776                            crate::auth::store::PrincipalRef::User(uid.clone())
12777                        }
12778                        GrantPrincipal::Group(group) => {
12779                            crate::auth::store::PrincipalRef::Group(group.clone())
12780                        }
12781                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
12782                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
12783                        ),
12784                    };
12785                    auth_store
12786                        .attach_policy(attachment, &pid)
12787                        .map_err(|e| RedDBError::Query(e.to_string()))?;
12788                }
12789                applied += 1;
12790                tracing::info!(
12791                    target: "audit",
12792                    principal = %granter,
12793                    action = "grant",
12794                    "GRANT applied"
12795                );
12796            }
12797        }
12798
12799        self.invalidate_result_cache();
12800        Ok(RuntimeQueryResult::ok_message(
12801            query.to_string(),
12802            &format!("GRANT applied to {} target(s)", applied),
12803            "grant",
12804        ))
12805    }
12806
12807    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
12808    fn execute_revoke_statement(
12809        &self,
12810        query: &str,
12811        stmt: &crate::storage::query::ast::RevokeStmt,
12812    ) -> RedDBResult<RuntimeQueryResult> {
12813        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
12814        use crate::auth::UserId;
12815        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
12816
12817        let auth_store = self
12818            .inner
12819            .auth_store
12820            .read()
12821            .clone()
12822            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12823
12824        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
12825            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
12826        })?;
12827        let granter_role = grole;
12828
12829        let actions: Vec<Action> = if stmt.all {
12830            vec![Action::All]
12831        } else {
12832            stmt.actions
12833                .iter()
12834                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
12835                .collect()
12836        };
12837
12838        let mut total_removed = 0usize;
12839        for obj in &stmt.objects {
12840            let resource = match stmt.object_kind {
12841                GrantObjectKind::Table => Resource::Table {
12842                    schema: obj.schema.clone(),
12843                    table: obj.name.clone(),
12844                },
12845                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
12846                GrantObjectKind::Database => Resource::Database,
12847                GrantObjectKind::Function => Resource::Function {
12848                    schema: obj.schema.clone(),
12849                    name: obj.name.clone(),
12850                },
12851            };
12852            for principal in &stmt.principals {
12853                let p = match principal {
12854                    GrantPrincipalRef::Public => GrantPrincipal::Public,
12855                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
12856                    GrantPrincipalRef::User { tenant, name } => {
12857                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
12858                    }
12859                };
12860                let removed = auth_store
12861                    .revoke(granter_role, &p, &resource, &actions)
12862                    .map_err(|e| RedDBError::Query(e.to_string()))?;
12863                let _removed_policies =
12864                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
12865                total_removed += removed;
12866            }
12867        }
12868
12869        self.invalidate_result_cache();
12870        Ok(RuntimeQueryResult::ok_message(
12871            query.to_string(),
12872            &format!("REVOKE removed {} grant(s)", total_removed),
12873            "revoke",
12874        ))
12875    }
12876
12877    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
12878    fn execute_alter_user_statement(
12879        &self,
12880        query: &str,
12881        stmt: &crate::storage::query::ast::AlterUserStmt,
12882    ) -> RedDBResult<RuntimeQueryResult> {
12883        use crate::auth::privileges::UserAttributes;
12884        use crate::auth::UserId;
12885        use crate::storage::query::ast::AlterUserAttribute;
12886
12887        let auth_store = self
12888            .inner
12889            .auth_store
12890            .read()
12891            .clone()
12892            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12893
12894        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
12895            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
12896        })?;
12897        if grole != crate::auth::Role::Admin {
12898            return Err(RedDBError::Query(
12899                "ALTER USER requires Admin role".to_string(),
12900            ));
12901        }
12902
12903        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
12904
12905        // Apply attributes incrementally — each one reads the current
12906        // record, mutates the relevant field, writes back.
12907        let mut attrs = auth_store.user_attributes(&target);
12908        let mut enable_change: Option<bool> = None;
12909
12910        for a in &stmt.attributes {
12911            match a {
12912                AlterUserAttribute::ValidUntil(ts) => {
12913                    // Parse ISO-ish timestamp → ms since epoch. Fall
12914                    // back to integer-ms parsing for callers that pass
12915                    // `'1234567890123'`.
12916                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
12917                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
12918                    })?;
12919                    attrs.valid_until = Some(ms);
12920                }
12921                AlterUserAttribute::ConnectionLimit(n) => {
12922                    if *n < 0 {
12923                        return Err(RedDBError::Query(
12924                            "CONNECTION LIMIT must be non-negative".to_string(),
12925                        ));
12926                    }
12927                    attrs.connection_limit = Some(*n as u32);
12928                }
12929                AlterUserAttribute::SetSearchPath(p) => {
12930                    attrs.search_path = Some(p.clone());
12931                }
12932                AlterUserAttribute::AddGroup(g) => {
12933                    if !attrs.groups.iter().any(|existing| existing == g) {
12934                        attrs.groups.push(g.clone());
12935                        attrs.groups.sort();
12936                    }
12937                }
12938                AlterUserAttribute::DropGroup(g) => {
12939                    attrs.groups.retain(|existing| existing != g);
12940                }
12941                AlterUserAttribute::Enable => enable_change = Some(true),
12942                AlterUserAttribute::Disable => enable_change = Some(false),
12943                AlterUserAttribute::Password(_) => {
12944                    // Out of scope — accept the AST but no-op so the
12945                    // parser stays compatible with future password
12946                    // rotation work.
12947                }
12948            }
12949        }
12950
12951        auth_store
12952            .set_user_attributes(&target, attrs)
12953            .map_err(|e| RedDBError::Query(e.to_string()))?;
12954        if let Some(en) = enable_change {
12955            auth_store
12956                .set_user_enabled(&target, en)
12957                .map_err(|e| RedDBError::Query(e.to_string()))?;
12958        }
12959        self.invalidate_result_cache();
12960        tracing::info!(
12961            target: "audit",
12962            principal = %target,
12963            action = "alter_user",
12964            "ALTER USER applied"
12965        );
12966
12967        Ok(RuntimeQueryResult::ok_message(
12968            query.to_string(),
12969            &format!("ALTER USER {} applied", target),
12970            "alter_user",
12971        ))
12972    }
12973
12974    // -----------------------------------------------------------------
12975    // IAM policy executors
12976    // -----------------------------------------------------------------
12977
12978    fn execute_create_iam_policy(
12979        &self,
12980        query: &str,
12981        id: &str,
12982        json: &str,
12983    ) -> RedDBResult<RuntimeQueryResult> {
12984        use crate::auth::policies::Policy;
12985
12986        let auth_store = self
12987            .inner
12988            .auth_store
12989            .read()
12990            .clone()
12991            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
12992
12993        // Parse + validate. The kernel rejects oversize / bad shape /
12994        // bad action keywords. If the supplied id differs from the JSON
12995        // id, override it with the SQL-provided id (the JSON id is
12996        // optional context — the SQL DDL form is authoritative).
12997        let mut policy = Policy::from_json_str(json)
12998            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
12999        if policy.id != id {
13000            policy.id = id.to_string();
13001        }
13002        let pid = policy.id.clone();
13003        let tenant = current_tenant();
13004        let (actor_name, actor_role) = current_auth_identity()
13005            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
13006        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
13007        let eval_ctx = runtime_iam_context(
13008            actor_role,
13009            tenant.as_deref(),
13010            auth_store.principal_is_system_owned(&actor),
13011        );
13012        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
13013        let ledger = self.inner.control_event_ledger.read();
13014        let control = crate::auth::store::PolicyMutationControl {
13015            ctx: &event_ctx,
13016            ledger: ledger.as_ref(),
13017            config: self.inner.control_event_config,
13018            registry: Some(self.inner.config_registry.as_ref()),
13019            actor: &actor,
13020            eval_ctx: &eval_ctx,
13021        };
13022        auth_store
13023            .put_policy_with_control_events(policy, &control)
13024            .map_err(|e| RedDBError::Query(e.to_string()))?;
13025
13026        let principal = actor_name;
13027        tracing::info!(
13028            target: "audit",
13029            principal = %principal,
13030            action = "iam:policy.put",
13031            matched_policy_id = %pid,
13032            "CREATE POLICY applied"
13033        );
13034        self.inner.audit_log.record(
13035            "iam/policy.put",
13036            &principal,
13037            &pid,
13038            "ok",
13039            crate::json::Value::Null,
13040        );
13041
13042        self.invalidate_result_cache();
13043        Ok(RuntimeQueryResult::ok_message(
13044            query.to_string(),
13045            &format!("policy `{pid}` stored"),
13046            "create_iam_policy",
13047        ))
13048    }
13049
13050    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
13051        let auth_store = self
13052            .inner
13053            .auth_store
13054            .read()
13055            .clone()
13056            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13057        let tenant = current_tenant();
13058        let (actor_name, actor_role) = current_auth_identity()
13059            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
13060        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
13061        let eval_ctx = runtime_iam_context(
13062            actor_role,
13063            tenant.as_deref(),
13064            auth_store.principal_is_system_owned(&actor),
13065        );
13066        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
13067        let ledger = self.inner.control_event_ledger.read();
13068        let control = crate::auth::store::PolicyMutationControl {
13069            ctx: &event_ctx,
13070            ledger: ledger.as_ref(),
13071            config: self.inner.control_event_config,
13072            registry: Some(self.inner.config_registry.as_ref()),
13073            actor: &actor,
13074            eval_ctx: &eval_ctx,
13075        };
13076        auth_store
13077            .delete_policy_with_control_events(id, &control)
13078            .map_err(|e| RedDBError::Query(e.to_string()))?;
13079
13080        let principal = actor_name;
13081        tracing::info!(
13082            target: "audit",
13083            principal = %principal,
13084            action = "iam:policy.drop",
13085            matched_policy_id = %id,
13086            "DROP POLICY applied"
13087        );
13088        self.inner.audit_log.record(
13089            "iam/policy.drop",
13090            &principal,
13091            id,
13092            "ok",
13093            crate::json::Value::Null,
13094        );
13095
13096        self.invalidate_result_cache();
13097        Ok(RuntimeQueryResult::ok_message(
13098            query.to_string(),
13099            &format!("policy `{id}` dropped"),
13100            "drop_iam_policy",
13101        ))
13102    }
13103
13104    fn execute_attach_policy(
13105        &self,
13106        query: &str,
13107        policy_id: &str,
13108        principal: &crate::storage::query::ast::PolicyPrincipalRef,
13109    ) -> RedDBResult<RuntimeQueryResult> {
13110        use crate::auth::store::PrincipalRef;
13111        use crate::auth::UserId;
13112        use crate::storage::query::ast::PolicyPrincipalRef;
13113
13114        let auth_store = self
13115            .inner
13116            .auth_store
13117            .read()
13118            .clone()
13119            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13120        let p = match principal {
13121            PolicyPrincipalRef::User(u) => {
13122                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
13123            }
13124            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
13125        };
13126        let pretty_target = principal_label(principal);
13127        let tenant = current_tenant();
13128        let (actor_name, actor_role) = current_auth_identity()
13129            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
13130        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
13131        let eval_ctx = runtime_iam_context(
13132            actor_role,
13133            tenant.as_deref(),
13134            auth_store.principal_is_system_owned(&actor),
13135        );
13136        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
13137        let ledger = self.inner.control_event_ledger.read();
13138        let control = crate::auth::store::PolicyMutationControl {
13139            ctx: &event_ctx,
13140            ledger: ledger.as_ref(),
13141            config: self.inner.control_event_config,
13142            registry: Some(self.inner.config_registry.as_ref()),
13143            actor: &actor,
13144            eval_ctx: &eval_ctx,
13145        };
13146        auth_store
13147            .attach_policy_with_control_events(p, policy_id, &control)
13148            .map_err(|e| RedDBError::Query(e.to_string()))?;
13149
13150        let principal_str = actor_name;
13151        tracing::info!(
13152            target: "audit",
13153            principal = %principal_str,
13154            action = "iam:policy.attach",
13155            matched_policy_id = %policy_id,
13156            target = %pretty_target,
13157            "ATTACH POLICY applied"
13158        );
13159        self.inner.audit_log.record(
13160            "iam/policy.attach",
13161            &principal_str,
13162            &pretty_target,
13163            "ok",
13164            crate::json::Value::Null,
13165        );
13166
13167        self.invalidate_result_cache();
13168        Ok(RuntimeQueryResult::ok_message(
13169            query.to_string(),
13170            &format!("policy `{policy_id}` attached to {pretty_target}"),
13171            "attach_policy",
13172        ))
13173    }
13174
13175    fn execute_detach_policy(
13176        &self,
13177        query: &str,
13178        policy_id: &str,
13179        principal: &crate::storage::query::ast::PolicyPrincipalRef,
13180    ) -> RedDBResult<RuntimeQueryResult> {
13181        use crate::auth::store::PrincipalRef;
13182        use crate::auth::UserId;
13183        use crate::storage::query::ast::PolicyPrincipalRef;
13184
13185        let auth_store = self
13186            .inner
13187            .auth_store
13188            .read()
13189            .clone()
13190            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13191        let p = match principal {
13192            PolicyPrincipalRef::User(u) => {
13193                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
13194            }
13195            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
13196        };
13197        let pretty_target = principal_label(principal);
13198        let tenant = current_tenant();
13199        let (actor_name, actor_role) = current_auth_identity()
13200            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
13201        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
13202        let eval_ctx = runtime_iam_context(
13203            actor_role,
13204            tenant.as_deref(),
13205            auth_store.principal_is_system_owned(&actor),
13206        );
13207        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
13208        let ledger = self.inner.control_event_ledger.read();
13209        let control = crate::auth::store::PolicyMutationControl {
13210            ctx: &event_ctx,
13211            ledger: ledger.as_ref(),
13212            config: self.inner.control_event_config,
13213            registry: Some(self.inner.config_registry.as_ref()),
13214            actor: &actor,
13215            eval_ctx: &eval_ctx,
13216        };
13217        auth_store
13218            .detach_policy_with_control_events(p, policy_id, &control)
13219            .map_err(|e| RedDBError::Query(e.to_string()))?;
13220
13221        let principal_str = actor_name;
13222        tracing::info!(
13223            target: "audit",
13224            principal = %principal_str,
13225            action = "iam:policy.detach",
13226            matched_policy_id = %policy_id,
13227            target = %pretty_target,
13228            "DETACH POLICY applied"
13229        );
13230        self.inner.audit_log.record(
13231            "iam/policy.detach",
13232            &principal_str,
13233            &pretty_target,
13234            "ok",
13235            crate::json::Value::Null,
13236        );
13237
13238        self.invalidate_result_cache();
13239        Ok(RuntimeQueryResult::ok_message(
13240            query.to_string(),
13241            &format!("policy `{policy_id}` detached from {pretty_target}"),
13242            "detach_policy",
13243        ))
13244    }
13245
13246    fn execute_show_policies(
13247        &self,
13248        query: &str,
13249        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
13250    ) -> RedDBResult<RuntimeQueryResult> {
13251        use crate::auth::UserId;
13252        use crate::storage::query::ast::PolicyPrincipalRef;
13253        use crate::storage::query::unified::UnifiedRecord;
13254        use crate::storage::schema::Value as SchemaValue;
13255        use std::sync::Arc;
13256
13257        let auth_store = self
13258            .inner
13259            .auth_store
13260            .read()
13261            .clone()
13262            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13263
13264        let pols = match filter {
13265            None => auth_store.list_policies(),
13266            Some(PolicyPrincipalRef::User(u)) => {
13267                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
13268                auth_store.effective_policies(&id)
13269            }
13270            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
13271        };
13272
13273        let mut records = Vec::with_capacity(pols.len() + 1);
13274
13275        // Header row (#712 / S5A): synthetic record at index 0 that
13276        // reports the active PolicyEnforcementMode and the hard-cutover
13277        // version, so an operator running SHOW POLICIES can see the
13278        // current posture without a separate command.
13279        let mode = auth_store.enforcement_mode();
13280        let mut header = UnifiedRecord::default();
13281        header.set_arc(
13282            Arc::from("id"),
13283            SchemaValue::text("<enforcement_mode>".to_string()),
13284        );
13285        header.set_arc(Arc::from("statements"), SchemaValue::Integer(0));
13286        header.set_arc(Arc::from("tenant"), SchemaValue::Null);
13287        let header_json = format!(
13288            r#"{{"enforcement_mode":"{}","policy_only_hard_version":"{}"}}"#,
13289            mode.as_str(),
13290            crate::auth::enforcement_mode::POLICY_ONLY_HARD_VERSION
13291        );
13292        header.set_arc(Arc::from("json"), SchemaValue::text(header_json));
13293        records.push(header);
13294
13295        for p in pols.iter() {
13296            let mut rec = UnifiedRecord::default();
13297            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
13298            rec.set_arc(
13299                Arc::from("statements"),
13300                SchemaValue::Integer(p.statements.len() as i64),
13301            );
13302            rec.set_arc(
13303                Arc::from("tenant"),
13304                p.tenant
13305                    .as_deref()
13306                    .map(|t| SchemaValue::text(t.to_string()))
13307                    .unwrap_or(SchemaValue::Null),
13308            );
13309            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
13310            records.push(rec);
13311        }
13312        let mut result = crate::storage::query::unified::UnifiedResult::empty();
13313        result.records = records;
13314        Ok(RuntimeQueryResult {
13315            query: query.to_string(),
13316            mode: crate::storage::query::modes::QueryMode::Sql,
13317            statement: "show_policies",
13318            engine: "iam-policies",
13319            result,
13320            affected_rows: 0,
13321            statement_type: "select",
13322            bookmark: None,
13323        })
13324    }
13325
13326    fn execute_show_effective_permissions(
13327        &self,
13328        query: &str,
13329        user: &crate::storage::query::ast::PolicyUserRef,
13330        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
13331    ) -> RedDBResult<RuntimeQueryResult> {
13332        use crate::auth::UserId;
13333        use crate::storage::query::unified::UnifiedRecord;
13334        use crate::storage::schema::Value as SchemaValue;
13335        use std::sync::Arc;
13336
13337        let auth_store = self
13338            .inner
13339            .auth_store
13340            .read()
13341            .clone()
13342            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13343        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
13344        let pols = auth_store.effective_policies(&id);
13345
13346        // Show one row per (policy, statement) tuple, plus any
13347        // resource-level filter passed by the caller.
13348        let mut records = Vec::new();
13349        for p in pols.iter() {
13350            for (idx, st) in p.statements.iter().enumerate() {
13351                if let Some(_r) = resource {
13352                    // Naive filter: render statement targets to strings
13353                    // and skip if no match. Conservative default = include
13354                    // (the simulator handles fine-grained matching).
13355                }
13356                let mut rec = UnifiedRecord::default();
13357                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
13358                rec.set_arc(
13359                    Arc::from("statement_index"),
13360                    SchemaValue::Integer(idx as i64),
13361                );
13362                rec.set_arc(
13363                    Arc::from("sid"),
13364                    st.sid
13365                        .as_deref()
13366                        .map(|s| SchemaValue::text(s.to_string()))
13367                        .unwrap_or(SchemaValue::Null),
13368                );
13369                rec.set_arc(
13370                    Arc::from("effect"),
13371                    SchemaValue::text(match st.effect {
13372                        crate::auth::policies::Effect::Allow => "allow",
13373                        crate::auth::policies::Effect::Deny => "deny",
13374                    }),
13375                );
13376                rec.set_arc(
13377                    Arc::from("actions"),
13378                    SchemaValue::Integer(st.actions.len() as i64),
13379                );
13380                rec.set_arc(
13381                    Arc::from("resources"),
13382                    SchemaValue::Integer(st.resources.len() as i64),
13383                );
13384                records.push(rec);
13385            }
13386        }
13387        let mut result = crate::storage::query::unified::UnifiedResult::empty();
13388        result.records = records;
13389        Ok(RuntimeQueryResult {
13390            query: query.to_string(),
13391            mode: crate::storage::query::modes::QueryMode::Sql,
13392            statement: "show_effective_permissions",
13393            engine: "iam-policies",
13394            result,
13395            affected_rows: 0,
13396            statement_type: "select",
13397            bookmark: None,
13398        })
13399    }
13400
13401    fn execute_lint_policy(
13402        &self,
13403        query: &str,
13404        source: &crate::storage::query::ast::LintPolicySource,
13405    ) -> RedDBResult<RuntimeQueryResult> {
13406        use crate::auth::policy_linter::lint;
13407        use crate::storage::query::ast::LintPolicySource;
13408        use crate::storage::query::unified::UnifiedRecord;
13409        use crate::storage::schema::Value as SchemaValue;
13410        use std::sync::Arc;
13411
13412        // Resolve the policy text. `JSON` source lints the literal
13413        // verbatim; `Id` source fetches the stored document so
13414        // operators can lint a policy by name without rebuilding the
13415        // JSON from `SHOW POLICY`.
13416        let policy_text = match source {
13417            LintPolicySource::Json(text) => text.clone(),
13418            LintPolicySource::Id(id) => {
13419                let auth_store =
13420                    self.inner.auth_store.read().clone().ok_or_else(|| {
13421                        RedDBError::Query("auth store not configured".to_string())
13422                    })?;
13423                let policy = auth_store
13424                    .get_policy(id)
13425                    .ok_or_else(|| RedDBError::Query(format!("policy `{id}` not found")))?;
13426                policy.to_json_string()
13427            }
13428        };
13429        let diagnostics = lint(&policy_text);
13430
13431        let principal_str = current_auth_identity()
13432            .map(|(u, _)| u)
13433            .unwrap_or_else(|| "anonymous".into());
13434        tracing::info!(
13435            target: "audit",
13436            principal = %principal_str,
13437            action = "iam:policy.lint",
13438            diagnostic_count = diagnostics.len(),
13439            "LINT POLICY issued"
13440        );
13441        self.inner.audit_log.record(
13442            "iam/policy.lint",
13443            &principal_str,
13444            match source {
13445                LintPolicySource::Id(id) => id.as_str(),
13446                LintPolicySource::Json(_) => "<json>",
13447            },
13448            "ok",
13449            crate::json::Value::Null,
13450        );
13451
13452        // One row per diagnostic. Column order matches the HTTP
13453        // surface's JSON keys so the two contracts line up.
13454        const COLUMNS: [&str; 5] = ["severity", "code", "message", "suggested_fix", "location"];
13455        let schema = Arc::new(
13456            COLUMNS
13457                .iter()
13458                .map(|name| Arc::<str>::from(*name))
13459                .collect::<Vec<_>>(),
13460        );
13461        let records: Vec<UnifiedRecord> = diagnostics
13462            .iter()
13463            .map(|d| {
13464                UnifiedRecord::with_schema(
13465                    Arc::clone(&schema),
13466                    vec![
13467                        SchemaValue::text(d.severity.as_str()),
13468                        SchemaValue::text(d.code.as_str()),
13469                        SchemaValue::text(d.message.clone()),
13470                        d.suggested_fix
13471                            .as_deref()
13472                            .map(SchemaValue::text)
13473                            .unwrap_or(SchemaValue::Null),
13474                        d.location
13475                            .as_deref()
13476                            .map(SchemaValue::text)
13477                            .unwrap_or(SchemaValue::Null),
13478                    ],
13479                )
13480            })
13481            .collect();
13482        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
13483            COLUMNS.iter().map(|c| c.to_string()).collect(),
13484        );
13485        result.records = records;
13486        Ok(RuntimeQueryResult {
13487            query: query.to_string(),
13488            mode: crate::storage::query::modes::QueryMode::Sql,
13489            statement: "lint_policy",
13490            engine: "iam-policies",
13491            result,
13492            affected_rows: 0,
13493            statement_type: "select",
13494            bookmark: None,
13495        })
13496    }
13497
13498    /// `MIGRATE POLICY MODE TO '<target>' [DRY RUN]` — flip the install
13499    /// from `legacy_rbac` to `policy_only` after the pre-flight delta
13500    /// simulator confirms no non-admin principal would lose access.
13501    /// Issue #714.
13502    fn execute_migrate_policy_mode(
13503        &self,
13504        query: &str,
13505        target: &str,
13506        dry_run: bool,
13507    ) -> RedDBResult<RuntimeQueryResult> {
13508        use crate::auth::enforcement_mode::PolicyEnforcementMode;
13509        use crate::auth::migrate_policy_mode::{
13510            principal_label, simulate_migration_delta, MigratePolicyDelta,
13511        };
13512        use crate::auth::policies::ResourceRef;
13513        use crate::storage::query::unified::UnifiedRecord;
13514        use crate::storage::schema::Value as SchemaValue;
13515        use std::sync::Arc;
13516
13517        // Only `policy_only` is a meaningful destination for this
13518        // command — flipping back to `legacy_rbac` is supported via
13519        // direct config writes (it doesn't need a pre-flight). We
13520        // reject everything else with the same allowlist `parse` uses.
13521        let parsed = PolicyEnforcementMode::parse(target).ok_or_else(|| {
13522            RedDBError::Query(format!(
13523                "MIGRATE POLICY MODE: invalid target `{target}` (expected `policy_only`)"
13524            ))
13525        })?;
13526        if parsed != PolicyEnforcementMode::PolicyOnly {
13527            return Err(RedDBError::Query(format!(
13528                "MIGRATE POLICY MODE: target `{target}` is not supported — only `policy_only` may be migrated to via this command"
13529            )));
13530        }
13531
13532        let auth_store = self
13533            .inner
13534            .auth_store
13535            .read()
13536            .clone()
13537            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13538
13539        // Resource enumeration: every existing collection probed as
13540        // `table:<name>`. This is the realistic resource surface for
13541        // the legacy_rbac fallback (the role floors gate per-table
13542        // actions). Wildcard / column-scoped resources are still
13543        // covered by the policy evaluator because evaluate() resolves
13544        // resource patterns relative to the concrete resources we
13545        // probe here.
13546        let snapshot = self.inner.db.catalog_model_snapshot();
13547        let resources: Vec<ResourceRef> = snapshot
13548            .collections
13549            .iter()
13550            .map(|c| ResourceRef::new("table", c.name.clone()))
13551            .collect();
13552
13553        let now_ms = crate::utils::now_unix_millis() as u128;
13554        let deltas: Vec<MigratePolicyDelta> =
13555            simulate_migration_delta(auth_store.as_ref(), &resources, now_ms);
13556
13557        let principal_str = current_auth_identity()
13558            .map(|(u, _)| u)
13559            .unwrap_or_else(|| "anonymous".into());
13560
13561        // Audit every issuance. The outcome line differentiates
13562        // dry-run, refused, and applied — operators can grep for these
13563        // strings in the audit log.
13564        let outcome_str = if dry_run {
13565            "dry_run"
13566        } else if deltas.is_empty() {
13567            "applied"
13568        } else {
13569            "refused"
13570        };
13571        tracing::info!(
13572            target: "audit",
13573            principal = %principal_str,
13574            action = "iam:policy.migrate_mode",
13575            target = %target,
13576            dry_run,
13577            delta_count = deltas.len(),
13578            outcome = outcome_str,
13579            "MIGRATE POLICY MODE issued"
13580        );
13581        self.inner.audit_log.record(
13582            "iam/policy.migrate_mode",
13583            &principal_str,
13584            target,
13585            outcome_str,
13586            crate::json::Value::Null,
13587        );
13588
13589        // Refuse the non-dry-run path when any principal would lose
13590        // access. The error string carries a compact summary plus the
13591        // delta count so operators can re-run with DRY RUN to inspect.
13592        if !dry_run && !deltas.is_empty() {
13593            let summary = deltas
13594                .iter()
13595                .take(5)
13596                .map(|d| {
13597                    format!(
13598                        "{}:{}/{}:{}",
13599                        principal_label(&d.principal),
13600                        d.action,
13601                        d.resource_kind,
13602                        d.resource_name
13603                    )
13604                })
13605                .collect::<Vec<_>>()
13606                .join(", ");
13607            let more = if deltas.len() > 5 {
13608                format!(" (and {} more)", deltas.len() - 5)
13609            } else {
13610                String::new()
13611            };
13612            return Err(RedDBError::Query(format!(
13613                "MIGRATE POLICY MODE refused: {n} principal/action/resource pair(s) would lose access under `policy_only`. Run `MIGRATE POLICY MODE TO '{target}' DRY RUN` to inspect. Sample: {summary}{more}",
13614                n = deltas.len(),
13615            )));
13616        }
13617
13618        // Mutate the live enforcement mode only on the non-dry-run
13619        // path with an empty delta. `set_enforcement_mode` also
13620        // persists to vault_kv so the new mode survives restart.
13621        if !dry_run {
13622            auth_store.set_enforcement_mode(parsed);
13623        }
13624
13625        const COLUMNS: [&str; 5] = [
13626            "principal",
13627            "role",
13628            "action",
13629            "resource_kind",
13630            "resource_name",
13631        ];
13632        let schema = Arc::new(
13633            COLUMNS
13634                .iter()
13635                .map(|name| Arc::<str>::from(*name))
13636                .collect::<Vec<_>>(),
13637        );
13638        let records: Vec<UnifiedRecord> = deltas
13639            .iter()
13640            .map(|d| {
13641                UnifiedRecord::with_schema(
13642                    Arc::clone(&schema),
13643                    vec![
13644                        SchemaValue::text(principal_label(&d.principal)),
13645                        SchemaValue::text(d.role.as_str()),
13646                        SchemaValue::text(d.action.clone()),
13647                        SchemaValue::text(d.resource_kind.clone()),
13648                        SchemaValue::text(d.resource_name.clone()),
13649                    ],
13650                )
13651            })
13652            .collect();
13653        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
13654            COLUMNS.iter().map(|c| c.to_string()).collect(),
13655        );
13656        result.records = records;
13657        Ok(RuntimeQueryResult {
13658            query: query.to_string(),
13659            mode: crate::storage::query::modes::QueryMode::Sql,
13660            statement: "migrate_policy_mode",
13661            engine: "iam-policies",
13662            result,
13663            affected_rows: 0,
13664            statement_type: "select",
13665            bookmark: None,
13666        })
13667    }
13668
13669    fn execute_simulate_policy(
13670        &self,
13671        query: &str,
13672        user: &crate::storage::query::ast::PolicyUserRef,
13673        action: &str,
13674        resource: &crate::storage::query::ast::PolicyResourceRef,
13675    ) -> RedDBResult<RuntimeQueryResult> {
13676        use crate::auth::policies::ResourceRef;
13677        use crate::auth::store::SimCtx;
13678        use crate::auth::UserId;
13679        use crate::storage::query::unified::UnifiedRecord;
13680        use crate::storage::schema::Value as SchemaValue;
13681        use std::sync::Arc;
13682
13683        let auth_store = self
13684            .inner
13685            .auth_store
13686            .read()
13687            .clone()
13688            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
13689        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
13690        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
13691        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
13692
13693        let principal_str = current_auth_identity()
13694            .map(|(u, _)| u)
13695            .unwrap_or_else(|| "anonymous".into());
13696        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
13697        tracing::info!(
13698            target: "audit",
13699            principal = %principal_str,
13700            action = "iam:policy.simulate",
13701            decision = %decision_str,
13702            matched_policy_id = ?matched_pid,
13703            matched_sid = ?matched_sid,
13704            "SIMULATE issued"
13705        );
13706        self.inner.audit_log.record(
13707            "iam/policy.simulate",
13708            &principal_str,
13709            &id.to_string(),
13710            "ok",
13711            crate::json::Value::Null,
13712        );
13713
13714        let mut rec = UnifiedRecord::default();
13715        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
13716        rec.set_arc(
13717            Arc::from("matched_policy_id"),
13718            matched_pid
13719                .map(SchemaValue::text)
13720                .unwrap_or(SchemaValue::Null),
13721        );
13722        rec.set_arc(
13723            Arc::from("matched_sid"),
13724            matched_sid
13725                .map(SchemaValue::text)
13726                .unwrap_or(SchemaValue::Null),
13727        );
13728        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
13729        rec.set_arc(
13730            Arc::from("trail_len"),
13731            SchemaValue::Integer(outcome.trail.len() as i64),
13732        );
13733        let mut result = crate::storage::query::unified::UnifiedResult::empty();
13734        result.records = vec![rec];
13735        Ok(RuntimeQueryResult {
13736            query: query.to_string(),
13737            mode: crate::storage::query::modes::QueryMode::Sql,
13738            statement: "simulate_policy",
13739            engine: "iam-policies",
13740            result,
13741            affected_rows: 0,
13742            statement_type: "select",
13743            bookmark: None,
13744        })
13745    }
13746}
13747
13748/// Translate a parsed GRANT into a synthetic IAM policy whose id
13749/// starts with `_grant_<unique>`. PUBLIC is represented as an
13750/// implicit IAM group; legacy GROUP grants are still rejected by the
13751/// grant store and are not translated here.
13752fn grant_to_iam_policy(
13753    principal: &crate::auth::privileges::GrantPrincipal,
13754    resource: &crate::auth::privileges::Resource,
13755    actions: &[crate::auth::privileges::Action],
13756    tenant: Option<&str>,
13757) -> Option<crate::auth::policies::Policy> {
13758    use crate::auth::policies::{
13759        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
13760    };
13761    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
13762
13763    if matches!(principal, GrantPrincipal::Group(_)) {
13764        return None;
13765    }
13766
13767    let now = crate::auth::now_ms();
13768    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
13769
13770    let resource_str = match resource {
13771        Resource::Database => "table:*".to_string(),
13772        Resource::Schema(s) => format!("table:{s}.*"),
13773        Resource::Table { schema, table } => match schema {
13774            Some(s) => format!("table:{s}.{table}"),
13775            None => format!("table:{table}"),
13776        },
13777        Resource::Function { schema, name } => match schema {
13778            Some(s) => format!("function:{s}.{name}"),
13779            None => format!("function:{name}"),
13780        },
13781    };
13782
13783    // Compile actions — fall back to `*` only when the grant included
13784    // `Action::All`. Map every other action keyword to its lowercase
13785    // form so it lines up with the kernel's allowlist.
13786    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
13787        vec![ActionPattern::Wildcard]
13788    } else {
13789        actions
13790            .iter()
13791            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
13792            .collect()
13793    };
13794    if action_patterns.is_empty() {
13795        return None;
13796    }
13797
13798    // Inline resource compilation matching the kernel's `compile_resource`:
13799    //   * `*` → wildcard
13800    //   * contains `*` → glob
13801    //   * `kind:name` → exact
13802    let resource_patterns = if resource_str == "*" {
13803        vec![ResourcePattern::Wildcard]
13804    } else if resource_str.contains('*') {
13805        vec![ResourcePattern::Glob(resource_str.clone())]
13806    } else if let Some((kind, name)) = resource_str.split_once(':') {
13807        vec![ResourcePattern::Exact {
13808            kind: kind.to_string(),
13809            name: name.to_string(),
13810        }]
13811    } else {
13812        vec![ResourcePattern::Wildcard]
13813    };
13814
13815    let policy = Policy {
13816        id,
13817        version: 1,
13818        tenant: tenant.map(|t| t.to_string()),
13819        created_at: now,
13820        updated_at: now,
13821        statements: vec![Statement {
13822            sid: None,
13823            effect: Effect::Allow,
13824            actions: action_patterns,
13825            resources: resource_patterns,
13826            condition: None,
13827        }],
13828    };
13829    if policy.validate().is_err() {
13830        return None;
13831    }
13832    Some(policy)
13833}
13834
13835/// Coerce a `key => <number>` table-function named argument into a positive
13836/// iteration count for the centrality TVFs (issue #797). The parser lexes all
13837/// named values as `f64`, so an integral, finite, strictly-positive value is
13838/// required here; anything else (fractional, zero, negative, NaN/inf) is a
13839/// clear query error. `func` names the function for the message.
13840fn parse_positive_iterations(func: &str, value: &f64) -> RedDBResult<usize> {
13841    if !value.is_finite() || *value < 1.0 || value.fract() != 0.0 {
13842        return Err(RedDBError::Query(format!(
13843            "table function '{func}' max_iterations must be a positive integer, got {value}"
13844        )));
13845    }
13846    Ok(*value as usize)
13847}
13848
13849fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
13850    use crate::auth::privileges::Action;
13851    match action {
13852        Action::Select => "select",
13853        Action::Insert => "insert",
13854        Action::Update => "update",
13855        Action::Delete => "delete",
13856        Action::Truncate => "truncate",
13857        Action::References => "references",
13858        Action::Execute => "execute",
13859        Action::Usage => "usage",
13860        Action::All => "*",
13861    }
13862}
13863
13864fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
13865    let mut columns = Vec::new();
13866    for (column, _) in &query.assignment_exprs {
13867        if !columns.iter().any(|seen| seen == column) {
13868            columns.push(column.clone());
13869        }
13870    }
13871    columns
13872}
13873
13874fn column_access_request_for_table_update(
13875    table_name: &str,
13876    columns: Vec<String>,
13877) -> crate::auth::ColumnAccessRequest {
13878    match table_name.split_once('.') {
13879        Some((schema, table)) => {
13880            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
13881                .with_schema(schema.to_string())
13882        }
13883        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
13884    }
13885}
13886
13887fn column_access_request_for_table_select(
13888    table_name: &str,
13889    columns: Vec<String>,
13890) -> crate::auth::ColumnAccessRequest {
13891    match table_name.split_once('.') {
13892        Some((schema, table)) => {
13893            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
13894                .with_schema(schema.to_string())
13895        }
13896        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
13897    }
13898}
13899
13900fn update_returning_columns_for_policy(
13901    runtime: &RedDBRuntime,
13902    query: &crate::storage::query::ast::UpdateQuery,
13903) -> Option<Vec<String>> {
13904    let items = query.returning.as_ref()?;
13905    let mut columns = Vec::new();
13906    let project_all = items
13907        .iter()
13908        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
13909    if project_all {
13910        collect_returning_star_columns(runtime, query, &mut columns);
13911    } else {
13912        for item in items {
13913            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
13914                continue;
13915            };
13916            push_returning_policy_column(&mut columns, column);
13917        }
13918    }
13919    (!columns.is_empty()).then_some(columns)
13920}
13921
13922fn collect_returning_star_columns(
13923    runtime: &RedDBRuntime,
13924    query: &crate::storage::query::ast::UpdateQuery,
13925    columns: &mut Vec<String>,
13926) {
13927    let store = runtime.db().store();
13928    let Some(manager) = store.get_collection(&query.table) else {
13929        return;
13930    };
13931    if let Some(schema) = manager.column_schema() {
13932        for column in schema.iter() {
13933            push_returning_policy_column(columns, column);
13934        }
13935    }
13936    for entity in manager.query_all(|_| true) {
13937        if !returning_entity_matches_update_target(&entity, query.target) {
13938            continue;
13939        }
13940        match &entity.data {
13941            crate::storage::EntityData::Row(row) => {
13942                for (column, _) in row.iter_fields() {
13943                    push_returning_policy_column(columns, column);
13944                }
13945            }
13946            crate::storage::EntityData::Node(node) => {
13947                push_returning_policy_column(columns, "label");
13948                push_returning_policy_column(columns, "node_type");
13949                for column in node.properties.keys() {
13950                    push_returning_policy_column(columns, column);
13951                }
13952            }
13953            crate::storage::EntityData::Edge(edge) => {
13954                push_returning_policy_column(columns, "label");
13955                push_returning_policy_column(columns, "from_rid");
13956                push_returning_policy_column(columns, "to_rid");
13957                push_returning_policy_column(columns, "weight");
13958                for column in edge.properties.keys() {
13959                    push_returning_policy_column(columns, column);
13960                }
13961            }
13962            _ => {}
13963        }
13964    }
13965}
13966
13967fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
13968    if returning_public_envelope_column(column) {
13969        return;
13970    }
13971    if !columns.iter().any(|seen| seen == column) {
13972        columns.push(column.to_string());
13973    }
13974}
13975
/// Report whether `column` is one of the envelope column names that
/// `push_returning_policy_column` leaves out. The comparison ignores
/// ASCII case.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 7] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
        "red_entity_id",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|reserved| column.eq_ignore_ascii_case(reserved))
}
13982
13983fn returning_entity_matches_update_target(
13984    entity: &crate::storage::UnifiedEntity,
13985    target: crate::storage::query::ast::UpdateTarget,
13986) -> bool {
13987    use crate::storage::query::ast::UpdateTarget;
13988    match target {
13989        UpdateTarget::Rows => {
13990            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
13991        }
13992        UpdateTarget::Documents => {
13993            matches!(
13994                returning_row_item_kind(entity),
13995                Some(ReturningRowKind::Document)
13996            )
13997        }
13998        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
13999        UpdateTarget::Nodes => matches!(
14000            (&entity.kind, &entity.data),
14001            (
14002                crate::storage::EntityKind::GraphNode(_),
14003                crate::storage::EntityData::Node(_)
14004            )
14005        ),
14006        UpdateTarget::Edges => matches!(
14007            (&entity.kind, &entity.data),
14008            (
14009                crate::storage::EntityKind::GraphEdge(_),
14010                crate::storage::EntityData::Edge(_)
14011            )
14012        ),
14013    }
14014}
14015
/// Classification of a row-shaped entity as produced by
/// `returning_row_item_kind`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ReturningRowKind {
    /// A row that is neither key/value-shaped nor carries a JSON value.
    Row,
    /// A row with at least one JSON-valued field.
    Document,
    /// A row whose fields are all named `key` or `value`.
    Kv,
}
14022
14023fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
14024    let row = entity.data.as_row()?;
14025    let is_kv = row.iter_fields().all(|(column, _)| {
14026        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
14027    });
14028    if is_kv {
14029        return Some(ReturningRowKind::Kv);
14030    }
14031    let is_document = row
14032        .iter_fields()
14033        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
14034    if is_document {
14035        Some(ReturningRowKind::Document)
14036    } else {
14037        Some(ReturningRowKind::Row)
14038    }
14039}
14040
14041fn requested_table_columns_for_policy(
14042    table: &crate::storage::query::ast::TableQuery,
14043) -> Vec<String> {
14044    use crate::storage::query::sql_lowering::{
14045        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
14046        effective_table_projections,
14047    };
14048
14049    let table_name = table.table.as_str();
14050    let table_alias = table.alias.as_deref();
14051    let mut columns = std::collections::BTreeSet::new();
14052
14053    for projection in effective_table_projections(table) {
14054        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
14055    }
14056    if let Some(filter) = effective_table_filter(table) {
14057        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
14058    }
14059    for expr in effective_table_group_by_exprs(table) {
14060        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
14061    }
14062    if let Some(filter) = effective_table_having_filter(table) {
14063        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
14064    }
14065    for order in &table.order_by {
14066        if let Some(expr) = order.expr.as_ref() {
14067            collect_expr_columns(expr, table_name, table_alias, &mut columns);
14068        } else {
14069            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
14070        }
14071    }
14072
14073    columns.into_iter().collect()
14074}
14075
14076fn collect_projection_columns(
14077    projection: &crate::storage::query::ast::Projection,
14078    table_name: &str,
14079    table_alias: Option<&str>,
14080    columns: &mut std::collections::BTreeSet<String>,
14081) {
14082    use crate::storage::query::ast::Projection;
14083    match projection {
14084        Projection::All => {
14085            columns.insert("*".to_string());
14086        }
14087        Projection::Column(column) | Projection::Alias(column, _) => {
14088            if column != "*" {
14089                columns.insert(column.clone());
14090            }
14091        }
14092        Projection::Function(_, args) => {
14093            for arg in args {
14094                collect_projection_columns(arg, table_name, table_alias, columns);
14095            }
14096        }
14097        Projection::Expression(filter, _) => {
14098            collect_filter_columns(filter, table_name, table_alias, columns);
14099        }
14100        Projection::Field(field, _) => {
14101            collect_field_ref_column(field, table_name, table_alias, columns);
14102        }
14103        // Slice 7a (#589): no runtime support yet; recurse into args so
14104        // any column references are still tracked in case a future
14105        // executor needs the column set.
14106        Projection::Window { args, .. } => {
14107            for arg in args {
14108                collect_projection_columns(arg, table_name, table_alias, columns);
14109            }
14110        }
14111    }
14112}
14113
14114fn collect_filter_columns(
14115    filter: &crate::storage::query::ast::Filter,
14116    table_name: &str,
14117    table_alias: Option<&str>,
14118    columns: &mut std::collections::BTreeSet<String>,
14119) {
14120    use crate::storage::query::ast::Filter;
14121    match filter {
14122        Filter::Compare { field, .. }
14123        | Filter::IsNull(field)
14124        | Filter::IsNotNull(field)
14125        | Filter::In { field, .. }
14126        | Filter::Between { field, .. }
14127        | Filter::Like { field, .. }
14128        | Filter::StartsWith { field, .. }
14129        | Filter::EndsWith { field, .. }
14130        | Filter::Contains { field, .. } => {
14131            collect_field_ref_column(field, table_name, table_alias, columns);
14132        }
14133        Filter::CompareFields { left, right, .. } => {
14134            collect_field_ref_column(left, table_name, table_alias, columns);
14135            collect_field_ref_column(right, table_name, table_alias, columns);
14136        }
14137        Filter::CompareExpr { lhs, rhs, .. } => {
14138            collect_expr_columns(lhs, table_name, table_alias, columns);
14139            collect_expr_columns(rhs, table_name, table_alias, columns);
14140        }
14141        Filter::And(left, right) | Filter::Or(left, right) => {
14142            collect_filter_columns(left, table_name, table_alias, columns);
14143            collect_filter_columns(right, table_name, table_alias, columns);
14144        }
14145        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
14146    }
14147}
14148
14149fn collect_expr_columns(
14150    expr: &crate::storage::query::ast::Expr,
14151    table_name: &str,
14152    table_alias: Option<&str>,
14153    columns: &mut std::collections::BTreeSet<String>,
14154) {
14155    use crate::storage::query::ast::Expr;
14156    match expr {
14157        Expr::Column { field, .. } => {
14158            collect_field_ref_column(field, table_name, table_alias, columns);
14159        }
14160        Expr::Literal { .. } | Expr::Parameter { .. } => {}
14161        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
14162            collect_expr_columns(operand, table_name, table_alias, columns);
14163        }
14164        Expr::BinaryOp { lhs, rhs, .. } => {
14165            collect_expr_columns(lhs, table_name, table_alias, columns);
14166            collect_expr_columns(rhs, table_name, table_alias, columns);
14167        }
14168        Expr::FunctionCall { args, .. } => {
14169            for arg in args {
14170                collect_expr_columns(arg, table_name, table_alias, columns);
14171            }
14172        }
14173        Expr::Case {
14174            branches, else_, ..
14175        } => {
14176            for (condition, value) in branches {
14177                collect_expr_columns(condition, table_name, table_alias, columns);
14178                collect_expr_columns(value, table_name, table_alias, columns);
14179            }
14180            if let Some(value) = else_ {
14181                collect_expr_columns(value, table_name, table_alias, columns);
14182            }
14183        }
14184        Expr::IsNull { operand, .. } => {
14185            collect_expr_columns(operand, table_name, table_alias, columns);
14186        }
14187        Expr::InList { target, values, .. } => {
14188            collect_expr_columns(target, table_name, table_alias, columns);
14189            for value in values {
14190                collect_expr_columns(value, table_name, table_alias, columns);
14191            }
14192        }
14193        Expr::Between {
14194            target, low, high, ..
14195        } => {
14196            collect_expr_columns(target, table_name, table_alias, columns);
14197            collect_expr_columns(low, table_name, table_alias, columns);
14198            collect_expr_columns(high, table_name, table_alias, columns);
14199        }
14200        Expr::Subquery { .. } => {}
14201        Expr::WindowFunctionCall { args, window, .. } => {
14202            for arg in args {
14203                collect_expr_columns(arg, table_name, table_alias, columns);
14204            }
14205            for e in &window.partition_by {
14206                collect_expr_columns(e, table_name, table_alias, columns);
14207            }
14208            for o in &window.order_by {
14209                collect_expr_columns(&o.expr, table_name, table_alias, columns);
14210            }
14211        }
14212    }
14213}
14214
14215fn collect_field_ref_column(
14216    field: &crate::storage::query::ast::FieldRef,
14217    table_name: &str,
14218    table_alias: Option<&str>,
14219    columns: &mut std::collections::BTreeSet<String>,
14220) {
14221    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
14222        if column != "*" {
14223            columns.insert(column);
14224        }
14225    }
14226}
14227
14228fn policy_column_name_from_field_ref(
14229    field: &crate::storage::query::ast::FieldRef,
14230    table_name: &str,
14231    table_alias: Option<&str>,
14232) -> Option<String> {
14233    match field {
14234        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
14235            if column == "*" {
14236                return Some("*".to_string());
14237            }
14238            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
14239                Some(column.clone())
14240            } else {
14241                Some(format!("{table}.{column}"))
14242            }
14243        }
14244        _ => None,
14245    }
14246}
14247
14248fn legacy_resource_to_iam(
14249    resource: &crate::auth::privileges::Resource,
14250    tenant: Option<&str>,
14251) -> crate::auth::policies::ResourceRef {
14252    use crate::auth::privileges::Resource;
14253
14254    let (kind, name) = match resource {
14255        Resource::Database => ("database".to_string(), "*".to_string()),
14256        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
14257        Resource::Table { schema, table } => (
14258            "table".to_string(),
14259            match schema {
14260                Some(s) => format!("{s}.{table}"),
14261                None => table.clone(),
14262            },
14263        ),
14264        Resource::Function { schema, name } => (
14265            "function".to_string(),
14266            match schema {
14267                Some(s) => format!("{s}.{name}"),
14268                None => name.clone(),
14269            },
14270        ),
14271    };
14272
14273    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
14274    if let Some(t) = tenant {
14275        out = out.with_tenant(t.to_string());
14276    }
14277    out
14278}
14279
/// One table operand of a join, as produced by `table_side_context`.
#[derive(Debug)]
struct JoinTableSide {
    /// Name of the table.
    table: String,
    /// Alias the table is referred to by; `table_side_context` falls back
    /// to the table name when the query gives no alias.
    alias: String,
}
14285
14286fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
14287    match expr {
14288        QueryExpr::Table(table) => Some(JoinTableSide {
14289            table: table.table.clone(),
14290            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
14291        }),
14292        _ => None,
14293    }
14294}
14295
14296fn collect_projection_columns_for_table(
14297    projection: &Projection,
14298    table: &str,
14299    alias: Option<&str>,
14300    out: &mut BTreeSet<String>,
14301) {
14302    match projection {
14303        Projection::Column(column) | Projection::Alias(column, _) => {
14304            match split_qualified_column(column) {
14305                Some((qualifier, column))
14306                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
14307                {
14308                    push_policy_column(column, out);
14309                }
14310                Some(_) => {}
14311                None => push_policy_column(column, out),
14312            }
14313        }
14314        Projection::Field(
14315            FieldRef::TableColumn {
14316                table: qualifier,
14317                column,
14318            },
14319            _,
14320        ) => {
14321            if qualifier.is_empty()
14322                || qualifier == table
14323                || alias.is_some_and(|alias| qualifier == alias)
14324            {
14325                push_policy_column(column, out);
14326            }
14327        }
14328        Projection::Field(
14329            FieldRef::NodeProperty {
14330                alias: qualifier,
14331                property,
14332            },
14333            _,
14334        )
14335        | Projection::Field(
14336            FieldRef::EdgeProperty {
14337                alias: qualifier,
14338                property,
14339            },
14340            _,
14341        ) => {
14342            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
14343                push_policy_column(property, out);
14344            }
14345        }
14346        Projection::Function(_, args) => {
14347            for arg in args {
14348                collect_projection_columns_for_table(arg, table, alias, out);
14349            }
14350        }
14351        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
14352        Projection::Window { args, .. } => {
14353            for arg in args {
14354                collect_projection_columns_for_table(arg, table, alias, out);
14355            }
14356        }
14357    }
14358}
14359
14360fn collect_projection_columns_for_join_side(
14361    projection: &Projection,
14362    left: Option<&JoinTableSide>,
14363    right: Option<&JoinTableSide>,
14364    out: &mut HashMap<String, BTreeSet<String>>,
14365) -> RedDBResult<()> {
14366    match projection {
14367        Projection::Column(column) | Projection::Alias(column, _) => {
14368            if let Some((qualifier, column)) = split_qualified_column(column) {
14369                push_qualified_join_column(qualifier, column, left, right, out);
14370            } else {
14371                push_unqualified_join_column(column, left, right, out);
14372            }
14373        }
14374        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
14375            if table.is_empty() {
14376                push_unqualified_join_column(column, left, right, out);
14377            } else if let Some(side) = [left, right]
14378                .into_iter()
14379                .flatten()
14380                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
14381            {
14382                push_join_column(&side.table, column, out);
14383            }
14384        }
14385        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
14386        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
14387            push_qualified_join_column(alias, property, left, right, out);
14388        }
14389        Projection::Function(_, args) => {
14390            for arg in args {
14391                collect_projection_columns_for_join_side(arg, left, right, out)?;
14392            }
14393        }
14394        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
14395        Projection::Window { args, .. } => {
14396            for arg in args {
14397                collect_projection_columns_for_join_side(arg, left, right, out)?;
14398            }
14399        }
14400    }
14401    Ok(())
14402}
14403
/// Split `qualifier.column` at its first `.`.
///
/// Returns `None` when there is no `.`, when either part is empty, or
/// when the column part contains a further `.`.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    column.split_once('.').filter(|(qualifier, name)| {
        !qualifier.is_empty() && !name.is_empty() && !name.contains('.')
    })
}
14411
14412fn push_qualified_join_column(
14413    qualifier: &str,
14414    column: &str,
14415    left: Option<&JoinTableSide>,
14416    right: Option<&JoinTableSide>,
14417    out: &mut HashMap<String, BTreeSet<String>>,
14418) {
14419    if let Some(side) = [left, right]
14420        .into_iter()
14421        .flatten()
14422        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
14423    {
14424        push_join_column(&side.table, column, out);
14425    }
14426}
14427
14428fn push_unqualified_join_column(
14429    column: &str,
14430    left: Option<&JoinTableSide>,
14431    right: Option<&JoinTableSide>,
14432    out: &mut HashMap<String, BTreeSet<String>>,
14433) {
14434    for side in [left, right].into_iter().flatten() {
14435        push_join_column(&side.table, column, out);
14436    }
14437}
14438
14439fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
14440    if is_policy_column_name(column) {
14441        out.entry(table.to_string())
14442            .or_default()
14443            .insert(column.to_string());
14444    }
14445}
14446
14447fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
14448    if is_policy_column_name(column) {
14449        out.insert(column.to_string());
14450    }
14451}
14452
/// Report whether `column` is a name that should be tracked for column
/// policies: non-empty, not the `*` wildcard, and not starting with
/// `LIT:` or `TYPE:` (presumably synthetic projection markers — confirm
/// against the parser).
fn is_policy_column_name(column: &str) -> bool {
    const EXCLUDED_PREFIXES: [&str; 2] = ["LIT:", "TYPE:"];
    if column.is_empty() || column == "*" {
        return false;
    }
    !EXCLUDED_PREFIXES
        .iter()
        .any(|prefix| column.starts_with(*prefix))
}
14459
14460fn runtime_iam_context(
14461    role: crate::auth::Role,
14462    tenant: Option<&str>,
14463    principal_is_system_owned: bool,
14464) -> crate::auth::policies::EvalContext {
14465    crate::auth::policies::EvalContext {
14466        principal_tenant: tenant.map(|t| t.to_string()),
14467        current_tenant: tenant.map(|t| t.to_string()),
14468        peer_ip: None,
14469        mfa_present: false,
14470        now_ms: crate::auth::now_ms(),
14471        principal_is_admin_role: role == crate::auth::Role::Admin,
14472        principal_is_system_owned,
14473        principal_is_platform_scoped: tenant.is_none(),
14474    }
14475}
14476
14477fn explicit_table_projection_columns(
14478    query: &crate::storage::query::ast::TableQuery,
14479) -> Vec<String> {
14480    use crate::storage::query::ast::{FieldRef, Projection};
14481
14482    let mut columns = Vec::new();
14483    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
14484        match projection {
14485            Projection::Column(column) | Projection::Alias(column, _) => {
14486                push_unique(&mut columns, column)
14487            }
14488            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
14489                push_unique(&mut columns, column)
14490            }
14491            // SELECT * and expression/function projections need the
14492            // executor-wide column-policy context mapped in
14493            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
14494            _ => {}
14495        }
14496    }
14497    columns
14498}
14499
14500fn explicit_graph_projection_properties(
14501    query: &crate::storage::query::ast::GraphQuery,
14502) -> Vec<String> {
14503    use crate::storage::query::ast::{FieldRef, Projection};
14504
14505    let mut columns = Vec::new();
14506    for projection in &query.return_ {
14507        match projection {
14508            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
14509            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
14510                push_unique(&mut columns, property.clone())
14511            }
14512            _ => {}
14513        }
14514    }
14515    columns
14516}
14517
/// Append `column` to `columns` unless an equal entry is already there.
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
14523
14524fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
14525    use crate::storage::query::ast::PolicyPrincipalRef;
14526    match p {
14527        PolicyPrincipalRef::User(u) => match &u.tenant {
14528            Some(t) => format!("user:{t}/{}", u.username),
14529            None => format!("user:{}", u.username),
14530        },
14531        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
14532    }
14533}
14534
14535/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
14536/// shape used by every audit emit + the simulator response.
14537pub(crate) fn decision_to_strings(
14538    d: &crate::auth::policies::Decision,
14539) -> (String, Option<String>, Option<String>) {
14540    use crate::auth::policies::Decision;
14541    match d {
14542        Decision::Allow {
14543            matched_policy_id,
14544            matched_sid,
14545        } => (
14546            "allow".into(),
14547            Some(matched_policy_id.clone()),
14548            matched_sid.clone(),
14549        ),
14550        Decision::Deny {
14551            matched_policy_id,
14552            matched_sid,
14553        } => (
14554            "deny".into(),
14555            Some(matched_policy_id.clone()),
14556            matched_sid.clone(),
14557        ),
14558        Decision::DefaultDeny => ("default_deny".into(), None, None),
14559        Decision::AdminBypass => ("admin_bypass".into(), None, None),
14560    }
14561}
14562
14563fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
14564    let mut scopes = Vec::new();
14565    collect_relation_scopes(query, &mut scopes);
14566    scopes.sort();
14567    scopes.dedup();
14568    scopes
14569}
14570
14571fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
14572    match query {
14573        QueryExpr::Table(table) => {
14574            if !table.table.is_empty() {
14575                scopes.push(table.table.clone());
14576            }
14577            if let Some(alias) = &table.alias {
14578                scopes.push(alias.clone());
14579            }
14580        }
14581        QueryExpr::Join(join) => {
14582            collect_relation_scopes(&join.left, scopes);
14583            collect_relation_scopes(&join.right, scopes);
14584        }
14585        _ => {}
14586    }
14587}
14588
14589fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
14590    let inner_scopes = relation_scopes_for_query(query);
14591    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
14592}
14593
14594fn query_expr_references_outer_scope(
14595    query: &QueryExpr,
14596    outer_scopes: &[String],
14597    inner_scopes: &[String],
14598) -> bool {
14599    match query {
14600        QueryExpr::Table(table) => {
14601            table.select_items.iter().any(|item| match item {
14602                crate::storage::query::ast::SelectItem::Wildcard => false,
14603                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
14604                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
14605                }
14606            }) || table
14607                .where_expr
14608                .as_ref()
14609                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
14610                || table.filter.as_ref().is_some_and(|filter| {
14611                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
14612                })
14613                || table.having_expr.as_ref().is_some_and(|expr| {
14614                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
14615                })
14616                || table.having.as_ref().is_some_and(|filter| {
14617                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
14618                })
14619                || table
14620                    .group_by_exprs
14621                    .iter()
14622                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
14623                || table.order_by.iter().any(|clause| {
14624                    clause.expr.as_ref().is_some_and(|expr| {
14625                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
14626                    })
14627                })
14628        }
14629        QueryExpr::Join(join) => {
14630            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
14631                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
14632                || join.filter.as_ref().is_some_and(|filter| {
14633                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
14634                })
14635                || join.return_items.iter().any(|item| match item {
14636                    crate::storage::query::ast::SelectItem::Wildcard => false,
14637                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
14638                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
14639                    }
14640                })
14641        }
14642        _ => false,
14643    }
14644}
14645
14646fn filter_references_outer_scope(
14647    filter: &crate::storage::query::ast::Filter,
14648    outer_scopes: &[String],
14649    inner_scopes: &[String],
14650) -> bool {
14651    use crate::storage::query::ast::Filter;
14652    match filter {
14653        Filter::Compare { field, .. }
14654        | Filter::IsNull(field)
14655        | Filter::IsNotNull(field)
14656        | Filter::In { field, .. }
14657        | Filter::Between { field, .. }
14658        | Filter::Like { field, .. }
14659        | Filter::StartsWith { field, .. }
14660        | Filter::EndsWith { field, .. }
14661        | Filter::Contains { field, .. } => {
14662            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
14663        }
14664        Filter::CompareFields { left, right, .. } => {
14665            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
14666                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
14667        }
14668        Filter::CompareExpr { lhs, rhs, .. } => {
14669            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
14670                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
14671        }
14672        Filter::And(left, right) | Filter::Or(left, right) => {
14673            filter_references_outer_scope(left, outer_scopes, inner_scopes)
14674                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
14675        }
14676        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
14677    }
14678}
14679
14680fn expr_references_outer_scope(
14681    expr: &crate::storage::query::ast::Expr,
14682    outer_scopes: &[String],
14683    inner_scopes: &[String],
14684) -> bool {
14685    use crate::storage::query::ast::Expr;
14686    match expr {
14687        Expr::Column { field, .. } => {
14688            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
14689        }
14690        Expr::BinaryOp { lhs, rhs, .. } => {
14691            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
14692                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
14693        }
14694        Expr::UnaryOp { operand, .. }
14695        | Expr::Cast { inner: operand, .. }
14696        | Expr::IsNull { operand, .. } => {
14697            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
14698        }
14699        Expr::FunctionCall { args, .. } => args
14700            .iter()
14701            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
14702        Expr::Case {
14703            branches, else_, ..
14704        } => {
14705            branches.iter().any(|(cond, value)| {
14706                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
14707                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
14708            }) || else_
14709                .as_ref()
14710                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
14711        }
14712        Expr::InList { target, values, .. } => {
14713            expr_references_outer_scope(target, outer_scopes, inner_scopes)
14714                || values
14715                    .iter()
14716                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
14717        }
14718        Expr::Between {
14719            target, low, high, ..
14720        } => {
14721            expr_references_outer_scope(target, outer_scopes, inner_scopes)
14722                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
14723                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
14724        }
14725        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
14726        Expr::Literal { .. } | Expr::Parameter { .. } => false,
14727        Expr::WindowFunctionCall { args, window, .. } => {
14728            args.iter()
14729                .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes))
14730                || window
14731                    .partition_by
14732                    .iter()
14733                    .any(|e| expr_references_outer_scope(e, outer_scopes, inner_scopes))
14734                || window
14735                    .order_by
14736                    .iter()
14737                    .any(|o| expr_references_outer_scope(&o.expr, outer_scopes, inner_scopes))
14738        }
14739    }
14740}
14741
14742fn field_ref_references_outer_scope(
14743    field: &crate::storage::query::ast::FieldRef,
14744    outer_scopes: &[String],
14745    inner_scopes: &[String],
14746) -> bool {
14747    match field {
14748        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
14749            outer_scopes.iter().any(|scope| scope == table)
14750                && !inner_scopes.iter().any(|scope| scope == table)
14751        }
14752        _ => false,
14753    }
14754}
14755
14756fn first_column_values(
14757    result: crate::storage::query::unified::UnifiedResult,
14758) -> RedDBResult<Vec<Value>> {
14759    if result.columns.len() > 1 {
14760        return Err(RedDBError::Query(
14761            "expression subquery must return exactly one column".to_string(),
14762        ));
14763    }
14764    let fallback_column = result
14765        .records
14766        .first()
14767        .and_then(|record| record.column_names().into_iter().next())
14768        .map(|name| name.to_string());
14769    let column = result.columns.first().cloned().or(fallback_column);
14770    let Some(column) = column else {
14771        return Ok(Vec::new());
14772    };
14773    Ok(result
14774        .records
14775        .iter()
14776        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
14777        .collect())
14778}
14779
/// Parses a timestamp literal into milliseconds since the Unix epoch.
///
/// Two forms are accepted:
///
/// * a bare non-negative integer, returned verbatim as milliseconds;
/// * a `YYYY-MM-DD` date, optionally followed by whitespace and further text
///   (e.g. a time of day). Only the date portion is used, so the result is
///   midnight of that day. Full RFC 3339 parsing is not implemented.
///
/// Returns `None` when the input matches neither form, when the month is
/// outside `1..=12`, when the day is outside `1..=31`, when the year is
/// implausibly large, or when the date falls before 1970-01-01 (which cannot
/// be represented in the unsigned return type).
///
/// The day is not validated against the length of the month: `2030-02-31`
/// rolls over into March.
fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
    const MS_PER_DAY: u128 = 86_400_000;
    // Upper bound that keeps the `i64` arithmetic in `days_from_civil` far
    // away from overflow while still being absurdly generous.
    const MAX_YEAR: i64 = 1_000_000_000;

    // Bare integer ms.
    if let Ok(ms) = s.parse::<u128>() {
        return Some(ms);
    }

    // Date form: take the first whitespace-delimited token and require
    // exactly three `-`-separated components.
    let date = s.split_whitespace().next()?;
    let mut parts = date.split('-');
    let (Some(year), Some(month), Some(day), None) =
        (parts.next(), parts.next(), parts.next(), parts.next())
    else {
        return None;
    };
    let year = year.parse::<i64>().ok()?;
    let month = month.parse::<u32>().ok()?;
    let day = day.parse::<u32>().ok()?;

    // Out-of-range components used to underflow/overflow the day arithmetic
    // (e.g. day `0`) or silently produce a meaningless date (e.g. month `13`).
    if year > MAX_YEAR || !(1..=12).contains(&month) || !(1..=31).contains(&day) {
        return None;
    }

    let days = days_from_civil(year, month, day);
    if days < 0 {
        // Pre-epoch dates would otherwise wrap to a huge unsigned value.
        return None;
    }
    // `days` is non-negative here, so `unsigned_abs` is a lossless conversion.
    Some(u128::from(days.unsigned_abs()) * MS_PER_DAY)
}

/// Days since 1970-01-01 for a proleptic Gregorian calendar date, using
/// H. Hinnant's `days_from_civil` algorithm. Dates before the epoch yield a
/// negative count.
///
/// `m` is expected in `1..=12` and `d` in `1..=31`; the arithmetic is done in
/// signed integers so that values just outside those ranges (such as `d == 0`)
/// do not underflow, but the result for them is only a linear extrapolation.
/// Used by `parse_timestamp_to_ms`.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    let month = i64::from(m);
    // Treat March as the first month of the year so the leap day falls last.
    let y = if month <= 2 { y - 1 } else { y };
    let era = y.div_euclid(400);
    let yoe = y.rem_euclid(400); // [0, 399]
    let month_from_march = if month > 2 { month - 3 } else { month + 9 }; // [0, 11]
    let doy = (153 * month_from_march + 2) / 5 + i64::from(d) - 1; // [0, 365]
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // [0, 146096]
    // 719_468 is the day count from 0000-03-01 to 1970-01-01.
    era * 146_097 + doe - 719_468
}
14814
14815fn walk_plan_node(
14816    node: &crate::storage::query::planner::CanonicalLogicalNode,
14817    depth: usize,
14818    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
14819) {
14820    use std::sync::Arc;
14821    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
14822    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
14823    rec.set_arc(
14824        Arc::from("source"),
14825        node.source.clone().map(Value::text).unwrap_or(Value::Null),
14826    );
14827    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
14828    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
14829    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
14830    out.push(rec);
14831    for child in &node.children {
14832        walk_plan_node(child, depth + 1, out);
14833    }
14834}
14835
#[cfg(test)]
mod inline_graph_tvf_tests {
    use super::*;

    /// Parses `sql` and returns the result-cache scopes derived from the
    /// parsed query expression.
    fn scopes_for(sql: &str) -> HashSet<String> {
        let parsed = crate::storage::query::parser::parse(sql).expect("parse");
        query_expr_result_cache_scopes(&parsed.query)
    }

    #[test]
    fn inline_tvf_cache_scopes_include_source_collections() {
        // For the inline form, the result-cache key has to be derived from
        // the collections behind `nodes` and `edges`, so that a write to
        // either one invalidates the cached result (issue #799).
        let sql = "SELECT * FROM components(nodes => (SELECT id FROM hosts), edges => (SELECT src, dst FROM links))";
        let scopes = scopes_for(sql);
        assert!(scopes.contains("hosts"), "nodes source scoped: {scopes:?}");
        assert!(scopes.contains("links"), "edges source scoped: {scopes:?}");
    }

    #[test]
    fn graph_collection_tvf_has_no_cache_scope() {
        // The graph-collection form reads the entire graph store, so it is
        // not tied to any named collection (issue #795 behaviour preserved).
        let sql = "SELECT * FROM components(g)";
        let scopes = scopes_for(sql);
        assert!(scopes.is_empty(), "collection form unscoped: {scopes:?}");
    }

    #[test]
    fn abstract_degree_centrality_counts_undirected_endpoints() {
        // Path graph a - b - c: the middle node touches both edges.
        let nodes: Vec<String> = ["a", "b", "c"].iter().map(|id| id.to_string()).collect();
        let edges = vec![
            (String::from("a"), String::from("b"), 1.0_f32),
            (String::from("b"), String::from("c"), 1.0_f32),
        ];
        let degrees = abstract_degree_centrality(&nodes, &edges);
        assert_eq!(
            degrees,
            vec![
                (String::from("a"), 1),
                (String::from("b"), 2),
                (String::from("c"), 1),
            ]
        );
    }
}