Skip to main content

reddb_server/runtime/
impl_core.rs

1use super::*;
2use crate::auth::column_policy_gate::ColumnAccessRequest;
3use crate::auth::UserId;
4use crate::replication::cdc::ChangeRecord;
5use crate::storage::query::ast::TableSource;
6
7/// Read a numeric score column out of a result record as `f64`, matching
8/// the column name case-insensitively. Used by the leaderboard-rank head
9/// walk (#918) to compare scores; non-numeric / missing columns yield
10/// `None` so a row with no comparable score never shifts a rank.
11fn record_column_f64(
12    rec: &crate::storage::query::unified::UnifiedRecord,
13    column: &str,
14) -> Option<f64> {
15    let value = rec
16        .get(column)
17        .or_else(|| rec.get(&column.to_lowercase()))?;
18    match value {
19        Value::Integer(n) => Some(*n as f64),
20        Value::UnsignedInteger(n) => Some(*n as f64),
21        Value::Float(n) => Some(*n),
22        Value::Timestamp(n) | Value::Duration(n) => Some(*n as f64),
23        _ => None,
24    }
25}
26
27fn record_rid_u64(rec: &crate::storage::query::unified::UnifiedRecord) -> Option<u64> {
28    match rec.get("rid") {
29        Some(Value::UnsignedInteger(n)) => Some(*n),
30        Some(Value::Integer(n)) if *n >= 0 => Some(*n as u64),
31        _ => None,
32    }
33}
34
35fn seed_storage_deploy_config(
36    store: &crate::storage::UnifiedStore,
37    selection: crate::storage::StorageProfileSelection,
38) {
39    store.set_config_tree(
40        "storage.deploy",
41        &crate::json!({
42            "profile": selection.deploy_profile.as_str(),
43            "packaging": selection.packaging.as_str(),
44            "preset": selection.preset_name(),
45            "replica_count": selection.replica_count,
46            "managed_backup": selection.managed_backup,
47            "wal_retention": selection.wal_retention,
48        }),
49    );
50}
51
52struct RankedHeadEntry {
53    rank: u64,
54    record: crate::storage::query::unified::UnifiedRecord,
55}
56
57fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
58    match value {
59        Value::Text(s) => Ok(s.to_string()),
60        Value::Integer(n) => Ok(n.to_string()),
61        Value::UnsignedInteger(n) => Ok(n.to_string()),
62        Value::Float(n) => Ok(n.to_string()),
63        Value::Boolean(b) => Ok(b.to_string()),
64        Value::Null => Err(RedDBError::Query(
65            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
66                .to_string(),
67        )),
68        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
69            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
70                .to_string(),
71        )),
72        _ => Err(RedDBError::Query(format!(
73            "SET SECRET does not support value type {:?} yet",
74            value.data_type()
75        ))),
76    }
77}
78
79fn insert_config_json_path(
80    root: &mut crate::serde_json::Value,
81    path: &str,
82    value: crate::serde_json::Value,
83) {
84    let segments: Vec<&str> = path
85        .split('.')
86        .filter(|segment| !segment.is_empty())
87        .collect();
88    insert_config_json_segments(root, &segments, value);
89}
90
91fn insert_config_json_segments(
92    root: &mut crate::serde_json::Value,
93    segments: &[&str],
94    value: crate::serde_json::Value,
95) {
96    if segments.is_empty() {
97        *root = value;
98        return;
99    }
100
101    if !matches!(root, crate::serde_json::Value::Object(_)) {
102        *root = crate::serde_json::Value::Object(crate::serde_json::Map::new());
103    }
104
105    let crate::serde_json::Value::Object(map) = root else {
106        return;
107    };
108    if segments.len() == 1 {
109        map.insert(segments[0].to_string(), value);
110        return;
111    }
112    let entry = map
113        .entry(segments[0].to_string())
114        .or_insert_with(|| crate::serde_json::Value::Object(crate::serde_json::Map::new()));
115    insert_config_json_segments(entry, &segments[1..], value);
116}
117
118fn show_config_json_result(
119    query: &str,
120    mode: crate::storage::query::modes::QueryMode,
121    prefix: &Option<String>,
122    value: crate::serde_json::Value,
123) -> RuntimeQueryResult {
124    let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
125    let mut record = UnifiedRecord::new();
126    record.set(
127        "key",
128        prefix
129            .as_ref()
130            .map(|key| Value::text(key.clone()))
131            .unwrap_or(Value::Null),
132    );
133    record.set("value", Value::Json(value.to_string_compact().into_bytes()));
134    result.push(record);
135    RuntimeQueryResult {
136        query: query.to_string(),
137        mode,
138        statement: "show_config_json",
139        engine: "runtime-config",
140        result,
141        affected_rows: 0,
142        statement_type: "select",
143        bookmark: None,
144    }
145}
146
147#[derive(Clone)]
148struct QueryControlEventSpec {
149    kind: crate::runtime::control_events::EventKind,
150    action: &'static str,
151    resource: Option<String>,
152    fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
153}
154
/// What a query touches, as far as auditing is concerned.
#[derive(Clone)]
struct QueryAuditPlan {
    /// Statement label: `"select"`, `"insert"`, `"update"` or `"delete"`.
    statement_kind: &'static str,
    /// De-duplicated user collections referenced by the statement.
    collections: Vec<String>,
}
160
161fn query_audit_plan(expr: &QueryExpr) -> Option<QueryAuditPlan> {
162    let mut collections = Vec::new();
163    let statement_kind = match expr {
164        QueryExpr::Table(table) => {
165            push_query_audit_collection(&mut collections, &table.table);
166            "select"
167        }
168        QueryExpr::Join(join) => {
169            collect_query_audit_collections(&join.left, &mut collections);
170            collect_query_audit_collections(&join.right, &mut collections);
171            "select"
172        }
173        QueryExpr::Insert(insert) => {
174            push_query_audit_collection(&mut collections, &insert.table);
175            "insert"
176        }
177        QueryExpr::Update(update) => {
178            push_query_audit_collection(&mut collections, &update.table);
179            "update"
180        }
181        QueryExpr::Delete(delete) => {
182            push_query_audit_collection(&mut collections, &delete.table);
183            "delete"
184        }
185        _ => return None,
186    };
187    if collections.is_empty() {
188        None
189    } else {
190        Some(QueryAuditPlan {
191            statement_kind,
192            collections,
193        })
194    }
195}
196
197fn collect_query_audit_collections(expr: &QueryExpr, collections: &mut Vec<String>) {
198    match expr {
199        QueryExpr::Table(table) => push_query_audit_collection(collections, &table.table),
200        QueryExpr::Join(join) => {
201            collect_query_audit_collections(&join.left, collections);
202            collect_query_audit_collections(&join.right, collections);
203        }
204        _ => {}
205    }
206}
207
/// Append `name` to `collections` unless it is internal or already listed.
///
/// Internal names are `red` itself and anything starting with `red.` or
/// `__red_schema_`; those are never audited.
fn push_query_audit_collection(collections: &mut Vec<String>, name: &str) {
    const INTERNAL_PREFIXES: [&str; 2] = ["red.", "__red_schema_"];

    let is_internal = name == "red"
        || INTERNAL_PREFIXES
            .iter()
            .any(|prefix| name.starts_with(prefix));
    if is_internal {
        return;
    }

    let already_listed = collections.iter().any(|existing| existing == name);
    if !already_listed {
        collections.push(name.to_string());
    }
}
216
/// Collection name `red_index_registry`.
// NOTE(review): presumably the collection backing the runtime index
// registry; its users are outside this part of the file — confirm.
const RUNTIME_INDEX_REGISTRY_COLLECTION: &str = "red_index_registry";
218
219impl RedDBRuntime {
220    fn execute_create_metric(
221        &self,
222        raw_query: &str,
223        query: &crate::storage::query::ast::CreateMetricQuery,
224    ) -> RedDBResult<RuntimeQueryResult> {
225        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
226        let store = self.inner.db.store();
227        super::metric_descriptor_catalog::create(
228            store.as_ref(),
229            &query.path,
230            &query.kind,
231            &query.role,
232            super::metric_descriptor_catalog::DerivedSpec {
233                source: query.source.clone(),
234                query: query.query.clone(),
235                window_ms: query.window_ms,
236                time_field: query.time_field.clone(),
237            },
238        )?;
239        self.invalidate_result_cache();
240        Ok(RuntimeQueryResult::ok_message(
241            raw_query.to_string(),
242            &format!("metric descriptor '{}' created", query.path),
243            "create",
244        ))
245    }
246
247    /// `CREATE RANKING <name> ON <table> (<column> [ASC|DESC]) [TOP <k>]`
248    /// — declare a Ranking capability over an ordinary table's score
249    /// column (issue #918 / ADR 0035). Persists a WAL-backed catalog
250    /// record; no new Collection model is introduced. Authorized through
251    /// the same DDL write gate as `CREATE METRIC`/`CREATE INDEX`.
252    fn execute_create_ranking(
253        &self,
254        raw_query: &str,
255        req: super::ranking_descriptor_catalog::CreateRankingRequest,
256    ) -> RedDBResult<RuntimeQueryResult> {
257        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
258        let store = self.inner.db.store();
259        let descriptor = super::ranking_descriptor_catalog::create(store.as_ref(), &req)?;
260        self.invalidate_result_cache();
261        Ok(RuntimeQueryResult::ok_message(
262            raw_query.to_string(),
263            &format!(
264                "ranking '{}' created on {}({})",
265                descriptor.name, descriptor.table, descriptor.column
266            ),
267            "create",
268        ))
269    }
270
271    /// `SHOW RANKINGS` — project the declared Ranking capabilities back as
272    /// rows, so a declared capability is observable (the Analytics
273    /// "prefer SELECT over admin verbs" rule).
274    fn execute_show_rankings(&self, raw_query: &str) -> RedDBResult<RuntimeQueryResult> {
275        let store = self.inner.db.store();
276        let entries = super::ranking_descriptor_catalog::list(store.as_ref());
277        let columns = vec![
278            "name".to_string(),
279            "table".to_string(),
280            "column".to_string(),
281            "direction".to_string(),
282            "top_k".to_string(),
283        ];
284        let rows = entries
285            .into_iter()
286            .map(|e| {
287                vec![
288                    ("name".to_string(), Value::text(e.name)),
289                    ("table".to_string(), Value::text(e.table)),
290                    ("column".to_string(), Value::text(e.column)),
291                    (
292                        "direction".to_string(),
293                        Value::text(if e.descending { "DESC" } else { "ASC" }.to_string()),
294                    ),
295                    ("top_k".to_string(), Value::UnsignedInteger(e.top_k)),
296                ]
297            })
298            .collect();
299        let mut result =
300            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
301        result.statement = "rank_of";
302        result.engine = "runtime-rank";
303        Ok(result)
304    }
305
306    /// `RANK OF <id> IN <name>` — exact, MVCC-correct rank of a specific
307    /// row within the capability's bounded top-K head (issue #918).
308    ///
309    /// Returns a single `rank` row when the row is visible *and* falls
310    /// inside the exact head; an empty result otherwise (not visible, or
311    /// in the approximate tail — a separate slice). The computation runs
312    /// entirely over the regular read pipeline so it inherits MVCC
313    /// visibility, RLS/policy, and tenant scope from ordinary reads.
314    fn execute_rank_of(
315        &self,
316        raw_query: &str,
317        req: &crate::storage::query::ast::RankOfQuery,
318    ) -> RedDBResult<RuntimeQueryResult> {
319        let store = self.inner.db.store();
320        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
321            .ok_or_else(|| {
322                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
323            })?;
324        let rank = self.compute_exact_head_rank(&descriptor, req.entity_id)?;
325        let columns = vec!["rank".to_string()];
326        let rows = match rank {
327            Some(rank) => vec![vec![("rank".to_string(), Value::UnsignedInteger(rank))]],
328            None => Vec::new(),
329        };
330        let mut result =
331            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
332        result.statement = "rank_range";
333        result.engine = "runtime-rank";
334        Ok(result)
335    }
336
337    /// `RANK RANGE <lo> TO <hi> IN <name>` — exact, MVCC-correct entries
338    /// occupying a contiguous rank range within the bounded top-K head.
339    ///
340    /// The output is in leaderboard order and includes `rank` plus the
341    /// row columns returned by the canonical exact-head SQL read.
342    fn execute_rank_range(
343        &self,
344        raw_query: &str,
345        req: &crate::storage::query::ast::RankRangeQuery,
346    ) -> RedDBResult<RuntimeQueryResult> {
347        let store = self.inner.db.store();
348        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
349            .ok_or_else(|| {
350                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
351            })?;
352        let (head_columns, entries) = self.compute_ranked_head_entries(&descriptor)?;
353
354        let mut columns = Vec::with_capacity(head_columns.len() + 1);
355        columns.push("rank".to_string());
356        for column in &head_columns {
357            if column != "rank" {
358                columns.push(column.clone());
359            }
360        }
361
362        let rows = entries
363            .into_iter()
364            .filter(|entry| entry.rank >= req.lo && entry.rank <= req.hi)
365            .map(|entry| {
366                let mut row = Vec::with_capacity(columns.len());
367                row.push(("rank".to_string(), Value::UnsignedInteger(entry.rank)));
368                for column in &head_columns {
369                    if column == "rank" {
370                        continue;
371                    }
372                    if let Some(value) = entry.record.get(column) {
373                        row.push((column.clone(), value.clone()));
374                    }
375                }
376                row
377            })
378            .collect();
379        let mut result =
380            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
381        result.statement = "approx_rank_of";
382        result.engine = "runtime-rank";
383        Ok(result)
384    }
385
386    /// Compute the exact rank of `target_id` within the descriptor's
387    /// bounded top-K head, or `None` if the row is invisible to the
388    /// querying snapshot or beyond the exact head.
389    ///
390    /// Faithful to ADR 0035: it walks the sorted index head
391    /// (`ORDER BY <col> {DESC|ASC} LIMIT k`, served by
392    /// `try_sorted_index_lookup` + the per-row MVCC visibility re-check)
393    /// and counts only rows visible to the current snapshot. Running the
394    /// head scan through `execute_query_inner` keeps it on the same
395    /// snapshot/tenant/policy frame as ordinary reads, so the rank agrees
396    /// with `ORDER BY <col> {DESC|ASC} LIMIT` under that snapshot by
397    /// construction. RANK semantics: tied scores share a rank, so the
398    /// rank is `1 + (number of strictly-better visible rows)`.
399    fn compute_exact_head_rank(
400        &self,
401        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
402        target_id: u64,
403    ) -> RedDBResult<Option<u64>> {
404        let (_columns, entries) = self.compute_ranked_head_entries(descriptor)?;
405        Ok(entries
406            .into_iter()
407            .find(|entry| record_rid_u64(&entry.record) == Some(target_id))
408            .map(|entry| entry.rank))
409    }
410
411    /// Return the exact head rows in deterministic rank order.
412    fn compute_ranked_head_entries(
413        &self,
414        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
415    ) -> RedDBResult<(Vec<String>, Vec<RankedHeadEntry>)> {
416        let table = &descriptor.table;
417        let column = &descriptor.column;
418
419        // The exact head: top-K rows in rank order. Each row here already
420        // passed MVCC visibility *and* RLS/tenant filtering during the
421        // scan, so identifying the target *within* this result (rather
422        // than via a separate `rid` lookup, which takes the
423        // direct entity-fetch path that bypasses the RLS gate) is what
424        // makes the rank honor policy/tenant scope (criterion 5).
425        let dir = if descriptor.descending { "DESC" } else { "ASC" };
426        let head_sql = format!(
427            "SELECT * FROM {table} ORDER BY {column} {dir}, rid ASC LIMIT {}",
428            descriptor.top_k
429        );
430        let head_result = self.execute_query_inner(&head_sql)?;
431
432        let mut entries = Vec::with_capacity(head_result.result.records.len());
433        let mut row_position = 0u64;
434        let mut current_rank = 0u64;
435        let mut previous_score: Option<f64> = None;
436        for rec in &head_result.result.records {
437            let Some(score) = record_column_f64(rec, column) else {
438                continue;
439            };
440            row_position += 1;
441            current_rank = if previous_score == Some(score) {
442                current_rank
443            } else {
444                row_position
445            };
446            previous_score = Some(score);
447            entries.push(RankedHeadEntry {
448                rank: current_rank,
449                record: rec.clone(),
450            });
451        }
452        Ok((head_result.result.columns, entries))
453    }
454
455    /// `APPROX RANK OF <id> IN <name>` — the *approximate tail* read
456    /// (issue #923 / ADR 0035). Serves an explicitly-approximate
457    /// percentile / rank for an entry below the exact top-K head from a
458    /// per-`(table, column)` score sketch.
459    ///
460    /// The result is always labeled approximate (`approximate = true`,
461    /// distinct from the exact `RANK OF` surface which returns only a bare
462    /// `rank`) so a caller never reads a tail estimate as an exact head
463    /// position. An invisible / non-existent row yields no row, exactly
464    /// like the exact surface.
465    fn execute_approx_rank_of(
466        &self,
467        raw_query: &str,
468        req: &crate::storage::query::ast::RankOfQuery,
469    ) -> RedDBResult<RuntimeQueryResult> {
470        let store = self.inner.db.store();
471        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
472            .ok_or_else(|| {
473                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
474            })?;
475
476        let approx = self.compute_approx_rank(&descriptor, req.entity_id)?;
477        let columns = vec![
478            "rank".to_string(),
479            "percentile".to_string(),
480            "approximate".to_string(),
481        ];
482        let rows = match approx {
483            Some(approx) => vec![vec![
484                ("rank".to_string(), Value::UnsignedInteger(approx.rank)),
485                ("percentile".to_string(), Value::Float(approx.percentile)),
486                ("approximate".to_string(), Value::Boolean(true)),
487            ]],
488            None => Vec::new(),
489        };
490        let mut result =
491            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
492        result.statement = "approx_rank_of";
493        // Tag as `runtime-rank` so the 30s result cache skips this read
494        // (see `should_write_result_cache`). The approximate rank is rebuilt
495        // from a live full scan on every call (criterion 4: it must track
496        // score changes); a cached entry, scoped only to the ranking name and
497        // never the underlying table, would otherwise survive inserts into
498        // that table and serve a stale rank.
499        result.engine = "runtime-rank";
500        Ok(result)
501    }
502
503    /// Refresh the per-`(table, column)` score sketch from the rows visible
504    /// to the current snapshot and return the target's approximate rank, or
505    /// `None` if the target row is invisible to this snapshot / tenant.
506    ///
507    /// The sketch is rebuilt from the live column on each read and persisted
508    /// back to `red_config` keyed by `(table, column)` — so it is maintained
509    /// per `(collection, score column)` and stays current as scores change
510    /// (criterion 4). The scan runs through `execute_query_inner`, inheriting
511    /// the same MVCC snapshot, RLS/tenant scope, and policy as ordinary
512    /// reads. The *approximation* is the histogram bucketing in
513    /// [`super::score_sketch::ScoreSketch`], not the data freshness, so the
514    /// estimate carries the documented error band even though it is built
515    /// from a full scan in this v0 (incremental maintenance is an ADR-0035
516    /// implementation detail, left open and reversible).
517    fn compute_approx_rank(
518        &self,
519        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
520        target_id: u64,
521    ) -> RedDBResult<Option<super::score_sketch::ApproxRank>> {
522        let table = &descriptor.table;
523        let column = &descriptor.column;
524
525        // Scan the visible rows once: it both feeds the sketch and locates
526        // the target's score under the same snapshot/tenant/policy frame.
527        let scan_sql = format!("SELECT * FROM {table}");
528        let scan = self.execute_query_inner(&scan_sql)?;
529        let records = &scan.result.records;
530
531        let mut scores: Vec<f64> = Vec::with_capacity(records.len());
532        let mut target_score: Option<f64> = None;
533        for rec in records {
534            let Some(score) = record_column_f64(rec, column) else {
535                continue;
536            };
537            scores.push(score);
538            let rid = match rec.get("rid") {
539                Some(Value::UnsignedInteger(n)) => Some(*n),
540                Some(Value::Integer(n)) if *n >= 0 => Some(*n as u64),
541                _ => None,
542            };
543            if rid == Some(target_id) {
544                target_score = Some(score);
545            }
546        }
547
548        let sketch = super::score_sketch::ScoreSketch::from_scores(&scores);
549        // Persist the refreshed sketch per (table, column).
550        super::ranking_descriptor_catalog::save_sketch(
551            self.inner.db.store().as_ref(),
552            table,
553            column,
554            &sketch,
555        );
556
557        let Some(target_score) = target_score else {
558            // Not visible to this snapshot/tenant ⇒ no rank (matches exact).
559            return Ok(None);
560        };
561        Ok(sketch.approx_rank(target_score, descriptor.descending))
562    }
563
564    fn execute_alter_metric(
565        &self,
566        raw_query: &str,
567        query: &crate::storage::query::ast::AlterMetricQuery,
568    ) -> RedDBResult<RuntimeQueryResult> {
569        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
570        let store = self.inner.db.store();
571        super::metric_descriptor_catalog::update(
572            store.as_ref(),
573            &query.path,
574            query.set_role.as_deref(),
575            query.attempted_kind.as_deref(),
576            query.attempted_path.as_deref(),
577        )?;
578        self.invalidate_result_cache();
579        Ok(RuntimeQueryResult::ok_message(
580            raw_query.to_string(),
581            &format!("metric descriptor '{}' updated", query.path),
582            "alter",
583        ))
584    }
585
586    fn execute_create_slo(
587        &self,
588        raw_query: &str,
589        query: &crate::storage::query::ast::CreateSloQuery,
590    ) -> RedDBResult<RuntimeQueryResult> {
591        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
592        let store = self.inner.db.store();
593        super::slo_descriptor_catalog::create(
594            store.as_ref(),
595            &query.path,
596            &query.metric_path,
597            query.target,
598            query.window_ms,
599        )?;
600        self.invalidate_result_cache();
601        Ok(RuntimeQueryResult::ok_message(
602            raw_query.to_string(),
603            &format!("SLO descriptor '{}' created", query.path),
604            "create",
605        ))
606    }
607
608    fn execute_create_analytics_source(
609        &self,
610        raw_query: &str,
611        query: super::analytics_source_catalog::CreateAnalyticsSourceProfile,
612    ) -> RedDBResult<RuntimeQueryResult> {
613        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
614        let store = self.inner.db.store();
615        let profile = super::analytics_source_catalog::create(
616            store.as_ref(),
617            &self.inner.db.collection_contracts(),
618            query,
619        )?;
620        self.invalidate_result_cache();
621        Ok(RuntimeQueryResult::ok_message(
622            raw_query.to_string(),
623            &format!("analytics source '{}' created", profile.name),
624            "create",
625        ))
626    }
627}
628
629fn query_control_event_specs(expr: &QueryExpr) -> Vec<QueryControlEventSpec> {
630    use crate::runtime::control_events::{EventKind, Sensitivity};
631
632    let mut specs = Vec::new();
633    let mut schema = |action: &'static str, resource: Option<String>| {
634        specs.push(QueryControlEventSpec {
635            kind: EventKind::SchemaDdl,
636            action,
637            resource,
638            fields: Vec::new(),
639        });
640    };
641    match expr {
642        QueryExpr::CreateTable(q) => {
643            schema("create_table", Some(format!("table:{}", q.name)));
644            if let Some(column) = &q.tenant_by {
645                specs.push(QueryControlEventSpec {
646                    kind: EventKind::TenantGovernance,
647                    action: "create_table_tenant_by",
648                    resource: Some(format!("table:{}", q.name)),
649                    fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
650                });
651            }
652        }
653        QueryExpr::CreateCollection(q) => {
654            schema("create_collection", Some(format!("collection:{}", q.name)));
655        }
656        QueryExpr::CreateVector(q) => schema("create_vector", Some(format!("vector:{}", q.name))),
657        QueryExpr::DropTable(q) => schema("drop_table", Some(format!("table:{}", q.name))),
658        QueryExpr::DropGraph(q) => schema("drop_graph", Some(format!("graph:{}", q.name))),
659        QueryExpr::DropVector(q) => schema("drop_vector", Some(format!("vector:{}", q.name))),
660        QueryExpr::DropDocument(q) => {
661            schema("drop_document", Some(format!("document:{}", q.name)));
662        }
663        QueryExpr::DropKv(q) => schema("drop_kv", Some(format!("kv:{}", q.name))),
664        QueryExpr::DropCollection(q) => {
665            schema("drop_collection", Some(format!("collection:{}", q.name)));
666        }
667        QueryExpr::Truncate(q) => schema("truncate", Some(format!("collection:{}", q.name))),
668        QueryExpr::AlterTable(q) => {
669            schema("alter_table", Some(format!("table:{}", q.name)));
670            for op in &q.operations {
671                match op {
672                    crate::storage::query::ast::AlterOperation::EnableRowLevelSecurity => {
673                        specs.push(QueryControlEventSpec {
674                            kind: EventKind::RlsGovernance,
675                            action: "enable_rls",
676                            resource: Some(format!("table:{}", q.name)),
677                            fields: Vec::new(),
678                        });
679                    }
680                    crate::storage::query::ast::AlterOperation::DisableRowLevelSecurity => {
681                        specs.push(QueryControlEventSpec {
682                            kind: EventKind::RlsGovernance,
683                            action: "disable_rls",
684                            resource: Some(format!("table:{}", q.name)),
685                            fields: Vec::new(),
686                        });
687                    }
688                    crate::storage::query::ast::AlterOperation::EnableTenancy { column } => {
689                        specs.push(QueryControlEventSpec {
690                            kind: EventKind::TenantGovernance,
691                            action: "enable_tenancy",
692                            resource: Some(format!("table:{}", q.name)),
693                            fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
694                        });
695                    }
696                    crate::storage::query::ast::AlterOperation::DisableTenancy => {
697                        specs.push(QueryControlEventSpec {
698                            kind: EventKind::TenantGovernance,
699                            action: "disable_tenancy",
700                            resource: Some(format!("table:{}", q.name)),
701                            fields: Vec::new(),
702                        });
703                    }
704                    _ => {}
705                }
706            }
707        }
708        QueryExpr::CreateIndex(q) => {
709            schema(
710                "create_index",
711                Some(format!("index:{}:{}", q.table, q.name)),
712            );
713        }
714        QueryExpr::DropIndex(q) => {
715            schema("drop_index", Some(format!("index:{}:{}", q.table, q.name)));
716        }
717        QueryExpr::CreateTimeSeries(q) => {
718            schema("create_timeseries", Some(format!("timeseries:{}", q.name)));
719        }
720        QueryExpr::CreateMetric(q) => {
721            schema("create_metric", Some(format!("metric:{}", q.path)));
722        }
723        QueryExpr::AlterMetric(q) => {
724            schema("alter_metric", Some(format!("metric:{}", q.path)));
725        }
726        QueryExpr::CreateSlo(q) => {
727            schema("create_slo", Some(format!("slo:{}", q.path)));
728        }
729        QueryExpr::DropTimeSeries(q) => {
730            schema("drop_timeseries", Some(format!("timeseries:{}", q.name)));
731        }
732        QueryExpr::CreateQueue(q) => schema("create_queue", Some(format!("queue:{}", q.name))),
733        QueryExpr::AlterQueue(q) => schema("alter_queue", Some(format!("queue:{}", q.name))),
734        QueryExpr::DropQueue(q) => schema("drop_queue", Some(format!("queue:{}", q.name))),
735        QueryExpr::CreateTree(q) => {
736            schema(
737                "create_tree",
738                Some(format!("tree:{}:{}", q.collection, q.name)),
739            );
740        }
741        QueryExpr::DropTree(q) => {
742            schema(
743                "drop_tree",
744                Some(format!("tree:{}:{}", q.collection, q.name)),
745            );
746        }
747        QueryExpr::CreateSchema(q) => schema("create_schema", Some(format!("schema:{}", q.name))),
748        QueryExpr::DropSchema(q) => schema("drop_schema", Some(format!("schema:{}", q.name))),
749        QueryExpr::CreateSequence(q) => {
750            schema("create_sequence", Some(format!("sequence:{}", q.name)));
751        }
752        QueryExpr::DropSequence(q) => schema("drop_sequence", Some(format!("sequence:{}", q.name))),
753        QueryExpr::CreateView(q) => schema("create_view", Some(format!("view:{}", q.name))),
754        QueryExpr::DropView(q) => schema("drop_view", Some(format!("view:{}", q.name))),
755        QueryExpr::RefreshMaterializedView(q) => {
756            schema(
757                "refresh_materialized_view",
758                Some(format!("view:{}", q.name)),
759            );
760        }
761        QueryExpr::CreatePolicy(q) => {
762            specs.push(QueryControlEventSpec {
763                kind: EventKind::RlsGovernance,
764                action: "create_policy",
765                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
766                fields: vec![(
767                    "target_kind".to_string(),
768                    Sensitivity::raw(q.target_kind.as_ident()),
769                )],
770            });
771        }
772        QueryExpr::DropPolicy(q) => {
773            specs.push(QueryControlEventSpec {
774                kind: EventKind::RlsGovernance,
775                action: "drop_policy",
776                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
777                fields: Vec::new(),
778            });
779        }
780        QueryExpr::SetTenant(value) => {
781            let mut fields = Vec::new();
782            if let Some(value) = value {
783                fields.push(("tenant".to_string(), Sensitivity::raw(value)));
784            }
785            specs.push(QueryControlEventSpec {
786                kind: EventKind::TenantGovernance,
787                action: "set_tenant",
788                resource: Some("tenant:session".to_string()),
789                fields,
790            });
791        }
792        QueryExpr::SetConfig { key, .. } => {
793            specs.push(QueryControlEventSpec {
794                kind: EventKind::ConfigWrite,
795                action: "config:write",
796                resource: Some(format!("config:{key}")),
797                fields: vec![("key".to_string(), Sensitivity::raw(key))],
798            });
799        }
800        QueryExpr::ConfigCommand(cmd) => match cmd {
801            crate::storage::query::ast::ConfigCommand::Put {
802                collection, key, ..
803            }
804            | crate::storage::query::ast::ConfigCommand::Rotate {
805                collection, key, ..
806            } => {
807                let target = format!("{collection}/{key}");
808                specs.push(QueryControlEventSpec {
809                    kind: EventKind::ConfigWrite,
810                    action: "config:write",
811                    resource: Some(format!("config:{target}")),
812                    fields: vec![
813                        ("collection".to_string(), Sensitivity::raw(collection)),
814                        ("key".to_string(), Sensitivity::raw(key)),
815                    ],
816                });
817            }
818            crate::storage::query::ast::ConfigCommand::Delete { collection, key } => {
819                let target = format!("{collection}/{key}");
820                specs.push(QueryControlEventSpec {
821                    kind: EventKind::ConfigDelete,
822                    action: "config:write",
823                    resource: Some(format!("config:{target}")),
824                    fields: vec![
825                        ("collection".to_string(), Sensitivity::raw(collection)),
826                        ("key".to_string(), Sensitivity::raw(key)),
827                    ],
828                });
829            }
830            _ => {}
831        },
832        QueryExpr::AlterUser(stmt) => {
833            let disables = stmt.attributes.iter().any(|attr| {
834                matches!(
835                    attr,
836                    crate::storage::query::ast::AlterUserAttribute::Disable
837                )
838            });
839            specs.push(QueryControlEventSpec {
840                kind: if disables {
841                    EventKind::UserDisable
842                } else {
843                    EventKind::UserUpdate
844                },
845                action: "alter_user",
846                resource: Some(format!("user:{}", stmt.username)),
847                fields: Vec::new(),
848            });
849        }
850        QueryExpr::CreateUser(stmt) => {
851            specs.push(QueryControlEventSpec {
852                kind: EventKind::UserCreate,
853                action: "create_user",
854                resource: Some(format!("user:{}", stmt.username)),
855                fields: Vec::new(),
856            });
857        }
858        _ => {}
859    }
860    specs
861}
862
863pub(crate) fn control_event_outcome_for_error(
864    err: &RedDBError,
865) -> crate::runtime::control_events::Outcome {
866    match err {
867        RedDBError::ReadOnly(_) => crate::runtime::control_events::Outcome::Denied,
868        RedDBError::Query(msg)
869            if msg.contains("permission denied")
870                || msg.contains("cannot issue")
871                || msg.contains("lacks") =>
872        {
873            crate::runtime::control_events::Outcome::Denied
874        }
875        _ => crate::runtime::control_events::Outcome::Error,
876    }
877}
878
879/// Convert the rows produced by a materialized-view body into
880/// `UnifiedEntity` table rows targeting the backing collection.
881/// Issue #595 slice 9c — feeds `UnifiedStore::refresh_collection`.
882///
883/// Graph fragments and vector hits are ignored: a materialized view
884/// is a relational result set (SELECT-shaped); slices 11+ may extend
885/// this once we have a richer view body shape. Each row materialises
886/// the union of its schema-bound columns + overflow.
887fn view_records_to_entities(
888    table: &str,
889    records: &[crate::storage::query::unified::UnifiedRecord],
890) -> Vec<crate::storage::UnifiedEntity> {
891    use std::collections::HashMap;
892    let table_arc: std::sync::Arc<str> = std::sync::Arc::from(table);
893    let mut out = Vec::with_capacity(records.len());
894    for record in records {
895        let mut named: HashMap<String, crate::storage::schema::Value> = HashMap::new();
896        for (name, value) in record.iter_fields() {
897            named.insert(name.to_string(), value.clone());
898        }
899        let entity = crate::storage::UnifiedEntity::new(
900            crate::storage::EntityId::new(0),
901            crate::storage::EntityKind::TableRow {
902                table: std::sync::Arc::clone(&table_arc),
903                row_id: 0,
904            },
905            crate::storage::EntityData::Row(crate::storage::RowData {
906                columns: Vec::new(),
907                named: Some(named),
908                schema: None,
909            }),
910        );
911        out.push(entity);
912    }
913    out
914}
915
916fn system_keyed_collection_contract(
917    name: &str,
918    model: crate::catalog::CollectionModel,
919) -> crate::physical::CollectionContract {
920    let now = crate::utils::now_unix_millis() as u128;
921    crate::physical::CollectionContract {
922        name: name.to_string(),
923        declared_model: model,
924        schema_mode: crate::catalog::SchemaMode::Dynamic,
925        origin: crate::physical::ContractOrigin::Implicit,
926        version: 1,
927        created_at_unix_ms: now,
928        updated_at_unix_ms: now,
929        default_ttl_ms: None,
930        vector_dimension: None,
931        vector_metric: None,
932        context_index_fields: Vec::new(),
933        declared_columns: Vec::new(),
934        table_def: None,
935        timestamps_enabled: false,
936        context_index_enabled: false,
937        metrics_raw_retention_ms: None,
938        metrics_rollup_policies: Vec::new(),
939        metrics_tenant_identity: None,
940        metrics_namespace: None,
941        append_only: false,
942        subscriptions: Vec::new(),
943        analytics_config: Vec::new(),
944        session_key: None,
945        session_gap_ms: None,
946        retention_duration_ms: None,
947        analytical_storage: None,
948
949        ai_policy: None,
950    }
951}
952
953pub use super::execution_context::{
954    capture_current_snapshot, clear_current_auth_identity, clear_current_connection_id,
955    clear_current_snapshot, clear_current_tenant, current_auth_identity_for_audit,
956    current_connection_id, current_tenant, entity_visible_under_current_snapshot,
957    entity_visible_with_context, set_current_auth_identity, set_current_connection_id,
958    set_current_snapshot, set_current_tenant, snapshot_bundle, with_snapshot_bundle,
959    SnapshotBundle, SnapshotContext,
960};
961pub(crate) use super::execution_context::{
962    current_auth_identity, current_config_value, current_role_projected, current_scope_override,
963    current_secret_value, current_snapshot_requires_index_fallback, current_user_projected,
964    has_scope_override_active, parse_set_local_tenant, update_current_config_value,
965    update_current_secret_value, xids_visible_under_current_snapshot, ConfigSnapshotGuard,
966    CurrentSnapshotGuard, ScopeOverrideGuard, SecretStoreGuard, TxLocalTenantGuard,
967};
968
969fn table_row_index_fields(
970    entity: &crate::storage::unified::entity::UnifiedEntity,
971) -> Vec<(String, crate::storage::schema::Value)> {
972    let crate::storage::EntityData::Row(row) = &entity.data else {
973        return Vec::new();
974    };
975    if let Some(named) = &row.named {
976        return named
977            .iter()
978            .map(|(name, value)| (name.clone(), value.clone()))
979            .collect();
980    }
981    if let Some(schema) = &row.schema {
982        return schema
983            .iter()
984            .zip(row.columns.iter())
985            .map(|(name, value)| (name.clone(), value.clone()))
986            .collect();
987    }
988    Vec::new()
989}
990
991fn named_text(
992    named: &std::collections::HashMap<String, crate::storage::schema::Value>,
993    key: &str,
994) -> Option<String> {
995    match named.get(key) {
996        Some(crate::storage::schema::Value::Text(value)) => Some(value.to_string()),
997        _ => None,
998    }
999}
1000
1001fn named_bool(
1002    named: &std::collections::HashMap<String, crate::storage::schema::Value>,
1003    key: &str,
1004) -> Option<bool> {
1005    match named.get(key) {
1006        Some(crate::storage::schema::Value::Boolean(value)) => Some(*value),
1007        _ => None,
1008    }
1009}
1010
1011fn index_method_kind_as_str(method: super::index_store::IndexMethodKind) -> &'static str {
1012    match method {
1013        super::index_store::IndexMethodKind::Hash => "hash",
1014        super::index_store::IndexMethodKind::Bitmap => "bitmap",
1015        super::index_store::IndexMethodKind::Spatial => "spatial",
1016        super::index_store::IndexMethodKind::BTree => "btree",
1017    }
1018}
1019
1020fn index_method_kind_from_str(raw: &str) -> Option<super::index_store::IndexMethodKind> {
1021    match raw {
1022        "hash" => Some(super::index_store::IndexMethodKind::Hash),
1023        "bitmap" => Some(super::index_store::IndexMethodKind::Bitmap),
1024        "spatial" | "rtree" => Some(super::index_store::IndexMethodKind::Spatial),
1025        "btree" => Some(super::index_store::IndexMethodKind::BTree),
1026        _ => None,
1027    }
1028}
1029
1030fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
1031    runtime
1032        .inner
1033        .pool
1034        .lock()
1035        .unwrap_or_else(|poisoned| poisoned.into_inner())
1036}
1037
/// Whether `name` is one of the graph-analytics table-valued functions
/// recognized in FROM position. Both the graph-collection form and the
/// inline `nodes => / edges =>` form (issue #799) accept these names.
/// The comparison ignores ASCII case.
fn is_graph_tvf_name(name: &str) -> bool {
    const GRAPH_TVF_NAMES: [&str; 7] = [
        "components",
        "louvain",
        "degree_centrality",
        "shortest_path",
        "betweenness",
        "eigenvector",
        "pagerank",
    ];

    GRAPH_TVF_NAMES
        .iter()
        .any(|known| name.eq_ignore_ascii_case(known))
}
1050
1051/// Map a declared `WITH ANALYTICS` view to the concrete graph algorithm name
1052/// and named-argument list that [`RedDBRuntime::dispatch_graph_algorithm`]
1053/// consumes (issue #800). The `using` option selects the algorithm inside the
1054/// output family; unsupported algorithms and the options that do not apply to
1055/// the chosen algorithm are rejected so a view never silently ignores a
1056/// declared parameter.
1057fn analytics_view_algorithm(
1058    graph: &str,
1059    view: &crate::catalog::AnalyticsViewDescriptor,
1060) -> RedDBResult<(String, Vec<(String, f64)>)> {
1061    use crate::catalog::AnalyticsOutput;
1062
1063    let mut named_args: Vec<(String, f64)> = Vec::new();
1064    let algorithm = match view.output {
1065        AnalyticsOutput::Communities => {
1066            let algo = view.algorithm.as_deref().unwrap_or("louvain");
1067            if !algo.eq_ignore_ascii_case("louvain") {
1068                return Err(RedDBError::Query(format!(
1069                    "analytics output 'communities' on graph '{graph}' has unsupported algorithm '{algo}' (expected louvain)"
1070                )));
1071            }
1072            if let Some(resolution) = view.resolution {
1073                named_args.push(("resolution".to_string(), resolution));
1074            }
1075            "louvain".to_string()
1076        }
1077        AnalyticsOutput::Components => {
1078            if let Some(algo) = view.algorithm.as_deref() {
1079                if !algo.eq_ignore_ascii_case("components")
1080                    && !algo.eq_ignore_ascii_case("connected_components")
1081                {
1082                    return Err(RedDBError::Query(format!(
1083                        "analytics output 'components' on graph '{graph}' has unsupported algorithm '{algo}' (expected connected_components)"
1084                    )));
1085                }
1086            }
1087            "components".to_string()
1088        }
1089        AnalyticsOutput::Centrality => {
1090            let algo = view
1091                .algorithm
1092                .as_deref()
1093                .unwrap_or("pagerank")
1094                .to_ascii_lowercase();
1095            match algo.as_str() {
1096                "pagerank" => {
1097                    if let Some(max_iterations) = view.max_iterations {
1098                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1099                    }
1100                }
1101                "eigenvector" => {
1102                    if let Some(max_iterations) = view.max_iterations {
1103                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1104                    }
1105                    if let Some(tolerance) = view.tolerance {
1106                        named_args.push(("tolerance".to_string(), tolerance));
1107                    }
1108                }
1109                "betweenness" => {}
1110                other => {
1111                    return Err(RedDBError::Query(format!(
1112                        "analytics output 'centrality' on graph '{graph}' has unsupported algorithm '{other}' (expected pagerank, betweenness, or eigenvector)"
1113                    )));
1114                }
1115            }
1116            algo
1117        }
1118    };
1119    Ok((algorithm, named_args))
1120}
1121
1122/// Reject any named arguments for a TVF that accepts none.
1123fn reject_named_args(name: &str, named_args: &[(String, f64)]) -> RedDBResult<()> {
1124    if let Some((key, _)) = named_args.first() {
1125        return Err(RedDBError::Query(format!(
1126            "table function '{name}' has no named argument '{key}'"
1127        )));
1128    }
1129    Ok(())
1130}
1131
1132/// Resolve louvain's optional `resolution` named arg (γ, default 1.0). Any
1133/// other named key, or a non-finite / non-positive resolution, is rejected.
1134fn louvain_resolution(named_args: &[(String, f64)]) -> RedDBResult<f64> {
1135    let mut resolution = 1.0_f64;
1136    for (key, value) in named_args {
1137        if key.eq_ignore_ascii_case("resolution") {
1138            if !value.is_finite() || *value <= 0.0 {
1139                return Err(RedDBError::Query(format!(
1140                    "table function 'louvain' resolution must be > 0, got {value}"
1141                )));
1142            }
1143            resolution = *value;
1144        } else {
1145            return Err(RedDBError::Query(format!(
1146                "table function 'louvain' has no named argument '{key}' (expected 'resolution')"
1147            )));
1148        }
1149    }
1150    Ok(resolution)
1151}
1152
1153/// Undirected degree centrality over abstract inputs: each edge contributes
1154/// 1 to both of its endpoints. Returns `(node_id, degree)` deterministically
1155/// in ascending node-id order, so identical input always yields identical
1156/// rows.
1157fn abstract_degree_centrality(
1158    nodes: &[String],
1159    edges: &[(
1160        String,
1161        String,
1162        crate::storage::engine::graph_algorithms::Weight,
1163    )],
1164) -> Vec<(String, usize)> {
1165    let mut degree: std::collections::BTreeMap<String, usize> = std::collections::BTreeMap::new();
1166    for n in nodes {
1167        degree.entry(n.clone()).or_insert(0);
1168    }
1169    for (a, b, _w) in edges {
1170        *degree.entry(a.clone()).or_insert(0) += 1;
1171        *degree.entry(b.clone()).or_insert(0) += 1;
1172    }
1173    degree.into_iter().collect()
1174}
1175
1176/// Ordered column names for a materialized subquery result: the projection
1177/// columns when present, else the first record's field order.
1178fn ordered_result_columns(result: &crate::storage::query::unified::UnifiedResult) -> Vec<String> {
1179    if !result.columns.is_empty() {
1180        return result.columns.clone();
1181    }
1182    result
1183        .records
1184        .first()
1185        .map(|record| {
1186            record
1187                .column_names()
1188                .iter()
1189                .map(|column| column.to_string())
1190                .collect()
1191        })
1192        .unwrap_or_default()
1193}
1194
1195/// Canonical node-id string for a cell value, so the node universe (from the
1196/// `nodes` subquery) and the edge endpoints (from the `edges` subquery)
1197/// compare equal regardless of integer-vs-text typing. `Null` is not a node.
1198fn value_to_node_id(value: &crate::storage::schema::Value) -> Option<String> {
1199    use crate::storage::schema::Value;
1200    match value {
1201        Value::Null => None,
1202        Value::Text(s) => Some(s.to_string()),
1203        Value::Integer(n) => Some(n.to_string()),
1204        Value::UnsignedInteger(n) => Some(n.to_string()),
1205        Value::NodeRef(s) => Some(s.clone()),
1206        other => Some(other.to_string()),
1207    }
1208}
1209
1210/// Numeric edge weight from a cell value (the optional third `edges` column).
1211fn value_to_weight(value: &crate::storage::schema::Value) -> Option<f32> {
1212    use crate::storage::schema::Value;
1213    match value {
1214        Value::Float(f) => Some(*f as f32),
1215        Value::Integer(n) => Some(*n as f32),
1216        Value::UnsignedInteger(n) => Some(*n as f32),
1217        _ => None,
1218    }
1219}
1220
1221/// Build the node universe from a materialized `nodes` subquery result: the
1222/// first projected column of each row is the node id (issue #799). Zero rows
1223/// is a valid empty node set; a row set with no columns is a shape error.
1224fn inline_node_ids(
1225    name: &str,
1226    result: &crate::storage::query::unified::UnifiedResult,
1227) -> RedDBResult<Vec<String>> {
1228    if result.records.is_empty() {
1229        return Ok(Vec::new());
1230    }
1231    let columns = ordered_result_columns(result);
1232    let Some(first_col) = columns.first() else {
1233        return Err(RedDBError::Query(format!(
1234            "table function '{name}' inline form: `nodes` subquery must project at least one column (the node id)"
1235        )));
1236    };
1237    let mut ids = Vec::with_capacity(result.records.len());
1238    for record in &result.records {
1239        if let Some(id) = record.get(first_col).and_then(value_to_node_id) {
1240            ids.push(id);
1241        }
1242    }
1243    Ok(ids)
1244}
1245
1246/// Build the edge list from a materialized `edges` subquery result: the first
1247/// two projected columns are `(source, target)` and an optional third column
1248/// is the numeric weight (defaulting to 1.0). Fewer than two columns is a
1249/// shape error (issue #799).
1250fn inline_edges(
1251    name: &str,
1252    result: &crate::storage::query::unified::UnifiedResult,
1253) -> RedDBResult<
1254    Vec<(
1255        String,
1256        String,
1257        crate::storage::engine::graph_algorithms::Weight,
1258    )>,
1259> {
1260    if result.records.is_empty() {
1261        return Ok(Vec::new());
1262    }
1263    let columns = ordered_result_columns(result);
1264    if columns.len() < 2 {
1265        return Err(RedDBError::Query(format!(
1266            "table function '{name}' inline form: `edges` subquery must project at least two columns (source, target), got {}",
1267            columns.len()
1268        )));
1269    }
1270    let src_col = &columns[0];
1271    let dst_col = &columns[1];
1272    let weight_col = columns.get(2);
1273    let mut edges = Vec::with_capacity(result.records.len());
1274    for record in &result.records {
1275        let (Some(src), Some(dst)) = (
1276            record.get(src_col).and_then(value_to_node_id),
1277            record.get(dst_col).and_then(value_to_node_id),
1278        ) else {
1279            // A null/absent endpoint is not a valid edge; skip it.
1280            continue;
1281        };
1282        let weight = match weight_col {
1283            Some(col) => match record.get(col) {
1284                None | Some(crate::storage::schema::Value::Null) => 1.0,
1285                Some(value) => value_to_weight(value).ok_or_else(|| {
1286                    RedDBError::Query(format!(
1287                        "table function '{name}' inline form: `edges` weight column must be numeric"
1288                    ))
1289                })?,
1290            },
1291            None => 1.0,
1292        };
1293        edges.push((src, dst, weight));
1294    }
1295    Ok(edges)
1296}
1297
1298fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
1299    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
1300        return;
1301    }
1302    scopes.insert(name.to_string());
1303}
1304
1305fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
1306    match query.source.as_ref() {
1307        Some(crate::storage::query::ast::TableSource::Name(name)) => {
1308            cache_scope_insert(scopes, name)
1309        }
1310        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
1311            collect_query_expr_result_cache_scopes(scopes, subquery);
1312        }
1313        // Graph-collection TVFs (e.g. `louvain(g)`) read the graph store
1314        // read-only. The result is now cached (issue #802) and scoped to the
1315        // graph collection named in the first argument, so any mutation on
1316        // that collection (`INSERT INTO g NODE/EDGE …`) invalidates the
1317        // entry via `invalidate_result_cache_for_table`. Non-graph or
1318        // zero-arg functions contribute no scope.
1319        Some(crate::storage::query::ast::TableSource::Function { name, args, .. }) => {
1320            if is_graph_tvf_name(name) {
1321                if let Some(graph) = args.first() {
1322                    cache_scope_insert(scopes, graph);
1323                }
1324            }
1325        }
1326        // The inline-graph form reads ordinary tables/docs through its
1327        // `nodes`/`edges` subqueries, so its result cache must be scoped to
1328        // those source collections — mutating any of them invalidates the
1329        // cached result (issue #799).
1330        Some(crate::storage::query::ast::TableSource::InlineGraphFunction {
1331            nodes, edges, ..
1332        }) => {
1333            collect_query_expr_result_cache_scopes(scopes, nodes);
1334            collect_query_expr_result_cache_scopes(scopes, edges);
1335        }
1336        None => cache_scope_insert(scopes, &query.table),
1337    }
1338}
1339
1340fn collect_vector_source_scopes(
1341    scopes: &mut HashSet<String>,
1342    source: &crate::storage::query::ast::VectorSource,
1343) {
1344    match source {
1345        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
1346            cache_scope_insert(scopes, collection);
1347        }
1348        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
1349            collect_query_expr_result_cache_scopes(scopes, subquery);
1350        }
1351        crate::storage::query::ast::VectorSource::Literal(_)
1352        | crate::storage::query::ast::VectorSource::Text(_) => {}
1353    }
1354}
1355
1356fn collect_path_selector_scopes(
1357    scopes: &mut HashSet<String>,
1358    selector: &crate::storage::query::ast::NodeSelector,
1359) {
1360    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
1361        cache_scope_insert(scopes, table);
1362    }
1363}
1364
1365fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
1366    match expr {
1367        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
1368        QueryExpr::Join(query) => {
1369            collect_query_expr_result_cache_scopes(scopes, &query.left);
1370            collect_query_expr_result_cache_scopes(scopes, &query.right);
1371        }
1372        QueryExpr::Path(query) => {
1373            collect_path_selector_scopes(scopes, &query.from);
1374            collect_path_selector_scopes(scopes, &query.to);
1375        }
1376        QueryExpr::Vector(query) => {
1377            cache_scope_insert(scopes, &query.collection);
1378            collect_vector_source_scopes(scopes, &query.query_vector);
1379        }
1380        QueryExpr::Hybrid(query) => {
1381            collect_query_expr_result_cache_scopes(scopes, &query.structured);
1382            cache_scope_insert(scopes, &query.vector.collection);
1383            collect_vector_source_scopes(scopes, &query.vector.query_vector);
1384        }
1385        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
1386        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
1387        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
1388        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
1389        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
1390        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
1391        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
1392        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
1393        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
1394        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
1395        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
1396        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
1397        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
1398        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
1399        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
1400        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
1401        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1402        QueryExpr::CreateMetric(query) => cache_scope_insert(scopes, &query.path),
1403        QueryExpr::AlterMetric(query) => cache_scope_insert(scopes, &query.path),
1404        QueryExpr::CreateSlo(query) => cache_scope_insert(scopes, &query.path),
1405        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1406        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
1407        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
1408        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
1409        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
1410        QueryExpr::QueueCommand(query) => match query {
1411            QueueCommand::Push { queue, .. }
1412            | QueueCommand::Pop { queue, .. }
1413            | QueueCommand::Peek { queue, .. }
1414            | QueueCommand::Len { queue }
1415            | QueueCommand::Purge { queue }
1416            | QueueCommand::GroupCreate { queue, .. }
1417            | QueueCommand::GroupRead { queue, .. }
1418            | QueueCommand::Pending { queue, .. }
1419            | QueueCommand::Claim { queue, .. }
1420            | QueueCommand::Ack { queue, .. }
1421            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
1422            QueueCommand::Move {
1423                source,
1424                destination,
1425                ..
1426            } => {
1427                cache_scope_insert(scopes, source);
1428                cache_scope_insert(scopes, destination);
1429            }
1430        },
1431        QueryExpr::EventsBackfill(query) => {
1432            cache_scope_insert(scopes, &query.collection);
1433            cache_scope_insert(scopes, &query.target_queue);
1434        }
1435        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
1436        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
1437        QueryExpr::TreeCommand(query) => match query {
1438            TreeCommand::Insert { collection, .. }
1439            | TreeCommand::Move { collection, .. }
1440            | TreeCommand::Delete { collection, .. }
1441            | TreeCommand::Validate { collection, .. }
1442            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
1443        },
1444        QueryExpr::SearchCommand(query) => match query {
1445            SearchCommand::Similar { collection, .. }
1446            | SearchCommand::Hybrid { collection, .. }
1447            | SearchCommand::SpatialRadius { collection, .. }
1448            | SearchCommand::SpatialBbox { collection, .. }
1449            | SearchCommand::SpatialNearest { collection, .. } => {
1450                cache_scope_insert(scopes, collection);
1451            }
1452            SearchCommand::Text { collection, .. }
1453            | SearchCommand::Multimodal { collection, .. }
1454            | SearchCommand::Index { collection, .. }
1455            | SearchCommand::Context { collection, .. } => {
1456                if let Some(collection) = collection.as_deref() {
1457                    cache_scope_insert(scopes, collection);
1458                }
1459            }
1460        },
1461        QueryExpr::Ask(query) => {
1462            if let Some(collection) = query.collection.as_deref() {
1463                cache_scope_insert(scopes, collection);
1464            }
1465        }
1466        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
1467        QueryExpr::MaintenanceCommand(cmd) => match cmd {
1468            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
1469            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
1470                if let Some(t) = target {
1471                    cache_scope_insert(scopes, t);
1472                }
1473            }
1474        },
1475        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
1476        QueryExpr::CreateView(cmd) => {
1477            cache_scope_insert(scopes, &cmd.name);
1478            // Invalidating the view should also invalidate its dependencies.
1479            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
1480        }
1481        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
1482        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
1483        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1484        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1485        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
1486        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1487        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1488        QueryExpr::Graph(_)
1489        | QueryExpr::GraphCommand(_)
1490        | QueryExpr::ProbabilisticCommand(_)
1491        | QueryExpr::SetConfig { .. }
1492        | QueryExpr::ShowConfig { .. }
1493        | QueryExpr::SetSecret { .. }
1494        | QueryExpr::DeleteSecret { .. }
1495        | QueryExpr::ShowSecrets { .. }
1496        | QueryExpr::SetTenant(_)
1497        | QueryExpr::ShowTenant
1498        | QueryExpr::TransactionControl(_)
1499        | QueryExpr::CreateSchema(_)
1500        | QueryExpr::DropSchema(_)
1501        | QueryExpr::CreateSequence(_)
1502        | QueryExpr::DropSequence(_)
1503        | QueryExpr::Grant(_)
1504        | QueryExpr::Revoke(_)
1505        | QueryExpr::AlterUser(_)
1506        | QueryExpr::CreateUser(_)
1507        | QueryExpr::CreateIamPolicy { .. }
1508        | QueryExpr::DropIamPolicy { .. }
1509        | QueryExpr::AttachPolicy { .. }
1510        | QueryExpr::DetachPolicy { .. }
1511        | QueryExpr::ShowPolicies { .. }
1512        | QueryExpr::ShowEffectivePermissions { .. }
1513        | QueryExpr::RankOf(_)
1514        | QueryExpr::ApproxRankOf(_)
1515        | QueryExpr::RankRange(_)
1516        | QueryExpr::SimulatePolicy { .. }
1517        | QueryExpr::LintPolicy { .. }
1518        | QueryExpr::MigratePolicyMode { .. }
1519        | QueryExpr::CreateMigration(_)
1520        | QueryExpr::ApplyMigration(_)
1521        | QueryExpr::RollbackMigration(_)
1522        | QueryExpr::ExplainMigration(_)
1523        | QueryExpr::EventsBackfillStatus { .. } => {}
1524        QueryExpr::KvCommand(cmd) => {
1525            use crate::storage::query::ast::KvCommand;
1526            match cmd {
1527                KvCommand::Put { collection, .. }
1528                | KvCommand::InvalidateTags { collection, .. }
1529                | KvCommand::Get { collection, .. }
1530                | KvCommand::Unseal { collection, .. }
1531                | KvCommand::Rotate { collection, .. }
1532                | KvCommand::History { collection, .. }
1533                | KvCommand::List { collection, .. }
1534                | KvCommand::Purge { collection, .. }
1535                | KvCommand::Watch { collection, .. }
1536                | KvCommand::Delete { collection, .. }
1537                | KvCommand::Incr { collection, .. }
1538                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
1539            }
1540        }
1541        QueryExpr::ConfigCommand(cmd) => {
1542            use crate::storage::query::ast::ConfigCommand;
1543            match cmd {
1544                ConfigCommand::Put { collection, .. }
1545                | ConfigCommand::Get { collection, .. }
1546                | ConfigCommand::Resolve { collection, .. }
1547                | ConfigCommand::Rotate { collection, .. }
1548                | ConfigCommand::Delete { collection, .. }
1549                | ConfigCommand::History { collection, .. }
1550                | ConfigCommand::List { collection, .. }
1551                | ConfigCommand::Watch { collection, .. }
1552                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
1553                    cache_scope_insert(scopes, collection)
1554                }
1555            }
1556        }
1557    }
1558}
1559
1560/// Combine matching RLS policies for a table + action into a single
1561/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1562///
1563/// Returns `None` when RLS is disabled or no policy admits the caller's
1564/// role — callers use that to short-circuit the mutation (for DELETE /
1565/// UPDATE we simply skip the operation, which PG expresses as "no rows
1566/// match the policy + predicate combination").
1567pub(crate) fn rls_policy_filter(
1568    runtime: &RedDBRuntime,
1569    table: &str,
1570    action: crate::storage::query::ast::PolicyAction,
1571) -> Option<crate::storage::query::ast::Filter> {
1572    rls_policy_filter_for_kind(
1573        runtime,
1574        table,
1575        action,
1576        crate::storage::query::ast::PolicyTargetKind::Table,
1577    )
1578}
1579
1580/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1581/// Graph / vector / queue / timeseries scans pass the concrete kind;
1582/// policies targeting other kinds are ignored. Legacy Table-scoped
1583/// policies still apply cross-kind — callers register auto-tenancy
1584/// policies as Table today.
1585pub(crate) fn rls_policy_filter_for_kind(
1586    runtime: &RedDBRuntime,
1587    table: &str,
1588    action: crate::storage::query::ast::PolicyAction,
1589    kind: crate::storage::query::ast::PolicyTargetKind,
1590) -> Option<crate::storage::query::ast::Filter> {
1591    use crate::storage::query::ast::Filter;
1592
1593    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1594        return None;
1595    }
1596    let role = current_auth_identity().map(|(_, role)| role);
1597    let role_str = role.map(|r| r.as_str().to_string());
1598    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1599    if policies.is_empty() {
1600        return None;
1601    }
1602    policies
1603        .into_iter()
1604        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1605}
1606
1607/// Returns true when the table has RLS enforcement enabled. Convenience
1608/// shortcut so DML paths can gate the AND-combine work without reaching
1609/// into `runtime.inner.rls_enabled_tables` directly.
1610pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1611    runtime.inner.rls_enabled_tables.read().contains(table)
1612}
1613
1614/// Per-entity gate used by the graph materialiser for `GraphNode`
1615/// entities. RLS is checked against the source collection with
1616/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1617/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1618/// (for back-compat with auto-tenancy declarations). Cached per
1619/// collection so big graphs only resolve the policy chain once.
1620fn node_passes_rls(
1621    runtime: &RedDBRuntime,
1622    collection: &str,
1623    role: Option<&str>,
1624    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1625    entity: &crate::storage::unified::entity::UnifiedEntity,
1626) -> bool {
1627    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1628
1629    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1630        return true;
1631    }
1632    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1633        let policies = runtime.matching_rls_policies_for_kind(
1634            collection,
1635            role,
1636            PolicyAction::Select,
1637            PolicyTargetKind::Nodes,
1638        );
1639        if policies.is_empty() {
1640            None
1641        } else {
1642            policies
1643                .into_iter()
1644                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1645        }
1646    });
1647    let Some(filter) = filter else {
1648        return false;
1649    };
1650    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1651        Some(&runtime.inner.db),
1652        entity,
1653        filter,
1654        collection,
1655        collection,
1656    )
1657}
1658
1659/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1660/// `kind = Edges`.
1661fn edge_passes_rls(
1662    runtime: &RedDBRuntime,
1663    collection: &str,
1664    role: Option<&str>,
1665    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1666    entity: &crate::storage::unified::entity::UnifiedEntity,
1667) -> bool {
1668    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1669
1670    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1671        return true;
1672    }
1673    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1674        let policies = runtime.matching_rls_policies_for_kind(
1675            collection,
1676            role,
1677            PolicyAction::Select,
1678            PolicyTargetKind::Edges,
1679        );
1680        if policies.is_empty() {
1681            None
1682        } else {
1683            policies
1684                .into_iter()
1685                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1686        }
1687    });
1688    let Some(filter) = filter else {
1689        return false;
1690    };
1691    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1692        Some(&runtime.inner.db),
1693        entity,
1694        filter,
1695        collection,
1696        collection,
1697    )
1698}
1699
1700/// RLS policy injection (Phase 2.5.2 PG parity).
1701///
1702/// Fetch every matching policy for the current thread-local role and
1703/// fold them into the query's filter. Semantics mirror PostgreSQL:
1704///
1705/// * Multiple policies on the same table combine with **OR** — a row is
1706///   visible if *any* policy admits it.
1707/// * The combined policy predicate is **AND**-ed into the caller's
1708///   existing `WHERE` clause so explicit predicates continue to trim
1709///   the policy-allowed set.
1710/// * No matching policies + RLS enabled = zero rows (PG's
1711///   restrictive-default). Callers get `None` and return an empty
1712///   `UnifiedResult` without ever dispatching the scan.
1713///
1714/// This runs only when `RuntimeInner::rls_enabled_tables` already
1715/// contains the table name — callers gate the hot path upfront to
1716/// avoid the lock acquisition on tables without RLS.
1717///
1718/// Returns `None` when no policy admits the current role; returns
1719/// `Some(mutated_table)` with policy filters folded in otherwise.
1720fn inject_rls_filters(
1721    runtime: &RedDBRuntime,
1722    frame: &dyn super::statement_frame::ReadFrame,
1723    mut table: crate::storage::query::ast::TableQuery,
1724) -> Option<crate::storage::query::ast::TableQuery> {
1725    use crate::storage::query::ast::{Filter, PolicyAction};
1726
1727    // `None` role falls through to policies with no `TO role` clause.
1728    let role = frame.identity().map(|(_, role)| role);
1729    let role_str = role.map(|r| r.as_str().to_string());
1730    let policies =
1731        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1732
1733    if policies.is_empty() {
1734        // RLS enabled + no policy match = deny everything. Signal the
1735        // caller to short-circuit with an empty result set.
1736        return None;
1737    }
1738
1739    // Combine policy predicates with OR (PG's permissive default).
1740    let combined = policies
1741        .into_iter()
1742        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1743        .expect("policies non-empty");
1744
1745    // AND into the caller's existing predicate. The predicate may live
1746    // in `where_expr` rather than `filter`: `resolve_table_expr_subqueries`
1747    // nulls `filter` whenever `where_expr` is present (the case for a
1748    // view body rewritten into `SELECT … WHERE …`). Folding only into
1749    // `filter` here would silently drop that `where_expr` predicate at
1750    // eval time because `effective_table_filter` prefers `filter` —
1751    // e.g. `WITHIN TENANT … SELECT * FROM <view>` would apply the
1752    // tenant policy but lose the view's own WHERE (#635).
1753    use crate::storage::query::sql_lowering::{expr_to_filter, filter_to_expr};
1754    let had_where_expr = table.where_expr.is_some();
1755    let existing = table
1756        .filter
1757        .take()
1758        .or_else(|| table.where_expr.as_ref().map(expr_to_filter));
1759    let new_filter = match existing {
1760        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1761        None => combined,
1762    };
1763    // Keep `where_expr` in lock-step with the merged `filter` so
1764    // whichever the executor consults sees the full predicate.
1765    if had_where_expr {
1766        table.where_expr = Some(filter_to_expr(&new_filter));
1767    }
1768    table.filter = Some(new_filter);
1769    Some(table)
1770}
1771
1772/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1773/// predicate into the join's outer filter. Walking the merged record
1774/// at the join layer (rather than mutating the per-side scan filter)
1775/// keeps the planner's strategy choice and per-side index selection
1776/// undisturbed — the policy predicate uses the qualified `t.col` form
1777/// that resolves cleanly against the merged record's keys.
1778///
1779/// Returns `None` when any leaf has RLS enabled and no policy admits
1780/// the caller — the join short-circuits to an empty result.
1781fn inject_rls_into_join(
1782    runtime: &RedDBRuntime,
1783    frame: &dyn super::statement_frame::ReadFrame,
1784    mut join: crate::storage::query::ast::JoinQuery,
1785) -> Option<crate::storage::query::ast::JoinQuery> {
1786    use crate::storage::query::ast::Filter;
1787
1788    let mut policy_filters: Vec<Filter> = Vec::new();
1789    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1790        return None;
1791    }
1792    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1793        return None;
1794    }
1795
1796    if policy_filters.is_empty() {
1797        return Some(join);
1798    }
1799
1800    let combined = policy_filters
1801        .into_iter()
1802        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1803        .expect("policy_filters non-empty");
1804
1805    join.filter = Some(match join.filter.take() {
1806        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1807        None => combined,
1808    });
1809
1810    Some(join)
1811}
1812
1813/// For each `Table` leaf reachable through nested joins, append the
1814/// RLS-policy filter (combined with OR across that side's matching
1815/// policies) into `out`. Returns `false` when a side has RLS enabled
1816/// but no policy admits the caller — the join must short-circuit.
1817fn collect_join_side_policy(
1818    runtime: &RedDBRuntime,
1819    frame: &dyn super::statement_frame::ReadFrame,
1820    expr: &crate::storage::query::ast::QueryExpr,
1821    out: &mut Vec<crate::storage::query::ast::Filter>,
1822) -> bool {
1823    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1824    match expr {
1825        QueryExpr::Table(t) => {
1826            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1827                return true;
1828            }
1829            let role = frame.identity().map(|(_, role)| role);
1830            let role_str = role.map(|r| r.as_str().to_string());
1831            let policies =
1832                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1833            if policies.is_empty() {
1834                return false;
1835            }
1836            let combined = policies
1837                .into_iter()
1838                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1839                .expect("policies non-empty");
1840            out.push(combined);
1841            true
1842        }
1843        QueryExpr::Join(inner) => {
1844            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1845                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1846        }
1847        _ => true,
1848    }
1849}
1850
1851/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
1852///
1853/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
1854/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
1855/// materialises all rows. Projections are best-effort — when the query
1856/// lists explicit columns we keep only those; a `SELECT *` keeps every
1857/// wrapper-emitted field verbatim.
1858///
1859/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
1860/// the runtime will pass the compiled filter down instead of post-filtering.
1861fn apply_foreign_table_filters(
1862    records: Vec<crate::storage::query::unified::UnifiedRecord>,
1863    query: &crate::storage::query::ast::TableQuery,
1864) -> crate::storage::query::unified::UnifiedResult {
1865    use crate::storage::query::sql_lowering::{
1866        effective_table_filter, effective_table_projections,
1867    };
1868    use crate::storage::query::unified::UnifiedResult;
1869
1870    let filter = effective_table_filter(query);
1871    let projections = effective_table_projections(query);
1872
1873    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
1874    // match native-collection queries (same operators, same NULL handling).
1875    let mut filtered: Vec<_> = records
1876        .into_iter()
1877        .filter(|record| match &filter {
1878            Some(f) => {
1879                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
1880            }
1881            None => true,
1882        })
1883        .collect();
1884
1885    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
1886    if let Some(offset) = query.offset {
1887        let offset = offset as usize;
1888        if offset >= filtered.len() {
1889            filtered.clear();
1890        } else {
1891            filtered.drain(0..offset);
1892        }
1893    }
1894    if let Some(limit) = query.limit {
1895        filtered.truncate(limit as usize);
1896    }
1897
1898    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
1899    // the wrapper's column set; an explicit list trims to those names.
1900    let columns: Vec<String> = if projections.is_empty() {
1901        filtered
1902            .first()
1903            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
1904            .unwrap_or_default()
1905    } else {
1906        projections
1907            .iter()
1908            .map(super::join_filter::projection_name)
1909            .collect()
1910    };
1911
1912    let mut result = UnifiedResult::empty();
1913    result.columns = columns;
1914    result.records = filtered;
1915    result
1916}
1917
1918/// Collect every concrete table reference inside a `QueryExpr`.
1919///
1920/// Used by view bookkeeping (dependency tracking for materialised
1921/// invalidation) and any other rewriter that needs to know the base
1922/// tables a query pulls from. Does not descend into projections/filters;
1923/// only the `FROM` side.
1924pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1925    let mut scopes: HashSet<String> = HashSet::new();
1926    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1927    scopes.into_iter().collect()
1928}
1929
1930fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1931    let mut scopes = HashSet::new();
1932    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1933    scopes
1934}
1935
/// If `sql` starts with the keyword `EXPLAIN` (ASCII case-insensitive)
/// followed by a statement, return that inner statement with leading
/// whitespace removed; otherwise return `None`.
///
/// Two follow-up keywords are deliberately left to the normal SQL
/// parser and also yield `None`:
///
/// * `EXPLAIN ALTER FOR CREATE TABLE ...` — a separate schema-diff
///   command handled inside the parser.
/// * `EXPLAIN ASK ...` — an executable read path: it runs retrieval
///   and provider selection, then short-circuits before the LLM call.
///
/// A bare `EXPLAIN` with nothing after it also yields `None`.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    /// Split `text` at its first whitespace character into
    /// `(first word, remainder)`; the remainder is empty when `text`
    /// contains no whitespace.
    fn split_first_word(text: &str) -> (&str, &str) {
        match text.find(char::is_whitespace) {
            Some(boundary) => text.split_at(boundary),
            None => (text, ""),
        }
    }

    let (keyword, after_keyword) = split_first_word(sql.trim_start());
    if !keyword.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    let inner = after_keyword.trim_start();
    if inner.is_empty() {
        return None;
    }

    // Peek the next token — ALTER and ASK are deferred to the parser.
    let (next_word, _) = split_first_word(inner);
    let deferred = ["ALTER", "ASK"]
        .iter()
        .any(|kw| next_word.eq_ignore_ascii_case(kw));

    if deferred {
        None
    } else {
        Some(inner)
    }
}
1974
1975/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1976/// CTE-aware parse in `execute_query` without paying for a full
1977/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1978/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1979pub(super) fn has_with_prefix(sql: &str) -> bool {
1980    let trimmed = sql.trim_start();
1981    let head_end = trimmed
1982        .find(|c: char| c.is_whitespace() || c == '(')
1983        .unwrap_or(trimmed.len());
1984    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1985}
1986
1987/// If the query is a plain SELECT whose top-level `TableQuery`
1988/// carries an `AS OF` clause, return a typed spec that the runtime
1989/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1990/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1991/// back to the connection's regular MVCC snapshot. A cheap textual
1992/// prefilter skips the parse entirely when the source doesn't
1993/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
1994fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1995    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1996}
1997
1998/// Same as `peek_top_level_as_of` but also returns the table name
1999/// targeted by the AS OF clause (when the FROM clause names a
2000/// concrete table). `None` for the table slot means scalar SELECT
2001/// or a subquery source — callers treat those as "no enforcement".
2002pub(super) fn peek_top_level_as_of_with_table(
2003    sql: &str,
2004) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
2005    if !sql
2006        .as_bytes()
2007        .windows(5)
2008        .any(|w| w.eq_ignore_ascii_case(b"as of"))
2009    {
2010        return None;
2011    }
2012    let parsed = crate::storage::query::parser::parse(sql).ok()?;
2013    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
2014        return None;
2015    };
2016    let clause = table.as_of?;
2017    let table_name = if table.table.is_empty() || table.table == "any" {
2018        None
2019    } else {
2020        Some(table.table.clone())
2021    };
2022    let spec = match clause {
2023        crate::storage::query::ast::AsOfClause::Commit(h) => {
2024            crate::application::vcs::AsOfSpec::Commit(h)
2025        }
2026        crate::storage::query::ast::AsOfClause::Branch(b) => {
2027            crate::application::vcs::AsOfSpec::Branch(b)
2028        }
2029        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
2030        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
2031            crate::application::vcs::AsOfSpec::TimestampMs(ts)
2032        }
2033        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
2034            crate::application::vcs::AsOfSpec::Snapshot(x)
2035        }
2036    };
2037    Some((spec, table_name))
2038}
2039
2040pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
2041    // Lowercase the bytes up to the first null/newline into a small
2042    // stack buffer for cheap contains() checks. Most SQL fits in the
2043    // buffer; longer queries fall back to owned lowercase.
2044    const VOLATILE_TOKENS: &[&str] = &[
2045        "pg_advisory_lock",
2046        "pg_try_advisory_lock",
2047        "pg_advisory_unlock",
2048        "random()",
2049        // `$config.<path>` / `$secret.<path>` resolve mutable runtime config /
2050        // vault state at execution time (#1370). A cached result would serve a
2051        // stale value after a later `SET CONFIG` / `SET SECRET`, so treat any
2052        // query referencing them as volatile (never result-cache it).
2053        "$config",
2054        "$secret",
2055        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
2056        // omitted for now — they ARE volatile but today's tests rely
2057        // on caching them. Revisit once a tighter volatility story
2058        // lands.
2059    ];
2060    let lowered = sql.to_ascii_lowercase();
2061    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
2062}
2063
2064pub(super) fn query_is_ask_statement(sql: &str) -> bool {
2065    let trimmed = sql.trim_start();
2066    let head_end = trimmed
2067        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
2068        .unwrap_or(trimmed.len());
2069    trimmed[..head_end].eq_ignore_ascii_case("ASK")
2070}
2071
2072/// Pick the `(global_mode, collection_mode)` pair for an expression,
2073/// or `None` for variants that opt out of intent-locking entirely
2074/// (admin statements like `SHOW CONFIG`, transaction control, tenant
2075/// toggles).
2076///
2077/// Phase-1 contract:
2078/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
2079/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
2080/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
2081pub(super) fn intent_lock_modes_for(
2082    expr: &QueryExpr,
2083) -> Option<(
2084    crate::storage::transaction::lock::LockMode,
2085    crate::storage::transaction::lock::LockMode,
2086)> {
2087    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
2088
2089    match expr {
2090        // Reads — IS / IS.
2091        QueryExpr::Table(_)
2092        | QueryExpr::Join(_)
2093        | QueryExpr::Vector(_)
2094        | QueryExpr::Hybrid(_)
2095        | QueryExpr::Graph(_)
2096        | QueryExpr::Path(_)
2097        | QueryExpr::Ask(_)
2098        | QueryExpr::SearchCommand(_)
2099        | QueryExpr::GraphCommand(_)
2100        | QueryExpr::RankOf(_)
2101        | QueryExpr::ApproxRankOf(_)
2102        | QueryExpr::RankRange(_)
2103        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
2104
2105        // Writes — IX / IX. Non-tabular mutations (vector insert,
2106        // graph node insert, queue push, timeseries point insert)
2107        // don't carry their own dispatch arm here; they ride through
2108        // the Insert variant or a command variant covered by the
2109        // read-side arm above. P1.T4 expands only the TableQuery-ish
2110        // writes; non-tabular kinds inherit when their DML variants
2111        // land in later phases.
2112        QueryExpr::Insert(_)
2113        | QueryExpr::Update(_)
2114        | QueryExpr::Delete(_)
2115        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
2116            Some((IntentExclusive, IntentExclusive))
2117        }
2118        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
2119
2120        // DDL — IX / X. A DDL against collection `c` blocks all
2121        // other writers + readers on `c` but leaves other collections
2122        // running (because Global stays IX, not X).
2123        QueryExpr::CreateTable(_)
2124        | QueryExpr::CreateCollection(_)
2125        | QueryExpr::CreateVector(_)
2126        | QueryExpr::DropTable(_)
2127        | QueryExpr::DropGraph(_)
2128        | QueryExpr::DropVector(_)
2129        | QueryExpr::DropDocument(_)
2130        | QueryExpr::DropKv(_)
2131        | QueryExpr::DropCollection(_)
2132        | QueryExpr::Truncate(_)
2133        | QueryExpr::AlterTable(_)
2134        | QueryExpr::CreateIndex(_)
2135        | QueryExpr::DropIndex(_)
2136        | QueryExpr::CreateTimeSeries(_)
2137        | QueryExpr::CreateMetric(_)
2138        | QueryExpr::AlterMetric(_)
2139        | QueryExpr::CreateSlo(_)
2140        | QueryExpr::DropTimeSeries(_)
2141        | QueryExpr::CreateQueue(_)
2142        | QueryExpr::AlterQueue(_)
2143        | QueryExpr::DropQueue(_)
2144        | QueryExpr::CreateTree(_)
2145        | QueryExpr::DropTree(_)
2146        | QueryExpr::CreatePolicy(_)
2147        | QueryExpr::DropPolicy(_)
2148        | QueryExpr::CreateView(_)
2149        | QueryExpr::DropView(_)
2150        | QueryExpr::RefreshMaterializedView(_)
2151        | QueryExpr::CreateSchema(_)
2152        | QueryExpr::DropSchema(_)
2153        | QueryExpr::CreateSequence(_)
2154        | QueryExpr::DropSequence(_)
2155        | QueryExpr::CreateServer(_)
2156        | QueryExpr::DropServer(_)
2157        | QueryExpr::CreateForeignTable(_)
2158        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
2159
2160        // Admin / control — skip intent locks. `SET TENANT`,
2161        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
2162        // `VACUUM`, etc. don't touch collection data the same way
2163        // and the existing transaction layer already serialises the
2164        // pieces that matter.
2165        _ => None,
2166    }
2167}
2168
2169/// Best-effort collection inventory for an expression. Used to pick
2170/// `Collection(...)` resources for the intent-lock guard. Overshoots
2171/// are fine (take an extra IS, benign); undershoots leak writes past
2172/// DDL X locks, so err on the side of listing more names.
2173pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
2174    let mut out = Vec::new();
2175    walk_collections(expr, &mut out);
2176    out.sort();
2177    out.dedup();
2178    out
2179}
2180
2181fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
2182    match expr {
2183        QueryExpr::Table(t) => out.push(t.table.clone()),
2184        QueryExpr::Join(j) => {
2185            walk_collections(&j.left, out);
2186            walk_collections(&j.right, out);
2187        }
2188        QueryExpr::Insert(i) => out.push(i.table.clone()),
2189        QueryExpr::Update(u) => out.push(u.table.clone()),
2190        QueryExpr::Delete(d) => out.push(d.table.clone()),
2191        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
2192
2193        // DDL — include the target collection so DDL takes
2194        // `(Collection, X)` and blocks concurrent readers / writers
2195        // on the same collection. Other collections stay live
2196        // because Global is still IX.
2197        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
2198        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
2199        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
2200        QueryExpr::DropTable(q) => out.push(q.name.clone()),
2201        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
2202        QueryExpr::DropVector(q) => out.push(q.name.clone()),
2203        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
2204        QueryExpr::DropKv(q) => out.push(q.name.clone()),
2205        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
2206        QueryExpr::Truncate(q) => out.push(q.name.clone()),
2207        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
2208        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
2209        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
2210        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
2211        QueryExpr::CreateMetric(q) => out.push(q.path.clone()),
2212        QueryExpr::AlterMetric(q) => out.push(q.path.clone()),
2213        QueryExpr::CreateSlo(q) => out.push(q.path.clone()),
2214        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
2215        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
2216        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
2217        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
2218        QueryExpr::QueueCommand(QueueCommand::Move {
2219            source,
2220            destination,
2221            ..
2222        }) => {
2223            out.push(source.clone());
2224            out.push(destination.clone());
2225        }
2226        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
2227        QueryExpr::CreateView(q) => out.push(q.name.clone()),
2228        QueryExpr::DropView(q) => out.push(q.name.clone()),
2229        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
2230
2231        // Vector / Hybrid / Graph / Path / commands reference
2232        // collections through fields whose shape varies; without a
2233        // uniform accessor we fall back to the global lock only —
2234        // benign because every runtime path still holds the global
2235        // mode.
2236        _ => {}
2237    }
2238}
2239
2240impl RedDBRuntime {
2241    pub fn in_memory() -> RedDBResult<Self> {
2242        Self::with_options(RedDBOptions::in_memory())
2243    }
2244
2245    pub fn flush(&self) -> RedDBResult<()> {
2246        self.inner
2247            .db
2248            .flush()
2249            .map_err(|err| RedDBError::Internal(err.to_string()))
2250    }
2251
2252    /// Handle to the intent-lock manager for tests + introspection.
2253    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
2254    /// rather than touching the manager directly.
2255    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
2256        self.inner.lock_manager.clone()
2257    }
2258
2259    /// Process-local governance registry for managed policy/config guardrails.
2260    pub fn config_registry(&self) -> std::sync::Arc<crate::auth::registry::ConfigRegistry> {
2261        self.inner.config_registry.clone()
2262    }
2263
2264    pub fn query_audit(&self) -> std::sync::Arc<crate::runtime::query_audit::QueryAuditStream> {
2265        self.inner.query_audit.clone()
2266    }
2267
2268    pub fn control_events_require_persistence(&self) -> bool {
2269        self.inner.control_event_config.require_persistence()
2270    }
2271
2272    pub fn control_event_config(&self) -> crate::runtime::control_events::ControlEventConfig {
2273        self.inner.control_event_config
2274    }
2275
2276    pub fn control_event_ledger(
2277        &self,
2278    ) -> Arc<dyn crate::runtime::control_events::ControlEventLedger> {
2279        self.inner.control_event_ledger.read().clone()
2280    }
2281
2282    #[doc(hidden)]
2283    pub fn replace_control_event_ledger_for_tests(
2284        &self,
2285        ledger: Arc<dyn crate::runtime::control_events::ControlEventLedger>,
2286    ) {
2287        *self.inner.control_event_ledger.write() = ledger;
2288    }
2289
2290    #[inline(never)]
2291    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
2292        Self::with_pool(options, ConnectionPoolConfig::default())
2293    }
2294
2295    pub fn with_pool(
2296        options: RedDBOptions,
2297        pool_config: ConnectionPoolConfig,
2298    ) -> RedDBResult<Self> {
2299        // PLAN.md Phase 9.1 — capture wall-clock before storage
2300        // open so the cold-start phase markers can be backfilled
2301        // once Lifecycle is constructed below. Storage open
2302        // encapsulates auto-restore + WAL replay; we treat the
2303        // whole window as one combined "restore" + "wal_replay"
2304        // phase split at the same boundary because the storage
2305        // layer doesn't yet emit a finer signal.
2306        let boot_open_start_ms = std::time::SystemTime::now()
2307            .duration_since(std::time::UNIX_EPOCH)
2308            .map(|d| d.as_millis() as u64)
2309            .unwrap_or(0);
2310        let embedded_single_file = options.storage_profile.deploy_profile
2311            == crate::storage::DeployProfile::Embedded
2312            && options.storage_profile.packaging == crate::storage::StoragePackaging::SingleFile;
2313        let db = Arc::new(
2314            RedDB::open_with_options(&options)
2315                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2316        );
2317        let result_blob_cache_config = if embedded_single_file {
2318            crate::storage::cache::BlobCacheConfig::default()
2319        } else {
2320            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2321                reddb_file::layout::result_cache_l2_path(
2322                    &options.resolved_path(reddb_file::default_database_path()),
2323                ),
2324            )
2325        };
2326        let result_blob_cache =
2327            crate::storage::cache::BlobCache::open_with_l2(result_blob_cache_config).map_err(
2328                |err| RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}")),
2329            )?;
2330        let storage_ready_ms = std::time::SystemTime::now()
2331            .duration_since(std::time::UNIX_EPOCH)
2332            .map(|d| d.as_millis() as u64)
2333            .unwrap_or(0);
2334
2335        let runtime = Self {
2336            inner: Arc::new(RuntimeInner {
2337                db: db.clone(),
2338                layout: PhysicalLayout::from_options(&options),
2339                embedded_single_file,
2340                indices: IndexCatalog::register_default_vector_graph(
2341                    options.has_capability(crate::api::Capability::Table),
2342                    options.has_capability(crate::api::Capability::Graph),
2343                ),
2344                pool_config,
2345                pool: Mutex::new(PoolState::default()),
2346                started_at_unix_ms: SystemTime::now()
2347                    .duration_since(UNIX_EPOCH)
2348                    .unwrap_or_default()
2349                    .as_millis(),
2350                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2351                index_store: super::index_store::IndexStore::new(),
2352                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2353                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2354                query_cache: parking_lot::RwLock::new(
2355                    crate::storage::query::planner::cache::PlanCache::new(1000),
2356                ),
2357                result_cache: parking_lot::RwLock::new((
2358                    HashMap::new(),
2359                    std::collections::VecDeque::new(),
2360                )),
2361                result_blob_cache,
2362                result_blob_entries: parking_lot::RwLock::new((
2363                    HashMap::new(),
2364                    std::collections::VecDeque::new(),
2365                )),
2366                ask_answer_cache_entries: parking_lot::RwLock::new((
2367                    HashSet::new(),
2368                    std::collections::VecDeque::new(),
2369                )),
2370                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2371                result_cache_hits: std::sync::atomic::AtomicU64::new(0),
2372                result_cache_misses: std::sync::atomic::AtomicU64::new(0),
2373                result_cache_evictions: std::sync::atomic::AtomicU64::new(0),
2374                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2375                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2376                rmw_locks: RmwLockTable::new(),
2377                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2378                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2379                config_registry: Arc::new(crate::auth::registry::ConfigRegistry::new()),
2380                ec_worker: crate::ec::worker::EcWorker::new(),
2381                auth_store: parking_lot::RwLock::new(None),
2382                oauth_validator: parking_lot::RwLock::new(None),
2383                browser_token_authority: parking_lot::RwLock::new(None),
2384                views: parking_lot::RwLock::new(HashMap::new()),
2385                materialized_views: parking_lot::RwLock::new(
2386                    crate::storage::cache::result::MaterializedViewCache::new(),
2387                ),
2388                retention_sweeper: parking_lot::RwLock::new(
2389                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
2390                ),
2391                snapshot_manager: Arc::new(
2392                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2393                ),
2394                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2395                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2396                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2397                lock_manager: Arc::new({
2398                    // Sourced from the matrix: Tier B key
2399                    // `concurrency.locking.deadlock_timeout_ms`
2400                    // (default 5000). Env var wins at boot so
2401                    // operators can tune without touching red_config.
2402                    let env = crate::runtime::config_overlay::collect_env_overrides();
2403                    let timeout_ms = env
2404                        .get("concurrency.locking.deadlock_timeout_ms")
2405                        .and_then(|raw| raw.parse::<u64>().ok())
2406                        .unwrap_or_else(|| {
2407                            match crate::runtime::config_matrix::default_for(
2408                                "concurrency.locking.deadlock_timeout_ms",
2409                            ) {
2410                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2411                                _ => 5000,
2412                            }
2413                        });
2414                    let cfg = crate::storage::transaction::lock::LockConfig {
2415                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2416                        ..Default::default()
2417                    };
2418                    crate::storage::transaction::lock::LockManager::new(cfg)
2419                }),
2420                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2421                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2422                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2423                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2424                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2425                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2426                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2427                queue_wait_registry: std::sync::Arc::new(
2428                    crate::runtime::queue_wait_registry::QueueWaitRegistry::new(),
2429                ),
2430                pending_queue_wakes: parking_lot::RwLock::new(HashMap::new()),
2431                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2432                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2433                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2434                    &options,
2435                )),
2436                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2437                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2438                audit_log: {
2439                    // Default audit-log path for the in-memory case
2440                    // sits in the system temp dir; persistent runs
2441                    // place it next to the resolved data file.
2442                    //
2443                    // gh-471 iter 2: route through the resolved
2444                    // `LogDestination`. Performance/Max tiers emit a
2445                    // file-backed log destination under the file-owned
2446                    // support-directory logs tier;
2447                    // lower tiers / ephemeral runs report `Stderr`
2448                    // and we keep the legacy file-next-to-data sink.
2449                    // #1375 — single-file embedded mode keeps the data
2450                    // directory to exactly the `.rdb` artifact, so the audit
2451                    // log must NOT land as a sibling. Route it to a
2452                    // process-unique temp location even when a data path is
2453                    // set; only the non-embedded case uses the data dir.
2454                    let data_path = if embedded_single_file {
2455                        std::env::temp_dir()
2456                            .join("reddb-embedded-runtime")
2457                            .join(format!("audit-{}", std::process::id()))
2458                    } else {
2459                        options
2460                            .data_path
2461                            .clone()
2462                            .unwrap_or_else(|| std::env::temp_dir().join("reddb"))
2463                    };
2464                    let (audit_dest, _) = crate::api::tier_wiring::current_log_destinations();
2465                    if !matches!(audit_dest, crate::storage::layout::LogDestination::File(_))
2466                        && (embedded_single_file
2467                            || options
2468                                .metadata
2469                                .contains_key(crate::api::EPHEMERAL_RUNTIME_METADATA_KEY))
2470                    {
2471                        // The Stderr/Syslog lower-tier sink resolves to a
2472                        // `for_data_path` sibling that collides across concurrent
2473                        // temp-dir runtimes — nextest's process-per-test model
2474                        // truncates one shared file, flaking audit assertions.
2475                        // Pin a unique sibling for these short-lived ephemeral /
2476                        // single-file embedded runtimes. The file-owned support-
2477                        // dir tier (`File`) is already per-data unique, so leave
2478                        // it to `for_destination` (#1375: the embedded audit then
2479                        // still never lands a sibling next to the `.rdb`).
2480                        let audit_path = reddb_file::layout::sibling_path(
2481                            &data_path,
2482                            &reddb_file::layout::sidecar_file_name(&data_path, "audit.log"),
2483                        );
2484                        Arc::new(crate::runtime::audit_log::AuditLogger::with_path(
2485                            audit_path,
2486                        ))
2487                    } else {
2488                        Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
2489                            &audit_dest,
2490                            &data_path,
2491                        ))
2492                    }
2493                },
2494                control_event_ledger: parking_lot::RwLock::new(Arc::new(
2495                    crate::runtime::control_events::RuntimeLedger::new(db.store()),
2496                )),
2497                control_event_config: options.control_events,
2498                query_audit: Arc::new(crate::runtime::query_audit::QueryAuditStream::new(
2499                    db.store(),
2500                    options.query_audit.clone(),
2501                )),
2502                lease_lifecycle: std::sync::OnceLock::new(),
2503                replica_apply_metrics: std::sync::Arc::new(
2504                    crate::replication::logical::ReplicaApplyMetrics::default(),
2505                ),
2506                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2507                schema_vocabulary: parking_lot::RwLock::new(
2508                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2509                ),
2510                slow_query_logger: {
2511                    // Issue #205 — slow-query sink lives in the same
2512                    // directory the audit log uses, so backup/restore
2513                    // ships them together. Threshold + sample-pct
2514                    // default conservatively (1 s, 100% sampling) so
2515                    // emitted lines are rare and complete. Operators
2516                    // tune via env / config matrix in a follow-up.
2517                    //
2518                    // gh-471 iter 2: same routing as the audit log —
2519                    // `LogDestination::File(...)` for Performance/Max
2520                    // lands under the file-owned support-directory logs tier;
2521                    // lower tiers fall back to `red-slow.log` in the
2522                    // data directory.
2523                    // #1375 — see the audit-log note above: single-file mode
2524                    // never writes the slow-query log as a sibling of the
2525                    // `.rdb`. Route to a process-unique temp dir when embedded,
2526                    // regardless of the data path.
2527                    let fallback_dir = if embedded_single_file {
2528                        std::env::temp_dir()
2529                            .join("reddb-embedded-runtime")
2530                            .join(format!("slow-{}", std::process::id()))
2531                    } else {
2532                        options
2533                            .data_path
2534                            .as_ref()
2535                            .and_then(|p| p.parent().map(std::path::PathBuf::from))
2536                            .unwrap_or_else(|| std::env::temp_dir().join("reddb"))
2537                    };
2538                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2539                        .ok()
2540                        .and_then(|s| s.parse::<u64>().ok())
2541                        .unwrap_or(1000);
2542                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2543                        .ok()
2544                        .and_then(|s| s.parse::<u8>().ok())
2545                        .unwrap_or(100);
2546                    let (_, slow_dest) = crate::api::tier_wiring::current_log_destinations();
2547                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
2548                        &slow_dest,
2549                        &fallback_dir,
2550                        threshold_ms,
2551                        sample_pct,
2552                    )
2553                },
2554                slow_query_store: crate::telemetry::slow_query_store::SlowQueryStore::new(
2555                    crate::telemetry::slow_query_store::DEFAULT_CAP,
2556                ),
2557                kv_stats: crate::runtime::KvStatsCounters::default(),
2558                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
2559                metrics_tenant_activity_stats:
2560                    crate::runtime::MetricsTenantActivityCounters::default(),
2561                queue_telemetry: Arc::new(
2562                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
2563                ),
2564                query_latency_telemetry: Arc::new(
2565                    crate::runtime::query_latency_telemetry::QueryLatencyTelemetry::default(),
2566                ),
2567                queue_presence: Arc::new(
2568                    crate::storage::queue::presence::ConsumerPresenceRegistry::new(),
2569                ),
2570                vector_introspection: Arc::new(
2571                    crate::storage::vector::introspection::VectorIntrospectionRegistry::new(),
2572                ),
2573                kv_tag_index: crate::runtime::KvTagIndex::default(),
2574                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
2575                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
2576                integrity_tombstones: parking_lot::Mutex::new(Vec::new()),
2577                integrity_tombstones_state: std::sync::atomic::AtomicU8::new(0),
2578            }),
2579        };
2580
2581        // Issue #205 — install the process-wide OperatorEvent sink so
2582        // emit sites buried in storage / replication / signal handlers
2583        // can record without threading an `&AuditLogger` through every
2584        // call stack. First registration wins; subsequent in-memory
2585        // runtimes (test harnesses) fall through to tracing+eprintln.
2586        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2587            &runtime.inner.audit_log,
2588        ));
2589
2590        // Issue #1238 — wire the slow-query telemetry substrate (ADR 0060).
2591        // The logger dual-writes: file sink (existing) + ring store (new).
2592        runtime
2593            .inner
2594            .slow_query_logger
2595            .attach_store(Arc::clone(&runtime.inner.slow_query_store));
2596
2597        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2598        // from the wall-clock captured before storage open. The
2599        // entire `RedDB::open_with_options` call covers both
2600        // auto-restore (when configured) and WAL replay. We
2601        // record both phases against the same boundary today;
2602        // a follow-up will split them once the storage layer
2603        // surfaces a finer-grained event.
2604        runtime
2605            .inner
2606            .lifecycle
2607            .set_restore_started_at_ms(boot_open_start_ms);
2608        runtime
2609            .inner
2610            .lifecycle
2611            .set_restore_ready_at_ms(storage_ready_ms);
2612        runtime
2613            .inner
2614            .lifecycle
2615            .set_wal_replay_started_at_ms(boot_open_start_ms);
2616        runtime
2617            .inner
2618            .lifecycle
2619            .set_wal_replay_ready_at_ms(storage_ready_ms);
2620
2621        let restored_cdc_lsn = runtime
2622            .inner
2623            .db
2624            .replication
2625            .as_ref()
2626            .map(|repl| {
2627                repl.logical_wal_spool
2628                    .as_ref()
2629                    .map(|spool| spool.current_lsn())
2630                    .unwrap_or(0)
2631            })
2632            .unwrap_or(0)
2633            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2634        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2635        runtime.rehydrate_snapshot_xid_floor();
2636        runtime
2637            .bootstrap_system_keyed_collections()
2638            .map_err(|err| RedDBError::Internal(format!("bootstrap system collections: {err}")))?;
2639        runtime.rehydrate_declared_column_schemas();
2640        runtime.rehydrate_runtime_index_registry()?;
2641        runtime
2642            .load_probabilistic_state()
2643            .map_err(|err| RedDBError::Internal(format!("load probabilistic state: {err}")))?;
2644
2645        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2646        // tables declared via `TENANT BY (col)` survive restart. Each
2647        // entry re-registers the auto-policy and flips RLS on again.
2648        runtime.rehydrate_tenant_tables();
2649        // Issue #593 slice 9a — replay persisted materialized-view
2650        // descriptors so `CREATE MATERIALIZED VIEW v AS …` survives a
2651        // restart. Runs after the system-keyed collections bootstrap
2652        // and before the API opens.
2653        runtime.rehydrate_materialized_view_descriptors();
2654        if let Some(repl) = &runtime.inner.db.replication {
2655            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2656        }
2657
2658        // Save system info to red_config on boot
2659        {
2660            let sys = SystemInfo::collect();
2661            runtime.inner.db.store().set_config_tree(
2662                "red.system",
2663                &crate::serde_json::json!({
2664                    "pid": sys.pid,
2665                    "cpu_cores": sys.cpu_cores,
2666                    "total_memory_bytes": sys.total_memory_bytes,
2667                    "available_memory_bytes": sys.available_memory_bytes,
2668                    "os": sys.os,
2669                    "arch": sys.arch,
2670                    "hostname": sys.hostname,
2671                    "started_at": SystemTime::now()
2672                        .duration_since(UNIX_EPOCH)
2673                        .unwrap_or_default()
2674                        .as_millis() as u64
2675                }),
2676            );
2677
2678            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2679            let store = runtime.inner.db.store();
2680            if store
2681                .get_collection("red_config")
2682                .map(|m| m.query_all(|_| true).len())
2683                .unwrap_or(0)
2684                <= 10
2685            {
2686                store.set_config_tree("red.ai", &crate::json!({
2687                    "default": crate::json!({
2688                        "provider": "openai",
2689                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2690                    }),
2691                    "max_embedding_inputs": 256,
2692                    "max_prompt_batch": 256,
2693                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2694                }));
2695                store.set_config_tree(
2696                    "red.server",
2697                    &crate::json!({
2698                        "max_scan_limit": 1000,
2699                        "max_body_size": 1048576,
2700                        "read_timeout_ms": 5000,
2701                        "write_timeout_ms": 5000
2702                    }),
2703                );
2704                store.set_config_tree(
2705                    "red.storage",
2706                    &crate::json!({
2707                        "page_size": 4096,
2708                        "page_cache_capacity": 100000,
2709                        "auto_checkpoint_pages": 1000,
2710                        "snapshot_retention": 16,
2711                        "verify_checksums": true,
2712                        "segment": crate::json!({
2713                            "max_entities": 100000,
2714                            "max_bytes": 268435456_u64,
2715                            "compression_level": 6
2716                        }),
2717                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2718                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2719                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2720                    }),
2721                );
2722                store.set_config_tree(
2723                    "red.search",
2724                    &crate::json!({
2725                        "rag": crate::json!({
2726                            "max_chunks_per_source": 10,
2727                            "max_total_chunks": 25,
2728                            "similarity_threshold": 0.8,
2729                            "graph_depth": 2,
2730                            "min_relevance": 0.3
2731                        }),
2732                        "fusion": crate::json!({
2733                            "vector_weight": 0.5,
2734                            "graph_weight": 0.3,
2735                            "table_weight": 0.2,
2736                            "dedup_threshold": 0.85
2737                        })
2738                    }),
2739                );
2740                store.set_config_tree(
2741                    "red.auth",
2742                    &crate::json!({
2743                        "enabled": false,
2744                        "session_ttl_secs": 3600,
2745                        "require_auth": false
2746                    }),
2747                );
2748                store.set_config_tree(
2749                    "red.query",
2750                    &crate::json!({
2751                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2752                        "max_recursion_depth": 1000
2753                    }),
2754                );
2755                store.set_config_tree(
2756                    "red.indexes",
2757                    &crate::json!({
2758                        "auto_select": true,
2759                        "bloom_filter": crate::json!({
2760                            "enabled": true,
2761                            "false_positive_rate": 0.01,
2762                            "prune_on_scan": true
2763                        }),
2764                        "hash": crate::json!({ "enabled": true }),
2765                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2766                        "spatial": crate::json!({ "enabled": true })
2767                    }),
2768                );
2769                store.set_config_tree(
2770                    "red.memtable",
2771                    &crate::json!({
2772                        "enabled": true,
2773                        "max_bytes": 67108864_u64,
2774                        "flush_threshold": 0.75
2775                    }),
2776                );
2777                store.set_config_tree(
2778                    "red.probabilistic",
2779                    &crate::json!({
2780                        "hll_registers": 16384,
2781                        "sketch_default_width": 1000,
2782                        "sketch_default_depth": 5,
2783                        "filter_default_capacity": 100000
2784                    }),
2785                );
2786                store.set_config_tree(
2787                    "red.timeseries",
2788                    &crate::json!({
2789                        "default_chunk_size": 1024,
2790                        "compression": crate::json!({
2791                            "timestamps": "delta_of_delta",
2792                            "values": "gorilla_xor"
2793                        }),
2794                        "default_retention_days": 0
2795                    }),
2796                );
2797                store.set_config_tree(
2798                    "red.queue",
2799                    &crate::json!({
2800                        "default_max_size": 0,
2801                        "default_max_attempts": 3,
2802                        "visibility_timeout_ms": 30000,
2803                        "consumer_idle_timeout_ms": 60000
2804                    }),
2805                );
2806                store.set_config_tree(
2807                    "red.backup",
2808                    &crate::json!({
2809                        "enabled": false,
2810                        "interval_secs": 3600,
2811                        "retention_count": 24,
2812                        "upload": false,
2813                        "backend": "local"
2814                    }),
2815                );
2816                store.set_config_tree(
2817                    "red.wal",
2818                    &crate::json!({
2819                        "archive": crate::json!({
2820                            "enabled": false,
2821                            "retention_hours": 168,
2822                            "prefix": reddb_file::backup_wal_prefix("")
2823                        })
2824                    }),
2825                );
2826                store.set_config_tree(
2827                    "red.cdc",
2828                    &crate::json!({
2829                        "enabled": true,
2830                        "buffer_size": 100000
2831                    }),
2832                );
2833                store.set_config_tree(
2834                    "red.config.secret",
2835                    &crate::json!({
2836                        "auto_encrypt": true,
2837                        "auto_decrypt": true
2838                    }),
2839                );
2840            }
2841
2842            // Perf-parity config matrix: heal the Tier A (critical)
2843            // keys unconditionally on every boot. Idempotent — only
2844            // writes the default when the key is missing. Keeps
2845            // `SHOW CONFIG` showing every guarantee the operator has
2846            // (durability.mode, concurrency.locking.enabled, …) even
2847            // on long-running datadirs that predate the matrix.
2848            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2849            seed_storage_deploy_config(store.as_ref(), options.storage_profile);
2850
2851            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2852            // `storage.btree.lehman_yao` value from the matrix (env
2853            // > file > red_config > default) and publish it to the
2854            // storage layer's atomic so the B-tree read / split
2855            // paths can branch without re-reading the config on
2856            // every hot-path call.
2857            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2858            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2859            if lehman_yao {
2860                tracing::info!(
2861                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2862                );
2863            }
2864
2865            // Config file overlay — mounted `/etc/reddb/config.json`
2866            // (override path via REDDB_CONFIG_FILE). Writes keys with
2867            // write-if-absent semantics so a later user `SET CONFIG`
2868            // always wins. Missing file = silent no-op.
2869            let overlay_path = crate::runtime::config_overlay::config_file_path();
2870            let _ =
2871                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2872        }
2873
2874        // VCS ("Git for Data") — create the `red_*` metadata
2875        // collections on first boot. Idempotent: `get_or_create_collection`
2876        // is a no-op if the collection already exists.
2877        {
2878            let store = runtime.inner.db.store();
2879            for name in crate::application::vcs_collections::ALL {
2880                let _ = store.get_or_create_collection(*name);
2881            }
2882            // Seed VCS config namespace with sensible defaults on first
2883            // boot, matching the pattern used by red.ai / red.storage.
2884            store.set_config_tree(
2885                crate::application::vcs_collections::CONFIG_NAMESPACE,
2886                &crate::json!({
2887                    "default_branch": "main",
2888                    "author": crate::json!({
2889                        "name": "reddb",
2890                        "email": "reddb@localhost"
2891                    }),
2892                    "protected_branches": crate::json!(["main"]),
2893                    "closure": crate::json!({
2894                        "enabled": true,
2895                        "lazy": true
2896                    }),
2897                    "merge": crate::json!({
2898                        "default_strategy": "auto",
2899                        "fast_forward": true
2900                    })
2901                }),
2902            );
2903        }
2904
2905        // Migrations — create the `red_migrations` / `red_migration_deps`
2906        // system collections on first boot. Idempotent.
2907        {
2908            let store = runtime.inner.db.store();
2909            for name in crate::application::migration_collections::ALL {
2910                let _ = store.get_or_create_collection(*name);
2911            }
2912        }
2913
2914        // Topology graph (#803) — ensure the built-in `red.topology.cluster`
2915        // graph collection (declared WITH ANALYTICS) and its metadata sidecar
2916        // exist. Idempotent and survives restarts via the WAL-backed contract.
2917        let _ = crate::application::topology_collections::ensure(&runtime);
2918
2919        // #1369 — reserve a fixed internal-id floor so the first user-inserted
2920        // entity always receives a stable, documented `rid` (FIRST_USER_ENTITY_ID),
2921        // independent of how many internal collection-descriptor / config-default
2922        // entities the boot sequence seeded above. `register_entity_id` only ever
2923        // raises the allocator, so a database that already holds user data
2924        // (counter past the floor) is untouched; a freshly-seeded database jumps
2925        // straight to the floor.
2926        runtime
2927            .inner
2928            .db
2929            .store()
2930            .register_entity_id(crate::storage::EntityId::new(
2931                crate::storage::FIRST_USER_ENTITY_ID - 1,
2932            ));
2933
2934        // Start background maintenance thread (context index refresh +
2935        // session purge). Held by a WEAK reference to `RuntimeInner`
2936        // so dropping the last `RedDBRuntime` handle actually releases
2937        // the underlying Arc<Pager> (and its file lock). Polling at
2938        // 200ms means shutdown latency is bounded; the real 60-second
2939        // work cadence is tracked independently via a `last_work`
2940        // timestamp.
2941        //
2942        // The previous version captured `rt = runtime.clone()` by
2943        // strong reference and ran an unterminated `loop`, which held
2944        // Arc<RuntimeInner> forever — reopening a persistent database
2945        // in the same process failed with "Database is locked" because
2946        // the pager could never drop. See the regression test
2947        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2948        {
2949            let weak = Arc::downgrade(&runtime.inner);
2950            std::thread::Builder::new()
2951                .name("reddb-maintenance".into())
2952                .spawn(move || {
2953                    let tick = std::time::Duration::from_millis(200);
2954                    let work_interval = std::time::Duration::from_secs(60);
2955                    let mut last_work = std::time::Instant::now();
2956                    loop {
2957                        std::thread::sleep(tick);
2958                        let Some(inner) = weak.upgrade() else {
2959                            // All strong references dropped — the
2960                            // runtime is gone, exit cleanly.
2961                            break;
2962                        };
2963                        if last_work.elapsed() >= work_interval {
2964                            let _stats = inner.db.store().context_index().stats();
2965                            last_work = std::time::Instant::now();
2966                        }
2967                    }
2968                })
2969                .ok();
2970        }
2971
2972        // Start backup scheduler if enabled via red_config
2973        {
2974            let store = runtime.inner.db.store();
2975            let mut backup_enabled = false;
2976            let mut backup_interval = 3600u64;
2977
2978            if let Some(manager) = store.get_collection("red_config") {
2979                manager.for_each_entity(|entity| {
2980                    if let Some(row) = entity.data.as_row() {
2981                        let key = row.get_field("key").and_then(|v| match v {
2982                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2983                            _ => None,
2984                        });
2985                        let val = row.get_field("value");
2986                        if key == Some("red.config.backup.enabled") {
2987                            backup_enabled = match val {
2988                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2989                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2990                                _ => false,
2991                            };
2992                        } else if key == Some("red.config.backup.interval_secs") {
2993                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2994                                backup_interval = *n as u64;
2995                            }
2996                        }
2997                    }
2998                    true
2999                });
3000            }
3001
3002            if backup_enabled {
3003                runtime.inner.backup_scheduler.set_interval(backup_interval);
3004                let rt = runtime.clone();
3005                runtime
3006                    .inner
3007                    .backup_scheduler
3008                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
3009            }
3010        }
3011
3012        // Load EC registry from red_config and start worker
3013        {
3014            runtime
3015                .inner
3016                .ec_registry
3017                .load_from_config_store(runtime.inner.db.store().as_ref());
3018            if !runtime.inner.ec_registry.async_configs().is_empty() {
3019                runtime.inner.ec_worker.start(
3020                    Arc::clone(&runtime.inner.ec_registry),
3021                    Arc::clone(&runtime.inner.db.store()),
3022                );
3023            }
3024        }
3025
3026        if let crate::replication::ReplicationRole::Replica { primary_addr } =
3027            runtime.inner.db.options().replication.role.clone()
3028        {
3029            let rt = runtime.clone();
3030            std::thread::Builder::new()
3031                .name("reddb-replica".into())
3032                .spawn(move || rt.run_replica_loop(primary_addr))
3033                .ok();
3034        }
3035
3036        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
3037        // boot stage above has completed (WAL replay, restore-from-
3038        // remote, replica-loop spawn). Health probes flip from 503 to
3039        // 200 here; shutdown begins from this state.
3040        runtime.inner.lifecycle.mark_ready();
3041
3042        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
3043        // Low-priority background ticker that drains the cache's
3044        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
3045        // so the thread exits cleanly when the runtime drops (≤50ms
3046        // latency between drop and exit). Materialized views without
3047        // a `REFRESH EVERY` clause stay on the manual-refresh path
3048        // and are skipped by `claim_due_at`, so the loop is a no-op
3049        // when no scheduled views exist.
3050        {
3051            let weak_inner = Arc::downgrade(&runtime.inner);
3052            std::thread::Builder::new()
3053                .name("reddb-mv-scheduler".into())
3054                .spawn(move || loop {
3055                    std::thread::sleep(std::time::Duration::from_millis(50));
3056                    let Some(inner) = weak_inner.upgrade() else {
3057                        break;
3058                    };
3059                    let rt = RedDBRuntime { inner };
3060                    rt.refresh_due_materialized_views();
3061                })
3062                .ok();
3063        }
3064
3065        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
3066        // Low-priority ticker that physically reclaims rows whose
3067        // timestamp has fallen beyond the retention window. Holds a
3068        // `Weak<RuntimeInner>` so the thread exits within one tick of
3069        // the runtime drop (graceful shutdown leaves storage consistent
3070        // because each tick goes through the standard DELETE path —
3071        // there is no half-finished mutation state to clean up). The
3072        // tick interval is intentionally longer than the MV scheduler
3073        // (500ms) because retention is order-of-seconds at minimum.
3074        if !runtime.write_gate().is_read_only() {
3075            let weak_inner = Arc::downgrade(&runtime.inner);
3076            std::thread::Builder::new()
3077                .name("reddb-retention-sweeper".into())
3078                .spawn(move || loop {
3079                    std::thread::sleep(std::time::Duration::from_millis(500));
3080                    let Some(inner) = weak_inner.upgrade() else {
3081                        break;
3082                    };
3083                    let rt = RedDBRuntime { inner };
3084                    rt.sweep_retention_tick(
3085                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
3086                    );
3087                })
3088                .ok();
3089        }
3090
3091        Ok(runtime)
3092    }
3093
3094    fn rehydrate_snapshot_xid_floor(&self) {
3095        let store = self.inner.db.store();
3096        for collection in store.list_collections() {
3097            let Some(manager) = store.get_collection(&collection) else {
3098                continue;
3099            };
3100            for entity in manager.query_all(|_| true) {
3101                self.inner
3102                    .snapshot_manager
3103                    .observe_committed_xid(entity.xmin);
3104                self.inner
3105                    .snapshot_manager
3106                    .observe_committed_xid(entity.xmax);
3107            }
3108        }
3109    }
3110
3111    /// Provision an empty Table-shaped collection that backs a
3112    /// `CREATE MATERIALIZED VIEW v` (issue #594 slice 9b of #575).
3113    /// `SELECT FROM v` reads this collection directly; the rewriter is
3114    /// configured to skip materialized views so the body is no longer
3115    /// substituted. REFRESH still writes to the cache slot — wiring it
3116    /// into this backing collection is the job of slice 9c.
3117    ///
3118    /// Idempotent: re-running for the same name leaves the existing
3119    /// collection in place (mirrors `CREATE TABLE IF NOT EXISTS`
3120    /// semantics). This keeps `CREATE OR REPLACE MATERIALIZED VIEW v`
3121    /// cheap — the body change does not invalidate already-buffered
3122    /// rows. Until 9c lands the backing is always empty anyway.
3123    pub(crate) fn ensure_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3124        let store = self.inner.db.store();
3125        let mut changed = false;
3126        if store.get_collection(name).is_none() {
3127            store.get_or_create_collection(name);
3128            changed = true;
3129        }
3130        if self.inner.db.collection_contract(name).is_none() {
3131            self.inner
3132                .db
3133                .save_collection_contract(system_keyed_collection_contract(
3134                    name,
3135                    crate::catalog::CollectionModel::Table,
3136                ))
3137                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3138            changed = true;
3139        }
3140        if changed {
3141            self.inner
3142                .db
3143                .persist_metadata()
3144                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3145        }
3146        Ok(())
3147    }
3148
3149    /// Inverse of [`ensure_materialized_view_backing`] — drops the
3150    /// backing collection on `DROP MATERIALIZED VIEW v`. No-op when
3151    /// the collection was never created (e.g. a `DROP MATERIALIZED
3152    /// VIEW IF EXISTS v` against an unknown name).
3153    pub(crate) fn drop_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3154        let store = self.inner.db.store();
3155        if store.get_collection(name).is_none() {
3156            return Ok(());
3157        }
3158        store
3159            .drop_collection(name)
3160            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3161        // The contract may have been dropped already (DROP TABLE path)
3162        // — ignore "not found" errors by checking presence first.
3163        if self.inner.db.collection_contract(name).is_some() {
3164            self.inner
3165                .db
3166                .remove_collection_contract(name)
3167                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3168        }
3169        self.invalidate_result_cache();
3170        self.inner
3171            .db
3172            .persist_metadata()
3173            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3174        Ok(())
3175    }
3176
3177    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
3178        let mut changed = false;
3179        for (name, model) in [
3180            ("red.config", crate::catalog::CollectionModel::Config),
3181            ("red.vault", crate::catalog::CollectionModel::Vault),
3182            // Issue #593 — materialized-view catalog. One row per
3183            // `CREATE MATERIALIZED VIEW`; rehydrated at boot before
3184            // the API opens.
3185            (
3186                crate::runtime::continuous_materialized_view::CATALOG_COLLECTION,
3187                crate::catalog::CollectionModel::Config,
3188            ),
3189        ] {
3190            if self.inner.db.store().get_collection(name).is_none() {
3191                self.inner.db.store().get_or_create_collection(name);
3192                changed = true;
3193            }
3194            if self.inner.db.collection_contract(name).is_none() {
3195                self.inner
3196                    .db
3197                    .save_collection_contract(system_keyed_collection_contract(name, model))
3198                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
3199                changed = true;
3200            }
3201        }
3202        if changed {
3203            self.inner
3204                .db
3205                .persist_metadata()
3206                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3207        }
3208        Ok(())
3209    }
3210
3211    pub fn db(&self) -> Arc<RedDB> {
3212        Arc::clone(&self.inner.db)
3213    }
3214
3215    /// Direct access to the runtime's secondary-index store.
3216    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
3217    /// wire bulk) that need to push new rows through the per-index
3218    /// maintenance hook after `store.bulk_insert` returns.
3219    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
3220        &self.inner.index_store
3221    }
3222
3223    /// Apply a DDL event to the schema-vocabulary reverse index
3224    /// (issue #120). Called by DDL execution paths after the catalog
3225    /// mutation has succeeded so the index never holds entries for
3226    /// half-applied DDL.
3227    pub(crate) fn schema_vocabulary_apply(
3228        &self,
3229        event: crate::runtime::schema_vocabulary::DdlEvent,
3230    ) {
3231        self.inner.schema_vocabulary.write().on_ddl(event);
3232    }
3233
3234    /// Lookup `token` in the schema-vocabulary reverse index. Returns
3235    /// an owned `Vec<VocabHit>` because the underlying read lock
3236    /// cannot be borrowed across the call boundary; the slice from
3237    /// `SchemaVocabulary::lookup` is cloned per hit.
3238    pub fn schema_vocabulary_lookup(
3239        &self,
3240        token: &str,
3241    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
3242        self.inner.schema_vocabulary.read().lookup(token).to_vec()
3243    }
3244
3245    /// Inject an AuthStore into the runtime. Called by server boot
3246    /// after the vault has been bootstrapped, so that `Value::Secret`
3247    /// auto-encrypt/decrypt can reach the vault AES key.
3248    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
3249        *self.inner.auth_store.write() = Some(store);
3250    }
3251
3252    /// Snapshot the current AuthStore (if any). Used by the wire listener
3253    /// to validate bearer tokens issued via HTTP `/auth/login`.
3254    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
3255        self.inner.auth_store.read().clone()
3256    }
3257
3258    /// Read a vault KV secret from the configured AuthStore, if present.
3259    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
3260        self.inner
3261            .auth_store
3262            .read()
3263            .as_ref()
3264            .and_then(|store| store.vault_kv_get(key))
3265    }
3266
3267    /// Write a vault KV secret and fail if the encrypted vault write is
3268    /// unavailable or cannot be made durable.
3269    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
3270        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
3271            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
3272        })?;
3273        store
3274            .vault_kv_try_set(key, value)
3275            .map_err(|err| RedDBError::Query(err.to_string()))
3276    }
3277
3278    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
3279    /// wire transports try OAuth JWT validation before falling back to
3280    /// the local AuthStore lookup. Pass `None` to disable.
3281    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
3282        *self.inner.oauth_validator.write() = validator;
3283    }
3284
3285    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
3286    /// Hot path: called per HTTP request when an Authorization header
3287    /// is present, so we hand back a cheap Arc clone.
3288    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
3289        self.inner.oauth_validator.read().clone()
3290    }
3291
3292    /// Inject the browser-token authority (issue #936). When set, the
3293    /// RedWire WS handshake accepts the short-lived access JWT it mints
3294    /// (alongside, and tried before, the federated OAuth validator), and
3295    /// the `/auth/browser/*` HTTP endpoints can issue/rotate the pair.
3296    /// `None` leaves the browser credential flow inert.
3297    pub fn set_browser_token_authority(
3298        &self,
3299        authority: Option<Arc<crate::auth::browser_token::BrowserTokenAuthority>>,
3300    ) {
3301        *self.inner.browser_token_authority.write() = authority;
3302    }
3303
3304    /// Snapshot the browser-token authority, if wired. Read on the WS
3305    /// handshake path and by the `/auth/browser/*` handlers; a cheap Arc
3306    /// clone keeps the lock hold short.
3307    pub fn browser_token_authority(
3308        &self,
3309    ) -> Option<Arc<crate::auth::browser_token::BrowserTokenAuthority>> {
3310        self.inner.browser_token_authority.read().clone()
3311    }
3312
3313    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
3314    /// store is wired and a key has been generated. Used by the
3315    /// `Value::Secret` encrypt/decrypt pipeline.
3316    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
3317        let guard = self.inner.auth_store.read();
3318        guard.as_ref().and_then(|s| s.vault_secret_key())
3319    }
3320
3321    /// Resolve a boolean flag from `red_config`. Defaults to `default`
3322    /// when the key is missing or not coercible. If the same key has
3323    /// been written multiple times (SET CONFIG appends new rows), the
3324    /// most recent entity wins. Env-var overrides
3325    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
3326    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
3327        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3328            if let Some(crate::storage::schema::Value::Boolean(b)) =
3329                crate::runtime::config_overlay::coerce_env_value(key, raw)
3330            {
3331                return b;
3332            }
3333        }
3334        let store = self.inner.db.store();
3335        let Some(manager) = store.get_collection("red_config") else {
3336            return default;
3337        };
3338        let mut result = default;
3339        let mut latest_id: u64 = 0;
3340        manager.for_each_entity(|entity| {
3341            if let Some(row) = entity.data.as_row() {
3342                let entry_key = row.get_field("key").and_then(|v| match v {
3343                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3344                    _ => None,
3345                });
3346                if entry_key == Some(key) {
3347                    let id = entity.id.raw();
3348                    if id >= latest_id {
3349                        latest_id = id;
3350                        result = match row.get_field("value") {
3351                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
3352                            Some(crate::storage::schema::Value::Text(s)) => {
3353                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
3354                            }
3355                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
3356                            _ => default,
3357                        };
3358                    }
3359                }
3360            }
3361            true
3362        });
3363        result
3364    }
3365
3366    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
3367        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3368            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
3369                crate::runtime::config_overlay::coerce_env_value(key, raw)
3370            {
3371                return n;
3372            }
3373        }
3374        let store = self.inner.db.store();
3375        let Some(manager) = store.get_collection("red_config") else {
3376            return default;
3377        };
3378        let mut result = default;
3379        let mut latest_id: u64 = 0;
3380        manager.for_each_entity(|entity| {
3381            if let Some(row) = entity.data.as_row() {
3382                let entry_key = row.get_field("key").and_then(|v| match v {
3383                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3384                    _ => None,
3385                });
3386                if entry_key == Some(key) {
3387                    let id = entity.id.raw();
3388                    if id >= latest_id {
3389                        latest_id = id;
3390                        result = match row.get_field("value") {
3391                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
3392                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
3393                            Some(crate::storage::schema::Value::Text(s)) => {
3394                                s.parse::<u64>().unwrap_or(default)
3395                            }
3396                            _ => default,
3397                        };
3398                    }
3399                }
3400            }
3401            true
3402        });
3403        result
3404    }
3405
3406    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
3407        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3408            if let Ok(n) = raw.parse::<f64>() {
3409                return n;
3410            }
3411        }
3412        let store = self.inner.db.store();
3413        let Some(manager) = store.get_collection("red_config") else {
3414            return default;
3415        };
3416        let mut result = default;
3417        let mut latest_id: u64 = 0;
3418        manager.for_each_entity(|entity| {
3419            if let Some(row) = entity.data.as_row() {
3420                let entry_key = row.get_field("key").and_then(|v| match v {
3421                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3422                    _ => None,
3423                });
3424                if entry_key == Some(key) {
3425                    let id = entity.id.raw();
3426                    if id >= latest_id {
3427                        latest_id = id;
3428                        result = match row.get_field("value") {
3429                            Some(crate::storage::schema::Value::Float(n)) => *n,
3430                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
3431                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
3432                            Some(crate::storage::schema::Value::Text(s)) => {
3433                                s.parse::<f64>().unwrap_or(default)
3434                            }
3435                            _ => default,
3436                        };
3437                    }
3438                }
3439            }
3440            true
3441        });
3442        result
3443    }
3444
3445    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
3446        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3447            return raw.clone();
3448        }
3449        let store = self.inner.db.store();
3450        let Some(manager) = store.get_collection("red_config") else {
3451            return default.to_string();
3452        };
3453        let mut result = default.to_string();
3454        let mut latest_id: u64 = 0;
3455        manager.for_each_entity(|entity| {
3456            if let Some(row) = entity.data.as_row() {
3457                let entry_key = row.get_field("key").and_then(|v| match v {
3458                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3459                    _ => None,
3460                });
3461                if entry_key == Some(key) {
3462                    let id = entity.id.raw();
3463                    if id >= latest_id {
3464                        latest_id = id;
3465                        if let Some(crate::storage::schema::Value::Text(value)) =
3466                            row.get_field("value")
3467                        {
3468                            result = value.to_string();
3469                        }
3470                    }
3471                }
3472            }
3473            true
3474        });
3475        result
3476    }
3477
3478    /// Whether `SECRET('...')` literals should be encrypted with the
3479    /// vault AES key on INSERT. Default `true`.
3480    pub(crate) fn secret_auto_encrypt(&self) -> bool {
3481        self.config_bool("red.config.secret.auto_encrypt", true)
3482    }
3483
3484    /// Whether `Value::Secret` columns should be decrypted back to
3485    /// plaintext on SELECT when the vault is unsealed. Default `true`.
3486    /// Turning this off keeps secrets masked as `***` even while the
3487    /// vault is open — useful for audit trails or read-only exports.
3488    pub(crate) fn secret_auto_decrypt(&self) -> bool {
3489        self.config_bool("red.config.secret.auto_decrypt", true)
3490    }
3491
3492    /// Walk every record in `result` and swap `Value::Secret(bytes)`
3493    /// for the decrypted plaintext when the runtime has the vault
3494    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
3495    /// key is missing, the vault is sealed, or auto_decrypt is off,
3496    /// secrets are left as `Value::Secret` which every formatter
3497    /// (Display, JSON) already masks as `***`.
3498    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
3499        if !self.secret_auto_decrypt() {
3500            return;
3501        }
3502        let Some(key) = self.secret_aes_key() else {
3503            return;
3504        };
3505        for record in result.result.records.iter_mut() {
3506            for value in record.values_mut() {
3507                if let Value::Secret(ref bytes) = value {
3508                    if let Some(plain) =
3509                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
3510                    {
3511                        if let Ok(text) = String::from_utf8(plain) {
3512                            *value = Value::text(text);
3513                        }
3514                    }
3515                }
3516            }
3517        }
3518    }
3519
3520    /// Emit a CDC change event and replicate to WAL buffer.
3521    /// Create a `MutationEngine` bound to this runtime.
3522    ///
3523    /// The engine is cheap to construct (no allocation) and should be
3524    /// dropped after `apply` returns. Use this from application-layer
3525    /// `create_row` / `create_rows_batch` instead of calling
3526    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
3527    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
3528        crate::runtime::mutation::MutationEngine::new(self)
3529    }
3530
3531    /// Public-mutation gate snapshot (PLAN.md W1).
3532    ///
3533    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
3534    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
3535    /// maintenance, serverless lifecycle) call `check_write` before
3536    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
3537    /// instance running as a replica or with `options.read_only =
3538    /// true`. The replica internal logical-WAL apply path reaches into
3539    /// the store directly and never calls this method, so legitimate
3540    /// replica catch-up still works.
3541    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
3542        self.inner.write_gate.check(kind)
3543    }
3544
3545    /// Read-only handle to the gate, useful for transports that want
3546    /// to surface the policy in health/status output without taking on
3547    /// a dependency on the concrete enum.
3548    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
3549        &self.inner.write_gate
3550    }
3551
3552    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
3553    /// admin/shutdown, and signal handlers consult this single
3554    /// state machine.
3555    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
3556        &self.inner.lifecycle
3557    }
3558
3559    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
3560    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
3561        &self.inner.resource_limits
3562    }
3563
3564    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
3565    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
3566        &self.inner.audit_log
3567    }
3568
3569    /// Shared `Arc` to the audit logger — used by collaborators (the
3570    /// lease lifecycle, future request-context plumbing) that need to
3571    /// keep the logger alive past the runtime's stack frame.
3572    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
3573        Arc::clone(&self.inner.audit_log)
3574    }
3575
3576    pub(crate) fn emit_control_event(
3577        &self,
3578        kind: crate::runtime::control_events::EventKind,
3579        outcome: crate::runtime::control_events::Outcome,
3580        action: &'static str,
3581        resource: Option<String>,
3582        reason: Option<String>,
3583        extra_fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
3584    ) -> RedDBResult<()> {
3585        use crate::runtime::control_events::{
3586            ActorRef, ControlEvent, ControlEventCtx, ControlEventLedger, Sensitivity,
3587        };
3588
3589        let tenant = current_tenant();
3590        let principal = current_auth_identity();
3591        let actor_user = principal
3592            .as_ref()
3593            .map(|(principal, _)| UserId::from_parts(tenant.as_deref(), principal));
3594        let actor = actor_user
3595            .as_ref()
3596            .map(ActorRef::User)
3597            .unwrap_or(ActorRef::Anonymous);
3598        let ctx = ControlEventCtx {
3599            actor,
3600            scope: tenant
3601                .as_ref()
3602                .map(|scope| std::borrow::Cow::Borrowed(scope.as_str())),
3603            request_id: Some(std::borrow::Cow::Owned(format!(
3604                "conn-{}",
3605                current_connection_id()
3606            ))),
3607            trace_id: None,
3608        };
3609        let mut fields = std::collections::HashMap::new();
3610        fields.insert(
3611            "connection_id".to_string(),
3612            Sensitivity::raw(current_connection_id().to_string()),
3613        );
3614        if let Some((_, role)) = principal {
3615            fields.insert("actor_role".to_string(), Sensitivity::raw(role.as_str()));
3616        }
3617        for (key, value) in extra_fields {
3618            fields.insert(key, value);
3619        }
3620        let event = ControlEvent {
3621            kind,
3622            outcome,
3623            action: std::borrow::Cow::Borrowed(action),
3624            resource,
3625            reason,
3626            matched_policy_id: None,
3627            fields,
3628        };
3629        let ledger = self.inner.control_event_ledger.read();
3630        match ledger.emit(&ctx, event) {
3631            Ok(_) => Ok(()),
3632            Err(err) if self.inner.control_event_config.require_persistence() => {
3633                Err(RedDBError::Internal(err.to_string()))
3634            }
3635            Err(_) => Ok(()),
3636        }
3637    }
3638
3639    fn policy_mutation_control_ctx<'a>(
3640        &self,
3641        actor: &'a crate::auth::UserId,
3642        tenant: Option<&'a str>,
3643    ) -> crate::runtime::control_events::ControlEventCtx<'a> {
3644        crate::runtime::control_events::ControlEventCtx {
3645            actor: crate::runtime::control_events::ActorRef::User(actor),
3646            scope: tenant.map(std::borrow::Cow::Borrowed),
3647            request_id: Some(std::borrow::Cow::Owned(format!(
3648                "conn-{}",
3649                current_connection_id()
3650            ))),
3651            trace_id: None,
3652        }
3653    }
3654
3655    fn emit_query_audit(
3656        &self,
3657        query: &str,
3658        plan: &QueryAuditPlan,
3659        duration_ms: u64,
3660        result: &RuntimeQueryResult,
3661    ) {
3662        if !self.inner.query_audit.has_rules() {
3663            return;
3664        }
3665        let actor = current_auth_identity().map(|(principal, _)| principal);
3666        let tenant = current_tenant();
3667        let row_count = if result.statement_type == "select" {
3668            result.result.records.len() as u64
3669        } else {
3670            result.affected_rows
3671        };
3672        self.inner
3673            .query_audit
3674            .emit(crate::runtime::query_audit::QueryAuditEvent {
3675                actor,
3676                tenant,
3677                statement_kind: plan.statement_kind,
3678                touched_collections: plan.collections.clone(),
3679                duration_ms,
3680                row_count,
3681                request_id: Some(crate::crypto::uuid::Uuid::new_v7().to_string()),
3682                query_hash: Some(blake3::hash(query.as_bytes()).to_hex().to_string()),
3683            });
3684    }
3685
3686    /// Shared queue telemetry counters (delivered/acked/nacked).
3687    pub(crate) fn queue_telemetry(
3688        &self,
3689    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
3690        &self.inner.queue_telemetry
3691    }
3692
3693    /// Snapshots of the queue telemetry counters in label-deterministic
3694    /// order for `/metrics` rendering and the integration test.
3695    pub fn queue_telemetry_snapshot(
3696        &self,
3697    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3698        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3699            delivered: self.inner.queue_telemetry.delivered_snapshot(),
3700            acked: self.inner.queue_telemetry.acked_snapshot(),
3701            nacked: self.inner.queue_telemetry.nacked_snapshot(),
3702            wait_started: self.inner.queue_telemetry.wait_started_snapshot(),
3703            wait_woken: self.inner.queue_telemetry.wait_woken_snapshot(),
3704            wait_timed_out: self.inner.queue_telemetry.wait_timed_out_snapshot(),
3705            wait_cancelled: self.inner.queue_telemetry.wait_cancelled_snapshot(),
3706            wait_duration: self.inner.queue_telemetry.wait_duration_snapshot(),
3707        }
3708    }
3709
3710    /// Per-`kind` query latency histograms for `/metrics` (only kinds with
3711    /// a real sample are present — empty kinds are absent, not zero-filled).
3712    pub fn query_latency_snapshot(
3713        &self,
3714    ) -> Vec<crate::runtime::query_latency_telemetry::QueryLatencyHistogram> {
3715        self.inner.query_latency_telemetry.snapshot()
3716    }
3717
3718    /// Cross-kind query latency rollup for `/cluster/status` and the
3719    /// red-ui percentile panels. `count == 0` until a real sample exists.
3720    pub fn query_latency_rollup(
3721        &self,
3722    ) -> crate::runtime::query_latency_telemetry::QueryLatencyHistogram {
3723        self.inner.query_latency_telemetry.rollup()
3724    }
3725
3726    /// Issue #742 — consumer presence registry. Heartbeats land here
3727    /// from `QUEUE READ` (and, in a follow-up slice, an explicit
3728    /// `QUEUE HEARTBEAT` command); Red UI and `red.queue_consumers`
3729    /// read snapshots through `queue_consumer_presence_snapshot`.
3730    pub(crate) fn queue_presence(
3731        &self,
3732    ) -> &std::sync::Arc<crate::storage::queue::presence::ConsumerPresenceRegistry> {
3733        &self.inner.queue_presence
3734    }
3735
3736    /// Issue #742 — point-in-time presence snapshot, classifying each
3737    /// `(queue, group, consumer)` as active/stale/expired against the
3738    /// supplied TTL. Wall-clock is read once here so the lifecycle
3739    /// flags inside the snapshot are internally consistent.
3740    pub fn queue_consumer_presence_snapshot(
3741        &self,
3742        ttl_ms: u64,
3743    ) -> Vec<crate::storage::queue::presence::ConsumerPresence> {
3744        let now_ns = std::time::SystemTime::now()
3745            .duration_since(std::time::UNIX_EPOCH)
3746            .map(|d| d.as_nanos() as u64)
3747            .unwrap_or(0);
3748        self.inner.queue_presence.snapshot(now_ns, ttl_ms)
3749    }
3750
3751    /// Issue #742 — active-consumer count per `(queue, group)` for the
3752    /// queue-metadata surface. Stale/expired entries are excluded by
3753    /// definition; they are still visible in the per-row snapshot.
3754    pub fn queue_active_consumer_counts(
3755        &self,
3756        ttl_ms: u64,
3757    ) -> std::collections::HashMap<(String, String), u32> {
3758        let now_ns = std::time::SystemTime::now()
3759            .duration_since(std::time::UNIX_EPOCH)
3760            .map(|d| d.as_nanos() as u64)
3761            .unwrap_or(0);
3762        self.inner
3763            .queue_presence
3764            .count_active_by_group(now_ns, ttl_ms)
3765    }
3766
3767    /// Issue #743 — vector + TurboQuant introspection registry. Engine
3768    /// publish points (collection create, artifact build start /
3769    /// finish, fallback toggle, drop) update this; Red UI and
3770    /// `red.*` vector virtual tables read snapshots through
3771    /// `vector_introspection_snapshot` / `vector_introspection_get`.
3772    pub(crate) fn vector_introspection_registry(
3773        &self,
3774    ) -> &std::sync::Arc<crate::storage::vector::introspection::VectorIntrospectionRegistry> {
3775        &self.inner.vector_introspection
3776    }
3777
3778    /// Issue #743 — full snapshot of every tracked vector collection's
3779    /// `(VectorMetadata, ArtifactMetadata)`. Deterministically ordered
3780    /// by collection name so Red UI tables and tests both see a
3781    /// stable shape.
3782    pub fn vector_introspection_snapshot(
3783        &self,
3784    ) -> Vec<crate::storage::vector::introspection::VectorIntrospection> {
3785        self.inner.vector_introspection.snapshot()
3786    }
3787
3788    /// Issue #743 — single-collection lookup, for the per-collection
3789    /// metadata endpoint Red UI hits when an operator opens one
3790    /// vector's toolbar.
3791    pub fn vector_introspection_get(
3792        &self,
3793        collection: &str,
3794    ) -> Option<crate::storage::vector::introspection::VectorIntrospection> {
3795        self.inner.vector_introspection.get(collection)
3796    }
3797
3798    /// Issue #1238 — ADR 0060 read-model accessor for slow-query telemetry.
3799    ///
3800    /// Returns a reference to the bounded ring store so HTTP handlers and
3801    /// the red-ui read model can call `store.read(filter)` without
3802    /// touching `red-slow.log` directly.
3803    pub fn slow_query_store(&self) -> &Arc<crate::telemetry::slow_query_store::SlowQueryStore> {
3804        &self.inner.slow_query_store
3805    }
3806
3807    /// Slice 10 of issue #527 — render-time scan of pending entries
3808    /// per (queue, group) for the `queue_pending_gauge` exposition.
3809    /// Walks `red_queue_meta` live so the gauge cannot drift from
3810    /// the source of truth.
3811    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
3812        let store = self.inner.db.store();
3813        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
3814            .into_iter()
3815            .collect()
3816    }
3817
3818    /// Shared `Arc` to the write gate. Same rationale as
3819    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
3820    /// thread) need a clone-cheap handle they can move into a
3821    /// background thread.
3822    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
3823        Arc::clone(&self.inner.write_gate)
3824    }
3825
3826    /// Serverless writer-lease state machine. `None` when the operator
3827    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
3828    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3829        self.inner.lease_lifecycle.get()
3830    }
3831
3832    /// Install the lease lifecycle. Idempotent; subsequent calls
3833    /// return the previously stored value untouched.
3834    pub fn set_lease_lifecycle(
3835        &self,
3836        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
3837    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3838        self.inner.lease_lifecycle.set(lifecycle)
3839    }
3840
3841    /// Reject the call when the requested batch size exceeds
3842    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
3843    /// shaped so the HTTP layer can map it to 413 Payload Too
3844    /// Large (PLAN.md Phase 4.1).
3845    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
3846        if self.inner.resource_limits.batch_size_exceeded(requested) {
3847            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
3848            return Err(RedDBError::QuotaExceeded(format!(
3849                "max_batch_size:{requested}:{max}"
3850            )));
3851        }
3852        Ok(())
3853    }
3854
3855    /// Reject the call when the local DB file exceeds
3856    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
3857    /// the cost is a single `stat()` syscall, negligible against the
3858    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
3859    /// for HTTP 507 Insufficient Storage.
3860    pub fn check_db_size(&self) -> RedDBResult<()> {
3861        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
3862            return Ok(());
3863        };
3864        if limit == 0 {
3865            return Ok(());
3866        }
3867        let Some(path) = self.inner.db.path() else {
3868            return Ok(());
3869        };
3870        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
3871        if current > limit {
3872            return Err(RedDBError::QuotaExceeded(format!(
3873                "max_db_size_bytes:{current}:{limit}"
3874            )));
3875        }
3876        Ok(())
3877    }
3878
3879    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
3880    ///
3881    /// Steps, in order, all idempotent across re-entrant calls:
3882    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
3883    ///      observe `Stopped` after first finishes).
3884    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
3885    ///      every acked write is durable on disk.
3886    ///   3. If `backup_on_shutdown == true` and a remote backend is
3887    ///      configured, run a synchronous `trigger_backup()` so the
3888    ///      remote head reflects the final state.
3889    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
3890    ///      return the cached report without re-running anything.
3891    ///
3892    /// On any error, the runtime is still marked `Stopped` so the
3893    /// process can exit; the caller logs the error context but does
3894    /// not retry the same shutdown — the operator can inspect the
3895    /// report fields to see which step failed.
3896    pub fn graceful_shutdown(
3897        &self,
3898        backup_on_shutdown: bool,
3899    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
3900        if !self.inner.lifecycle.begin_shutdown() {
3901            // Someone else already shut down (or is in flight). Return
3902            // the cached report so the HTTP caller and SIGTERM handler
3903            // get the same idempotent answer.
3904            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
3905        }
3906
3907        let started_ms = std::time::SystemTime::now()
3908            .duration_since(std::time::UNIX_EPOCH)
3909            .map(|d| d.as_millis() as u64)
3910            .unwrap_or(0);
3911        let mut report = crate::runtime::lifecycle::ShutdownReport {
3912            started_at_ms: started_ms,
3913            ..Default::default()
3914        };
3915
3916        // Flush WAL + run any pending checkpoint. Local fsync is
3917        // unconditional — even a lease-lost replica needs its WAL on
3918        // disk before exit so a future restore has the latest tail.
3919        // The remote upload is gated separately so a lost-lease writer
3920        // doesn't clobber the new holder's state on its way out.
3921        let flush_res = self.inner.db.flush_local_only();
3922        report.flushed_wal = flush_res.is_ok();
3923        report.final_checkpoint = flush_res.is_ok();
3924        if let Err(err) = &flush_res {
3925            tracing::error!(
3926                target: "reddb::lifecycle",
3927                error = %err,
3928                "graceful_shutdown: local flush failed"
3929            );
3930        } else if let Err(lease_err) =
3931            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
3932        {
3933            tracing::warn!(
3934                target: "reddb::serverless::lease",
3935                error = %lease_err,
3936                "graceful_shutdown: remote upload skipped — lease not held"
3937            );
3938        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
3939            tracing::error!(
3940                target: "reddb::lifecycle",
3941                error = %err,
3942                "graceful_shutdown: remote upload failed"
3943            );
3944        }
3945
3946        // Optional final backup. Skipped silently when no remote
3947        // backend is configured — `trigger_backup()` returns Err
3948        // anyway in that case, but logging it as a shutdown failure
3949        // would be misleading on a standalone (no-backend) runtime.
3950        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
3951            // The trigger_backup gate now reads `WriteKind::Backup`,
3952            // which a replica/read_only instance refuses. That's
3953            // intentional — replicas don't drive backups; only the
3954            // primary does. We still want shutdown to flush its WAL
3955            // even if the backup branch is gated off.
3956            match self.trigger_backup() {
3957                Ok(result) => {
3958                    report.backup_uploaded = result.uploaded;
3959                }
3960                Err(err) => {
3961                    tracing::warn!(
3962                        target: "reddb::lifecycle",
3963                        error = %err,
3964                        "graceful_shutdown: final backup skipped"
3965                    );
3966                }
3967            }
3968        }
3969
3970        let completed_ms = std::time::SystemTime::now()
3971            .duration_since(std::time::UNIX_EPOCH)
3972            .map(|d| d.as_millis() as u64)
3973            .unwrap_or(started_ms);
3974        report.completed_at_ms = completed_ms;
3975        report.duration_ms = completed_ms.saturating_sub(started_ms);
3976
3977        self.inner.lifecycle.finish_shutdown(report.clone());
3978        Ok(report)
3979    }
3980
3981    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
3982    /// returned; `is_configured()` lets callers short-circuit.
3983    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
3984        &self.inner.quota_bucket
3985    }
3986
3987    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
3988    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
3989    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
3990    /// when the operator set the env but it doesn't parse, and
3991    /// `("disabled", None)` when no key is configured. The pager
3992    /// hookup is deferred — this accessor surfaces the operator's
3993    /// intent for /admin/status without yet using the key in writes.
3994    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
3995        match crate::crypto::page_encryption::key_from_env() {
3996            Ok(Some(_)) => ("enabled", None),
3997            Ok(None) => ("disabled", None),
3998            Err(err) => ("error", Some(err)),
3999        }
4000    }
4001
4002    /// PLAN.md Phase 11.5 — current replica apply health label
4003    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
4004    /// `stalled_gap`). Read from the persisted `red.replication.state`
4005    /// config key updated by the replica loop. Returns `None` on
4006    /// non-replica instances or when no apply has run yet.
4007    pub fn replica_apply_health(&self) -> Option<String> {
4008        let state = self.config_string("red.replication.state", "");
4009        if state.is_empty() {
4010            None
4011        } else {
4012            Some(state)
4013        }
4014    }
4015
4016    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
4017        let mut pool = self
4018            .inner
4019            .pool
4020            .lock()
4021            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
4022        if pool.active >= self.inner.pool_config.max_connections {
4023            return Err(RedDBError::Internal(
4024                "connection pool exhausted".to_string(),
4025            ));
4026        }
4027
4028        let id = if let Some(id) = pool.idle.pop() {
4029            id
4030        } else {
4031            let id = pool.next_id;
4032            pool.next_id += 1;
4033            id
4034        };
4035        pool.active += 1;
4036        pool.total_checkouts += 1;
4037        drop(pool);
4038
4039        Ok(RuntimeConnection {
4040            id,
4041            inner: Arc::clone(&self.inner),
4042        })
4043    }
4044
4045    pub fn checkpoint(&self) -> RedDBResult<()> {
4046        // Local fsync always allowed — losing the lease shouldn't
4047        // prevent us from durably persisting what's already in memory.
4048        // The remote upload is the side-effect that risks clobbering a
4049        // peer's state, so it's behind the lease gate.
4050        self.inner.db.flush_local_only().map_err(|err| {
4051            // Issue #205 — local flush failure is a CheckpointFailed
4052            // operator-grade event. The local-flush path also covers
4053            // the WAL fsync we depend on, so a failure here doubles as
4054            // the WalFsyncFailed signal for the runtime entry point.
4055            let msg = err.to_string();
4056            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
4057                lsn: 0,
4058                error: msg.clone(),
4059            }
4060            .emit_global();
4061            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
4062                path: "<flush_local_only>".to_string(),
4063                error: msg.clone(),
4064            }
4065            .emit_global();
4066            RedDBError::Engine(msg)
4067        })?;
4068        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
4069            tracing::warn!(
4070                target: "reddb::serverless::lease",
4071                error = %err,
4072                "checkpoint: skipping remote upload — lease not held"
4073            );
4074            return Ok(());
4075        }
4076        self.inner
4077            .db
4078            .upload_to_remote_backend()
4079            .map_err(|err| RedDBError::Engine(err.to_string()))
4080    }
4081
4082    /// Guard remote-mutating operations on the writer lease.
4083    /// Returns `Ok(())` when no remote backend is configured (the
4084    /// lease is irrelevant) or the lease state is `NotRequired` /
4085    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
4086    /// `NotHeld`, with an audit-friendly action label so the caller
4087    /// can record the rejection.
4088    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
4089        if self.inner.db.remote_backend.is_none() {
4090            return Ok(());
4091        }
4092        match self.inner.write_gate.lease_state() {
4093            crate::runtime::write_gate::LeaseGateState::NotHeld => {
4094                self.inner.audit_log.record(
4095                    action,
4096                    "system",
4097                    "remote_backend",
4098                    "err: writer lease not held",
4099                    crate::json::Value::Null,
4100                );
4101                Err(RedDBError::ReadOnly(format!(
4102                    "writer lease not held — {action} blocked (serverless fence)"
4103                )))
4104            }
4105            _ => Ok(()),
4106        }
4107    }
4108
4109    pub fn run_maintenance(&self) -> RedDBResult<()> {
4110        self.inner
4111            .db
4112            .run_maintenance()
4113            .map_err(|err| RedDBError::Internal(err.to_string()))
4114    }
4115
4116    pub fn scan_collection(
4117        &self,
4118        collection: &str,
4119        cursor: Option<ScanCursor>,
4120        limit: usize,
4121    ) -> RedDBResult<ScanPage> {
4122        let store = self.inner.db.store();
4123        let manager = store
4124            .get_collection(collection)
4125            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
4126
4127        let mut entities = manager.query_all(|_| true);
4128        entities.sort_by_key(|entity| entity.id.raw());
4129
4130        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
4131        let total = entities.len();
4132        let end = total.min(offset.saturating_add(limit.max(1)));
4133        let items = if offset >= total {
4134            Vec::new()
4135        } else {
4136            entities[offset..end].to_vec()
4137        };
4138        let next = (end < total).then_some(ScanCursor { offset: end });
4139
4140        Ok(ScanPage {
4141            collection: collection.to_string(),
4142            items,
4143            next,
4144            total,
4145        })
4146    }
4147
4148    pub fn catalog(&self) -> CatalogModelSnapshot {
4149        self.inner.db.catalog_model_snapshot()
4150    }
4151
4152    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
4153        self.inner.db.catalog_consistency_report()
4154    }
4155
4156    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
4157        crate::catalog::attention_summary(&self.catalog())
4158    }
4159
4160    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
4161        crate::catalog::collection_attention(&self.catalog())
4162    }
4163
4164    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
4165        crate::catalog::index_attention(&self.catalog())
4166    }
4167
4168    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
4169        crate::catalog::graph_projection_attention(&self.catalog())
4170    }
4171
4172    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
4173        crate::catalog::analytics_job_attention(&self.catalog())
4174    }
4175
4176    pub fn stats(&self) -> RuntimeStats {
4177        let pool = runtime_pool_lock(self);
4178        RuntimeStats {
4179            active_connections: pool.active,
4180            idle_connections: pool.idle.len(),
4181            total_checkouts: pool.total_checkouts,
4182            paged_mode: self.inner.db.is_paged(),
4183            started_at_unix_ms: self.inner.started_at_unix_ms,
4184            store: self.inner.db.stats(),
4185            system: SystemInfo::collect(),
4186            result_blob_cache: self.inner.result_blob_cache.stats(),
4187            kv: self.inner.kv_stats.snapshot(),
4188            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
4189        }
4190    }
4191
4192    pub(crate) fn record_metrics_ingest(
4193        &self,
4194        accepted_samples: u64,
4195        accepted_series: u64,
4196        rejected_samples: u64,
4197        rejected_series: u64,
4198    ) {
4199        self.inner.metrics_ingest_stats.record(
4200            accepted_samples,
4201            accepted_series,
4202            rejected_samples,
4203            rejected_series,
4204        );
4205    }
4206
4207    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
4208        self.inner
4209            .metrics_ingest_stats
4210            .record_cardinality_budget_rejections(rejected_series);
4211    }
4212
4213    pub(crate) fn record_metrics_tenant_activity(
4214        &self,
4215        tenant: &str,
4216        namespace: &str,
4217        operation: &str,
4218    ) {
4219        self.inner
4220            .metrics_tenant_activity_stats
4221            .record(tenant, namespace, operation);
4222    }
4223
4224    pub(crate) fn metrics_tenant_activity_snapshot(
4225        &self,
4226    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
4227        self.inner.metrics_tenant_activity_stats.snapshot()
4228    }
4229
4230    /// Execute a query under a typed scope override without embedding
4231    /// the tenant / user / role values into the SQL string. Use this
4232    /// from transport middleware (HTTP / gRPC / worker loops) where the
4233    /// scope is resolved from auth claims and the SQL is a parameterised
4234    /// template — avoids the string-concat injection risk of building
4235    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4236    /// prepared statements that didn't know about tenancy.
4237    ///
4238    /// Precedence matches the `WITHIN` clause: the passed `scope`
4239    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4240    /// The override is pushed on the thread-local scope stack for the
4241    /// duration of the call and popped on return — pool-shared
4242    /// connections cannot leak it across requests.
4243    pub fn execute_query_with_scope(
4244        &self,
4245        query: &str,
4246        scope: crate::runtime::within_clause::ScopeOverride,
4247    ) -> RedDBResult<RuntimeQueryResult> {
4248        if scope.is_empty() {
4249            return self.execute_query(query);
4250        }
4251        let _scope_guard = ScopeOverrideGuard::install(scope);
4252        self.execute_query(query)
4253    }
4254
4255    /// Issue #205 — single lifecycle exit for slow-query logging.
4256    ///
4257    /// `execute_query_inner` does the real work; this wrapper times it
4258    /// and, if elapsed exceeds the configured threshold, hands the
4259    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
4260    /// SlowQueryLogger. The threshold + sample_pct were captured at
4261    /// SlowQueryLogger construction (runtime startup), so the per-call
4262    /// cost on below-threshold paths is one relaxed atomic load.
4263    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4264        let started = std::time::Instant::now();
4265        let result = self.execute_query_inner(query);
4266        self.finish_query_lifecycle(query, started, result)
4267    }
4268
4269    /// Execute a SQL statement with already-decoded positional bind
4270    /// parameters. Transports should call this instead of parsing +
4271    /// binding on their side and then reaching for `execute_query_expr`:
4272    /// this entry keeps parameterized statements inside the same
4273    /// statement lifecycle as textual SQL (snapshot guard, config/secret
4274    /// guards, coarse auth, intent locks, slow-query logging, integrity
4275    /// tombstone filtering, and causal bookmarks).
4276    pub fn execute_query_with_params(
4277        &self,
4278        query: &str,
4279        params: &[Value],
4280    ) -> RedDBResult<RuntimeQueryResult> {
4281        if params.is_empty() {
4282            return self.execute_query(query);
4283        }
4284        let started = std::time::Instant::now();
4285        let result = self.execute_query_with_params_inner(query, params);
4286        self.finish_query_lifecycle(query, started, result)
4287    }
4288
4289    fn finish_query_lifecycle(
4290        &self,
4291        query: &str,
4292        started: std::time::Instant,
4293        mut result: RedDBResult<RuntimeQueryResult>,
4294    ) -> RedDBResult<RuntimeQueryResult> {
4295        // Issue #765 / S6 — filter integrity-tombstoned rows out of SELECT
4296        // results before they reach any consumer. Fast no-op (one relaxed
4297        // atomic load) unless an input-stream digest mismatch has tombstoned
4298        // a RID range on this store.
4299        if let Ok(ref mut query_result) = result {
4300            if query_result.statement_type == "select" {
4301                self.filter_integrity_tombstoned(&mut query_result.result);
4302            }
4303        }
4304        let elapsed_ms = started.elapsed().as_millis() as u64;
4305
4306        // Build EffectiveScope from the same thread-locals frame-build
4307        // consults — keeps the slow-log row consistent with the audit /
4308        // RLS view of "this statement". `ai_scope()` is the canonical
4309        // builder.
4310        let scope = self.ai_scope();
4311        let kind = match result
4312            .as_ref()
4313            .map(|r| r.statement_type)
4314            .unwrap_or("select")
4315        {
4316            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4317            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4318            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4319            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4320            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4321        };
4322        // SQL redaction: pass the raw query through. The slow-query
4323        // logger writes structured JSON so embedded literals stay
4324        // escape-safe at the JSON boundary (proven by
4325        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4326        // PII redaction (e.g. literal masking) is a follow-up.
4327        self.inner
4328            .slow_query_logger
4329            .record(kind, elapsed_ms, query.to_string(), &scope);
4330
4331        // Issue #1241 — record latency into the bounded per-`kind`
4332        // histogram substrate (always, not only above the slow-query
4333        // threshold). `started.elapsed()` is re-read here for sub-ms
4334        // resolution; the cost is one `Instant::now` plus a handful of
4335        // relaxed atomic adds (see `query_latency_telemetry` docs).
4336        self.inner
4337            .query_latency_telemetry
4338            .observe(kind, started.elapsed().as_secs_f64());
4339
4340        if let Ok(ref mut query_result) = result {
4341            if matches!(query_result.statement_type, "insert" | "update" | "delete") {
4342                let bookmark = crate::replication::CausalBookmark::new(
4343                    self.current_replication_term(),
4344                    self.cdc_current_lsn(),
4345                );
4346                query_result.bookmark = Some(bookmark.encode());
4347            }
4348        }
4349
4350        result
4351    }
4352
4353    fn execute_query_with_params_inner(
4354        &self,
4355        query: &str,
4356        params: &[Value],
4357    ) -> RedDBResult<RuntimeQueryResult> {
4358        let parsed = parse_multi(query).map_err(|err| RedDBError::Query(err.to_string()))?;
4359        let bound = crate::storage::query::user_params::bind(&parsed, params).map_err(|err| {
4360            RedDBError::Validation {
4361                message: err.to_string(),
4362                validation: crate::json!({
4363                    "code": "INVALID_PARAMS",
4364                    "surface": "query.params",
4365                }),
4366            }
4367        })?;
4368        self.execute_bound_query_expr_in_frame(query, bound)
4369    }
4370
4371    fn execute_bound_query_expr_in_frame(
4372        &self,
4373        query: &str,
4374        expr: QueryExpr,
4375    ) -> RedDBResult<RuntimeQueryResult> {
4376        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4377        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4378        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4379        let _frame_guards = frame.install(self);
4380        let _log_span = crate::telemetry::span::query_span(query).entered();
4381
4382        let expr = self.rewrite_view_refs(expr);
4383        let mode = detect_mode(execution_query);
4384        let control_event_specs = query_control_event_specs(&expr);
4385        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
4386            Ok(guard) => guard,
4387            Err(err) => {
4388                let outcome = control_event_outcome_for_error(&err);
4389                for spec in &control_event_specs {
4390                    self.emit_control_event(
4391                        spec.kind,
4392                        outcome,
4393                        spec.action,
4394                        spec.resource.clone(),
4395                        Some(err.to_string()),
4396                        spec.fields.clone(),
4397                    )?;
4398                }
4399                return Err(err);
4400            }
4401        };
4402
4403        let mut result = self.dispatch_expr(expr, query, mode)?;
4404        if result.statement_type == "select" {
4405            self.apply_secret_decryption(&mut result);
4406        }
4407        Ok(result)
4408    }
4409
4410    pub fn causal_session(&self) -> crate::runtime::CausalSession {
4411        crate::runtime::CausalSession {
4412            runtime: self.clone(),
4413            bookmark: None,
4414            wait_timeout: std::time::Duration::from_secs(5),
4415        }
4416    }
4417
4418    pub fn wait_for_bookmark(
4419        &self,
4420        bookmark: &crate::replication::CausalBookmark,
4421        timeout: std::time::Duration,
4422    ) -> RedDBResult<()> {
4423        let deadline = std::time::Instant::now() + timeout;
4424        loop {
4425            let applied_lsn = self.local_contiguous_applied_lsn();
4426            if applied_lsn >= bookmark.commit_lsn() {
4427                return Ok(());
4428            }
4429            let now = std::time::Instant::now();
4430            if now >= deadline {
4431                return Err(RedDBError::InvalidOperation(format!(
4432                    "timed out waiting for causal bookmark lsn {}; applied={}",
4433                    bookmark.commit_lsn(),
4434                    applied_lsn
4435                )));
4436            }
4437            let remaining = deadline.saturating_duration_since(now);
4438            std::thread::sleep(remaining.min(std::time::Duration::from_millis(5)));
4439        }
4440    }
4441
4442    fn local_contiguous_applied_lsn(&self) -> u64 {
4443        match self.inner.db.options().replication.role {
4444            crate::replication::ReplicationRole::Replica { .. } => {
4445                self.config_u64("red.replication.last_applied_lsn", 0)
4446            }
4447            _ => self.cdc_current_lsn(),
4448        }
4449    }
4450
4451    #[inline(never)]
4452    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4453        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4454        //
4455        // Moved above every boot-cost the normal path pays (WITHIN
4456        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
4457        // guard, tracing span, tx_contexts read) because the bench's
4458        // `select_point` scenario was observed at 28× vs PostgreSQL —
4459        // the dominant cost wasn't the entity fetch but the ceremony
4460        // before it. Only fires when there's no ambient transaction
4461        // context or WITHIN override, so the snapshot install we skip
4462        // truly is a no-op for this query.
4463        if !has_scope_override_active()
4464            && !query.trim_start().starts_with("WITHIN")
4465            && !query.trim_start().starts_with("within")
4466            && !self.inner.query_audit.has_rules()
4467            && !self
4468                .inner
4469                .tx_contexts
4470                .read()
4471                .contains_key(&current_connection_id())
4472        {
4473            if let Some(result) = self.try_fast_entity_lookup(query) {
4474                return result;
4475            }
4476        }
4477
4478        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4479        // strip the prefix, push a stack-scoped override, recurse on
4480        // the inner statement, pop on return. Stack lives in a
4481        // thread-local but is balanced by the RAII guard, so a
4482        // pool-shared connection cannot leak the override across
4483        // requests and an early `?` return still pops cleanly.
4484        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4485            Ok(Some((scope, inner))) => {
4486                let _scope_guard = ScopeOverrideGuard::install(scope);
4487                // Re-enter the inner path, NOT `execute_query`, so the
4488                // slow-query lifecycle hook records exactly one row per
4489                // top-level statement (the WITHIN-stripped form would
4490                // double-record).
4491                return self.execute_query_inner(inner);
4492            }
4493            Ok(None) => {}
4494            Err(msg) => return Err(RedDBError::Query(msg)),
4495        }
4496
4497        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4498        // inner statement (WITHOUT executing it) and returns the
4499        // CanonicalLogicalNode tree as rows so the caller can see the
4500        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4501        // is a distinct schema-diff command and continues down the
4502        // regular SQL path.
4503        if let Some(inner) = strip_explain_prefix(query) {
4504            return self.explain_as_rows(query, inner);
4505        }
4506
4507        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4508        // override and return. Outside a transaction the statement is
4509        // an error (matches PG semantics: SET LOCAL only takes effect
4510        // within an active transaction).
4511        if let Some(value) = parse_set_local_tenant(query)? {
4512            let conn_id = current_connection_id();
4513            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4514                return Err(RedDBError::Query(
4515                    "SET LOCAL TENANT requires an active transaction".to_string(),
4516                ));
4517            }
4518            self.inner
4519                .tx_local_tenants
4520                .write()
4521                .insert(conn_id, value.clone());
4522            return Ok(RuntimeQueryResult::ok_message(
4523                query.to_string(),
4524                &match &value {
4525                    Some(id) => format!("local tenant set: {id}"),
4526                    None => "local tenant cleared".to_string(),
4527                },
4528                "set_local_tenant",
4529            ));
4530        }
4531
4532        if super::red_schema::is_system_schema_write(query) {
4533            return Err(RedDBError::Query(
4534                super::red_schema::READ_ONLY_ERROR.to_string(),
4535            ));
4536        }
4537
4538        if let Some(create_source) = super::analytics_source_catalog::parse_create_statement(query)?
4539        {
4540            return self.execute_create_analytics_source(query, create_source);
4541        }
4542
4543        // Issue #790 — `READ METRIC <path>` is intentionally rejected at
4544        // v0. The descriptor itself is readable through
4545        // `red.analytics.metrics`; the *output* read returns a
4546        // structured error so callers can tell "execution engine not yet
4547        // built" apart from "metric does not exist".
4548        if let Some(path) = super::metric_descriptor_catalog::parse_read_metric_statement(query) {
4549            return Err(super::metric_descriptor_catalog::read_output_unsupported(
4550                &path,
4551            ));
4552        }
4553
4554        // Issue #918 / ADR 0035 — leaderboard rank capability catalog
4555        // declarations are still recognised before the general parser.
4556        // Rank reads themselves are parser AST nodes, including Redis-flavor
4557        // Z* sugar that desugars to the same canonical rank shapes.
4558        if let Some(parsed) = super::ranking_descriptor_catalog::parse_create_ranking(query) {
4559            return self.execute_create_ranking(query, parsed?);
4560        }
4561        if super::ranking_descriptor_catalog::parse_show_rankings(query) {
4562            return self.execute_show_rankings(query);
4563        }
4564
4565        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4566        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4567
4568        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4569        let _frame_guards = frame.install(self);
4570
4571        // Phase 6 logging: enter a span stamped with conn_id / tenant
4572        // / query_len. Every downstream tracing::info!/warn!/error!
4573        // inherits these fields — no need to thread them manually
4574        // through storage/scan layers. Entered AFTER the WITHIN /
4575        // SET LOCAL TENANT resolution above so the span reflects the
4576        // effective scope for this statement.
4577        let _log_span = crate::telemetry::span::query_span(query).entered();
4578
4579        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4580        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4581            return self.execute_query_expr(rewritten);
4582        }
4583
4584        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4585        if !self.inner.query_audit.has_rules() {
4586            if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4587                return result;
4588            }
4589        }
4590
4591        // ── Result cache: return cached result if still fresh (30s TTL) ──
4592        if !self.inner.query_audit.has_rules() {
4593            if let Some(result) = frame.read_result_cache(self) {
4594                return Ok(result);
4595            }
4596        }
4597
4598        let prepared = frame.prepare_statement(self, execution_query)?;
4599        let mode = prepared.mode;
4600        let expr = prepared.expr;
4601
4602        let statement = query_expr_name(&expr);
4603        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4604        let control_event_specs = query_control_event_specs(&expr);
4605        let query_audit_plan = query_audit_plan(&expr);
4606
4607        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
4608            Ok(guard) => guard,
4609            Err(err) => {
4610                let outcome = control_event_outcome_for_error(&err);
4611                for spec in &control_event_specs {
4612                    self.emit_control_event(
4613                        spec.kind,
4614                        outcome,
4615                        spec.action,
4616                        spec.resource.clone(),
4617                        Some(err.to_string()),
4618                        spec.fields.clone(),
4619                    )?;
4620                }
4621                return Err(err);
4622            }
4623        };
4624        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4625        let query_audit_started = std::time::Instant::now();
4626
4627        let query_result = match expr {
4628            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4629                // Apply MVCC visibility + RLS gate while materialising the
4630                // graph: every node entity is screened against the source
4631                // collection's policy chain (basic and `Nodes`-targeted)
4632                // and dropped when the caller's tenant / role doesn't
4633                // admit it. Edges are pruned automatically because the
4634                // graph builder skips edges whose endpoints aren't in
4635                // `allowed_nodes`.
4636                let (graph, node_properties, edge_properties) =
4637                    self.materialize_graph_with_rls()?;
4638                let result =
4639                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
4640                        &graph,
4641                        &expr,
4642                        node_properties,
4643                        edge_properties,
4644                    )
4645                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4646
4647                Ok(RuntimeQueryResult {
4648                    query: query.to_string(),
4649                    mode,
4650                    statement,
4651                    engine: "materialized-graph",
4652                    result,
4653                    affected_rows: 0,
4654                    statement_type: "select",
4655                    bookmark: None,
4656                })
4657            }
4658            QueryExpr::Table(table) => {
4659                let table = self.resolve_table_expr_subqueries(
4660                    table,
4661                    &frame as &dyn super::statement_frame::ReadFrame,
4662                )?;
4663                // Table-valued functions (e.g. components(g)) dispatch to a
4664                // read-only executor before any catalog/virtual-table routing
4665                // (issue #795).
4666                if let Some(TableSource::Function {
4667                    name,
4668                    args,
4669                    named_args,
4670                }) = table.source.clone()
4671                {
4672                    // The graph-collection form is cacheable (issue #802): the
4673                    // result-cache read at the top of this function keys on the
4674                    // query string, and `result_cache_scopes` carries the graph
4675                    // collection (see `collect_table_source_scopes`) so a write
4676                    // to it invalidates the entry. Deterministic algorithm
4677                    // output is worth caching at any row count, so the write
4678                    // bypasses the generic ≤5-row payload heuristic.
4679                    let tvf_result = RuntimeQueryResult {
4680                        query: query.to_string(),
4681                        mode,
4682                        statement,
4683                        engine: "runtime-graph-tvf",
4684                        result: self.execute_table_function(&name, &args, &named_args)?,
4685                        affected_rows: 0,
4686                        statement_type: "select",
4687                        bookmark: None,
4688                    };
4689                    frame.write_result_cache(self, &tvf_result, result_cache_scopes.clone());
4690                    return Ok(tvf_result);
4691                }
4692                // Inline-graph TVF (issue #799): the graph is supplied by two
4693                // subqueries instead of a collection reference. Unlike the
4694                // graph-collection form, the result IS cacheable — its cache
4695                // key is the query string (the result-cache read at the top of
4696                // `execute_query_inner` keys on it) and `result_cache_scopes`
4697                // already carries the `nodes`/`edges` source collections, so a
4698                // write to any of them invalidates the entry.
4699                if let Some(TableSource::InlineGraphFunction {
4700                    name,
4701                    nodes,
4702                    edges,
4703                    named_args,
4704                }) = table.source.clone()
4705                {
4706                    let inline_result = RuntimeQueryResult {
4707                        query: query.to_string(),
4708                        mode,
4709                        statement,
4710                        engine: "runtime-graph-tvf-inline",
4711                        result: self.execute_inline_graph_function(
4712                            &name,
4713                            &nodes,
4714                            &edges,
4715                            &named_args,
4716                        )?,
4717                        affected_rows: 0,
4718                        statement_type: "select",
4719                        bookmark: None,
4720                    };
4721                    frame.write_result_cache(self, &inline_result, result_cache_scopes);
4722                    return Ok(inline_result);
4723                }
4724                if super::red_schema::is_virtual_table(&table.table) {
4725                    return Ok(RuntimeQueryResult {
4726                        query: query.to_string(),
4727                        mode,
4728                        statement,
4729                        engine: "runtime-red-schema",
4730                        result: super::red_schema::red_query(
4731                            self,
4732                            &table.table,
4733                            &table,
4734                            &frame as &dyn super::statement_frame::ReadFrame,
4735                        )?,
4736                        affected_rows: 0,
4737                        statement_type: "select",
4738                        bookmark: None,
4739                    });
4740                }
4741
4742                // `<graph>.<output>` analytics virtual view (issue #800).
4743                // Recomputed on demand — intentionally not result-cached, so it
4744                // always reflects the current graph data.
4745                if let Some(view_result) = self.try_resolve_analytics_view(
4746                    &table,
4747                    &frame as &dyn super::statement_frame::ReadFrame,
4748                )? {
4749                    return Ok(RuntimeQueryResult {
4750                        query: query.to_string(),
4751                        mode,
4752                        statement,
4753                        engine: "runtime-graph-analytics-view",
4754                        result: view_result,
4755                        affected_rows: 0,
4756                        statement_type: "select",
4757                        bookmark: None,
4758                    });
4759                }
4760
4761                if let Some(result) = self.execute_probabilistic_select(&table)? {
4762                    return Ok(RuntimeQueryResult {
4763                        query: query.to_string(),
4764                        mode,
4765                        statement,
4766                        engine: "runtime-probabilistic",
4767                        result,
4768                        affected_rows: 0,
4769                        statement_type: "select",
4770                        bookmark: None,
4771                    });
4772                }
4773
4774                // Foreign-table intercept (Phase 3.2.2 PG parity).
4775                //
4776                // When the referenced table matches a `CREATE FOREIGN TABLE`
4777                // registration, short-circuit into the FDW scan. Phase 3.2
4778                // wrappers don't yet support pushdown, so filters/projections
4779                // apply post-scan via `apply_foreign_table_filters` — good
4780                // enough for correctness; perf work lands in 3.2.3.
4781                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4782                    let records = self
4783                        .inner
4784                        .foreign_tables
4785                        .scan(&table.table)
4786                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4787                    let result = apply_foreign_table_filters(records, &table);
4788                    return Ok(RuntimeQueryResult {
4789                        query: query.to_string(),
4790                        mode,
4791                        statement,
4792                        engine: "runtime-fdw",
4793                        result,
4794                        affected_rows: 0,
4795                        statement_type: "select",
4796                        bookmark: None,
4797                    });
4798                }
4799
4800                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4801                //
4802                // When RLS is enabled on this table, fetch every policy
4803                // that applies to the current (role, SELECT) pair and
4804                // fold them into the query's WHERE clause: policies
4805                // OR-combine (any of them admitting the row is enough),
4806                // then AND into the caller's existing filter.
4807                //
4808                // Anonymous callers (no thread-local identity) pass
4809                // `role = None`; policies with a specific `TO role`
4810                // clause skip, but `TO PUBLIC` policies still apply.
4811                //
4812                // When `inject_rls_filters` returns `None` the table has
4813                // RLS enabled but no policy admits the caller's role —
4814                // short-circuit with an empty result set instead of
4815                // synthesising a contradiction filter.
4816                let Some(table_with_rls) = self.authorize_relational_table_select(
4817                    table,
4818                    &frame as &dyn super::statement_frame::ReadFrame,
4819                )?
4820                else {
4821                    let empty = crate::storage::query::unified::UnifiedResult::empty();
4822                    return Ok(RuntimeQueryResult {
4823                        query: query.to_string(),
4824                        mode,
4825                        statement,
4826                        engine: "runtime-table-rls",
4827                        result: empty,
4828                        affected_rows: 0,
4829                        statement_type: "select",
4830                        bookmark: None,
4831                    });
4832                };
4833                Ok(RuntimeQueryResult {
4834                    query: query.to_string(),
4835                    mode,
4836                    statement,
4837                    engine: "runtime-table",
4838                    // #885: lend the frame-owned row-buffer arena to the
4839                    // streaming path so chunk buffers are reused across
4840                    // this statement's chunk-fetches instead of allocated
4841                    // fresh per chunk. This is the table-query dispatch
4842                    // that runs under a `StatementExecutionFrame`; the
4843                    // frameless prepared/subquery paths keep `None`.
4844                    result: execute_runtime_table_query_in(
4845                        &self.inner.db,
4846                        &table_with_rls,
4847                        Some(&self.inner.index_store),
4848                        Some(frame.row_arena()),
4849                    )?,
4850                    affected_rows: 0,
4851                    statement_type: "select",
4852                    bookmark: None,
4853                })
4854            }
4855            QueryExpr::Join(join) => {
4856                // Fold per-table RLS filters into each `QueryExpr::Table`
4857                // leaf of the join tree before executing. Without this
4858                // the join executor scans both tables raw and ignores
4859                // policies — a `WITHIN TENANT 'x'` against a join of
4860                // two tenant-scoped tables would leak cross-tenant rows.
4861                // When any leaf has RLS enabled and zero matching policy,
4862                // short-circuit to an empty join result instead of
4863                // emitting a contradiction filter.
4864                let join_with_rls = match self.authorize_relational_join_select(
4865                    join,
4866                    &frame as &dyn super::statement_frame::ReadFrame,
4867                )? {
4868                    Some(j) => j,
4869                    None => {
4870                        return Ok(RuntimeQueryResult {
4871                            query: query.to_string(),
4872                            mode,
4873                            statement,
4874                            engine: "runtime-join-rls",
4875                            result: crate::storage::query::unified::UnifiedResult::empty(),
4876                            affected_rows: 0,
4877                            statement_type: "select",
4878                            bookmark: None,
4879                        });
4880                    }
4881                };
4882                Ok(RuntimeQueryResult {
4883                    query: query.to_string(),
4884                    mode,
4885                    statement,
4886                    engine: "runtime-join",
4887                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
4888                    affected_rows: 0,
4889                    statement_type: "select",
4890                    bookmark: None,
4891                })
4892            }
4893            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
4894                query: query.to_string(),
4895                mode,
4896                statement,
4897                engine: "runtime-vector",
4898                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
4899                affected_rows: 0,
4900                statement_type: "select",
4901                bookmark: None,
4902            }),
4903            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
4904                query: query.to_string(),
4905                mode,
4906                statement,
4907                engine: "runtime-hybrid",
4908                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
4909                affected_rows: 0,
4910                statement_type: "select",
4911                bookmark: None,
4912            }),
4913            QueryExpr::RankOf(ref rank) => self.execute_rank_of(query, rank),
4914            QueryExpr::ApproxRankOf(ref rank) => self.execute_approx_rank_of(query, rank),
4915            QueryExpr::RankRange(ref range) => self.execute_rank_range(query, range),
4916            // DML execution
4917            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
4918                Err(RedDBError::Query(
4919                    super::red_schema::READ_ONLY_ERROR.to_string(),
4920                ))
4921            }
4922            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
4923                Err(RedDBError::Query(
4924                    super::red_schema::READ_ONLY_ERROR.to_string(),
4925                ))
4926            }
4927            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
4928                Err(RedDBError::Query(
4929                    super::red_schema::READ_ONLY_ERROR.to_string(),
4930                ))
4931            }
4932            QueryExpr::Insert(ref insert) => self
4933                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
4934                    self.execute_insert(query, insert)
4935                }),
4936            QueryExpr::Update(ref update) => self
4937                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
4938                    self.execute_update(query, update)
4939                }),
4940            QueryExpr::Delete(ref delete) => self
4941                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
4942                    self.execute_delete(query, delete)
4943                }),
4944            // DDL execution
4945            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
4946            QueryExpr::CreateCollection(ref create) => {
4947                self.execute_create_collection(query, create)
4948            }
4949            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
4950            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
4951            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
4952            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
4953            QueryExpr::DropDocument(ref drop_document) => {
4954                self.execute_drop_document(query, drop_document)
4955            }
4956            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
4957            QueryExpr::DropCollection(ref drop_collection) => {
4958                self.execute_drop_collection(query, drop_collection)
4959            }
4960            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
4961            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
4962            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
4963            // Graph analytics commands
4964            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
4965            // Search commands
4966            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
4967            // ASK: RAG query with LLM synthesis
4968            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
4969            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
4970            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
4971            QueryExpr::ProbabilisticCommand(ref cmd) => {
4972                self.execute_probabilistic_command(query, cmd)
4973            }
4974            // Time-series DDL
4975            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
4976            QueryExpr::CreateMetric(ref metric) => self.execute_create_metric(query, metric),
4977            QueryExpr::AlterMetric(ref alter) => self.execute_alter_metric(query, alter),
4978            QueryExpr::CreateSlo(ref slo) => self.execute_create_slo(query, slo),
4979            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
4980            // Queue DDL and commands
4981            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
4982            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
4983            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
4984            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
4985            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
4986            QueryExpr::EventsBackfill(ref backfill) => {
4987                self.execute_events_backfill(query, backfill)
4988            }
4989            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
4990                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
4991            ))),
4992            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
4993            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
4994            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
4995            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
4996            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
4997            // SET CONFIG key = value
4998            QueryExpr::SetConfig { ref key, ref value } => {
4999                if key.starts_with("red.secret.") {
5000                    return Err(RedDBError::Query(
5001                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
5002                    ));
5003                }
5004                if key.starts_with("red.secrets.") {
5005                    return Err(RedDBError::Query(
5006                        "red.secrets.* is reserved for vault secrets; use SET SECRET".to_string(),
5007                    ));
5008                }
5009                match self.check_managed_config_write_for_set_config(key) {
5010                    Err(err) => Err(err),
5011                    Ok(()) => {
5012                        let store = self.inner.db.store();
5013                        let json_val = match value {
5014                            Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
5015                            Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
5016                            Value::Float(n) => crate::serde_json::Value::Number(*n),
5017                            Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
5018                            _ => crate::serde_json::Value::String(value.to_string()),
5019                        };
5020                        store.set_config_tree(key, &json_val);
5021                        update_current_config_value(key, value.clone());
5022                        // Config changes can flip runtime behavior mid-session
5023                        // (auto_decrypt, auto_encrypt, etc.) — invalidate the
5024                        // result cache so subsequent reads re-execute against
5025                        // the new config.
5026                        self.invalidate_result_cache();
5027                        Ok(RuntimeQueryResult::ok_message(
5028                            query.to_string(),
5029                            &format!("config set: {key}"),
5030                            "set",
5031                        ))
5032                    }
5033                }
5034            }
5035            // SET SECRET key = value
5036            QueryExpr::SetSecret { ref key, ref value } => {
5037                if key.starts_with("red.config.") {
5038                    return Err(RedDBError::Query(
5039                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
5040                    ));
5041                }
5042                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5043                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
5044                })?;
5045                if matches!(value, Value::Null) {
5046                    auth_store
5047                        .vault_kv_try_delete(key)
5048                        .map_err(|err| RedDBError::Query(err.to_string()))?;
5049                    update_current_secret_value(key, None);
5050                    self.invalidate_result_cache();
5051                    return Ok(RuntimeQueryResult::ok_message(
5052                        query.to_string(),
5053                        &format!("secret deleted: {key}"),
5054                        "delete_secret",
5055                    ));
5056                }
5057                let value = secret_sql_value_to_string(value)?;
5058                auth_store
5059                    .vault_kv_try_set(key.clone(), value.clone())
5060                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5061                update_current_secret_value(key, Some(value));
5062                self.invalidate_result_cache();
5063                Ok(RuntimeQueryResult::ok_message(
5064                    query.to_string(),
5065                    &format!("secret set: {key}"),
5066                    "set_secret",
5067                ))
5068            }
5069            // DELETE SECRET key
5070            QueryExpr::DeleteSecret { ref key } => {
5071                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5072                    RedDBError::Query(
5073                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
5074                    )
5075                })?;
5076                let deleted = auth_store
5077                    .vault_kv_try_delete(key)
5078                    .map_err(|err| RedDBError::Query(err.to_string()))?;
5079                if deleted {
5080                    update_current_secret_value(key, None);
5081                }
5082                self.invalidate_result_cache();
5083                Ok(RuntimeQueryResult::ok_message(
5084                    query.to_string(),
5085                    &format!("secret deleted: {key}"),
5086                    if deleted {
5087                        "delete_secret"
5088                    } else {
5089                        "delete_secret_not_found"
5090                    },
5091                ))
5092            }
5093            // SHOW SECRET[S] [prefix]
5094            QueryExpr::ShowSecrets { ref prefix } => {
5095                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
5096                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
5097                })?;
5098                if !auth_store.is_vault_backed() {
5099                    return Err(RedDBError::Query(
5100                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
5101                    ));
5102                }
5103                let mut keys = auth_store.vault_kv_keys();
5104                keys.sort();
5105                let mut result = UnifiedResult::with_columns(vec![
5106                    "key".into(),
5107                    "value".into(),
5108                    "status".into(),
5109                ]);
5110                for key in keys {
5111                    if let Some(ref pfx) = prefix {
5112                        if !key.starts_with(pfx) {
5113                            continue;
5114                        }
5115                    }
5116                    let mut record = UnifiedRecord::new();
5117                    record.set("key", Value::text(key));
5118                    record.set("value", Value::text("***"));
5119                    record.set("status", Value::text("active"));
5120                    result.push(record);
5121                }
5122                Ok(RuntimeQueryResult {
5123                    query: query.to_string(),
5124                    mode,
5125                    statement: "show_secrets",
5126                    engine: "runtime-secret",
5127                    result,
5128                    affected_rows: 0,
5129                    statement_type: "select",
5130                    bookmark: None,
5131                })
5132            }
5133            // SHOW CONFIG [prefix] [AS JSON|FORMAT JSON]
5134            QueryExpr::ShowConfig {
5135                ref prefix,
5136                as_json,
5137            } => {
5138                let store = self.inner.db.store();
5139                let all_collections = store.list_collections();
5140                if !all_collections.contains(&"red_config".to_string()) {
5141                    if as_json {
5142                        return Ok(show_config_json_result(
5143                            query,
5144                            mode,
5145                            prefix,
5146                            crate::serde_json::Value::Object(crate::serde_json::Map::new()),
5147                        ));
5148                    }
5149                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5150                    return Ok(RuntimeQueryResult {
5151                        query: query.to_string(),
5152                        mode,
5153                        statement: "show_config",
5154                        engine: "runtime-config",
5155                        result,
5156                        affected_rows: 0,
5157                        statement_type: "select",
5158                        bookmark: None,
5159                    });
5160                }
5161                let manager = store
5162                    .get_collection("red_config")
5163                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
5164                let entities = manager.query_all(|_| true);
5165                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
5166                for entity in entities {
5167                    if let EntityData::Row(ref row) = entity.data {
5168                        if let Some(ref named) = row.named {
5169                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
5170                            let val = named.get("value").cloned().unwrap_or(Value::Null);
5171                            let key_str = match &key_val {
5172                                Value::Text(s) => s.as_ref(),
5173                                _ => continue,
5174                            };
5175                            if let Some(ref pfx) = prefix {
5176                                if !key_str.starts_with(pfx.as_str()) {
5177                                    continue;
5178                                }
5179                            }
5180                            let entity_id = entity.id.raw();
5181                            match latest.get(key_str) {
5182                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
5183                                _ => {
5184                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
5185                                }
5186                            }
5187                        }
5188                    }
5189                }
5190                if as_json {
5191                    let mut tree = crate::serde_json::Value::Object(crate::serde_json::Map::new());
5192                    for (key, (_, _, val)) in latest {
5193                        let relative = match prefix {
5194                            Some(pfx) if key == *pfx => "",
5195                            Some(pfx) => key
5196                                .strip_prefix(pfx.as_str())
5197                                .and_then(|tail| tail.strip_prefix('.'))
5198                                .unwrap_or(key.as_str()),
5199                            None => key.as_str(),
5200                        };
5201                        insert_config_json_path(
5202                            &mut tree,
5203                            relative,
5204                            crate::presentation::entity_json::storage_value_to_json(&val),
5205                        );
5206                    }
5207                    return Ok(show_config_json_result(query, mode, prefix, tree));
5208                }
5209                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5210                for (_, key_val, val) in latest.into_values() {
5211                    let mut record = UnifiedRecord::new();
5212                    record.set("key", key_val);
5213                    record.set("value", val);
5214                    result.push(record);
5215                }
5216                Ok(RuntimeQueryResult {
5217                    query: query.to_string(),
5218                    mode,
5219                    statement: "show_config",
5220                    engine: "runtime-config",
5221                    result,
5222                    affected_rows: 0,
5223                    statement_type: "select",
5224                    bookmark: None,
5225                })
5226            }
5227            // Session-local multi-tenancy handle (Phase 2.5.3).
5228            //
5229            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
5230            // the thread-local; SHOW TENANT returns it. Paired with the
5231            // CURRENT_TENANT() scalar for use in RLS policies.
5232            QueryExpr::SetTenant(ref value) => {
5233                match value {
5234                    Some(id) => set_current_tenant(id.clone()),
5235                    None => clear_current_tenant(),
5236                }
5237                Ok(RuntimeQueryResult::ok_message(
5238                    query.to_string(),
5239                    &match value {
5240                        Some(id) => format!("tenant set: {id}"),
5241                        None => "tenant cleared".to_string(),
5242                    },
5243                    "set_tenant",
5244                ))
5245            }
5246            QueryExpr::ShowTenant => {
5247                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
5248                let mut record = UnifiedRecord::new();
5249                record.set(
5250                    "tenant",
5251                    current_tenant().map(Value::text).unwrap_or(Value::Null),
5252                );
5253                result.push(record);
5254                Ok(RuntimeQueryResult {
5255                    query: query.to_string(),
5256                    mode,
5257                    statement: "show_tenant",
5258                    engine: "runtime-tenant",
5259                    result,
5260                    affected_rows: 0,
5261                    statement_type: "select",
5262                    bookmark: None,
5263                })
5264            }
5265            // Transaction control (Phase 2.3 PG parity).
5266            //
5267            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
5268            // the current connection's id. COMMIT/ROLLBACK release it through
5269            // the `SnapshotManager` so future snapshots see the correct set of
5270            // active/aborted transactions.
5271            //
5272            // Tuple stamping (xmin/xmax) and read-path visibility filtering
5273            // land in Phase 2.3.2 — this dispatch only manages the snapshot
5274            // registry. Statements running outside a TxnContext still behave
5275            // as autocommit (xid=0 → visible to every snapshot).
5276            QueryExpr::TransactionControl(ref ctl) => {
5277                use crate::storage::query::ast::TxnControl;
5278                use crate::storage::transaction::snapshot::{TxnContext, Xid};
5279                use crate::storage::transaction::IsolationLevel;
5280
5281                // Phase 2.3 keys transactions by a thread-local connection id.
5282                // The stdio/gRPC paths wire a real per-connection id later;
5283                // for embedded use (one RedDBRuntime per process-ish caller)
5284                // we fall back to a deterministic placeholder.
5285                let conn_id = current_connection_id();
5286
5287                let (kind, msg) = match ctl {
5288                    TxnControl::Begin => {
5289                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5290                        let xid = mgr.begin();
5291                        let snapshot = mgr.snapshot(xid);
5292                        let ctx = TxnContext {
5293                            xid,
5294                            isolation: IsolationLevel::SnapshotIsolation,
5295                            snapshot,
5296                            savepoints: Vec::new(),
5297                            released_sub_xids: Vec::new(),
5298                        };
5299                        self.inner.tx_contexts.write().insert(conn_id, ctx);
5300                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
5301                    }
5302                    TxnControl::Commit => {
5303                        // SET LOCAL TENANT ends with the transaction.
5304                        self.inner.tx_local_tenants.write().remove(&conn_id);
5305                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5306                        match ctx {
5307                            Some(ctx) => {
5308                                let mut own_xids = std::collections::HashSet::new();
5309                                own_xids.insert(ctx.xid);
5310                                for (_, sub) in &ctx.savepoints {
5311                                    own_xids.insert(*sub);
5312                                }
5313                                for sub in &ctx.released_sub_xids {
5314                                    own_xids.insert(*sub);
5315                                }
5316                                if let Err(err) = self.check_table_row_write_conflicts(
5317                                    conn_id,
5318                                    &ctx.snapshot,
5319                                    &own_xids,
5320                                ) {
5321                                    for (_, sub) in &ctx.savepoints {
5322                                        self.inner.snapshot_manager.rollback(*sub);
5323                                    }
5324                                    for sub in &ctx.released_sub_xids {
5325                                        self.inner.snapshot_manager.rollback(*sub);
5326                                    }
5327                                    self.inner.snapshot_manager.rollback(ctx.xid);
5328                                    self.revive_pending_versioned_updates(conn_id);
5329                                    self.revive_pending_tombstones(conn_id);
5330                                    self.discard_pending_kv_watch_events(conn_id);
5331                                    self.discard_pending_queue_wakes(conn_id);
5332                                    self.discard_pending_store_wal_actions(conn_id);
5333                                    return Err(err);
5334                                }
5335                                self.restore_pending_write_stamps(conn_id);
5336                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
5337                                    for (_, sub) in &ctx.savepoints {
5338                                        self.inner.snapshot_manager.rollback(*sub);
5339                                    }
5340                                    for sub in &ctx.released_sub_xids {
5341                                        self.inner.snapshot_manager.rollback(*sub);
5342                                    }
5343                                    self.inner.snapshot_manager.rollback(ctx.xid);
5344                                    self.revive_pending_versioned_updates(conn_id);
5345                                    self.revive_pending_tombstones(conn_id);
5346                                    self.discard_pending_kv_watch_events(conn_id);
5347                                    return Err(err);
5348                                }
5349                                // Phase 2.3.2e: commit every open sub-xid
5350                                // so they also become visible. Their
5351                                // work is promoted to the parent txn's
5352                                // result exactly like a RELEASE would
5353                                // have done.
5354                                for (_, sub) in &ctx.savepoints {
5355                                    self.inner.snapshot_manager.commit(*sub);
5356                                }
5357                                for sub in &ctx.released_sub_xids {
5358                                    self.inner.snapshot_manager.commit(*sub);
5359                                }
5360                                self.inner.snapshot_manager.commit(ctx.xid);
5361                                self.finalize_pending_versioned_updates(conn_id);
5362                                self.finalize_pending_tombstones(conn_id);
5363                                self.finalize_pending_kv_watch_events(conn_id);
5364                                self.finalize_pending_queue_wakes(conn_id);
5365                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
5366                            }
5367                            None => (
5368                                "commit",
5369                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
5370                            ),
5371                        }
5372                    }
5373                    TxnControl::Rollback => {
5374                        self.inner.tx_local_tenants.write().remove(&conn_id);
5375                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5376                        match ctx {
5377                            Some(ctx) => {
5378                                // Phase 2.3.2e: abort every open sub-xid
5379                                // too so their writes stay hidden.
5380                                for (_, sub) in &ctx.savepoints {
5381                                    self.inner.snapshot_manager.rollback(*sub);
5382                                }
5383                                for sub in &ctx.released_sub_xids {
5384                                    self.inner.snapshot_manager.rollback(*sub);
5385                                }
5386                                self.inner.snapshot_manager.rollback(ctx.xid);
5387                                // Phase 2.3.2b: tuples that the txn had
5388                                // xmax-stamped become live again — wipe xmax
5389                                // back to 0 so later snapshots see them.
5390                                self.revive_pending_versioned_updates(conn_id);
5391                                self.revive_pending_tombstones(conn_id);
5392                                self.discard_pending_kv_watch_events(conn_id);
5393                                self.discard_pending_queue_wakes(conn_id);
5394                                self.discard_pending_store_wal_actions(conn_id);
5395                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
5396                            }
5397                            None => (
5398                                "rollback",
5399                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
5400                            ),
5401                        }
5402                    }
5403                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
5404                    // SAVEPOINT allocates a fresh xid and pushes it
5405                    // onto the per-txn stack so subsequent writes can
5406                    // be selectively rolled back. RELEASE pops without
5407                    // aborting; ROLLBACK TO aborts the sub-xid (and
5408                    // any nested ones) + revives their tombstones.
5409                    TxnControl::Savepoint(name) => {
5410                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5411                        let mut guard = self.inner.tx_contexts.write();
5412                        match guard.get_mut(&conn_id) {
5413                            Some(ctx) => {
5414                                let sub = mgr.begin();
5415                                ctx.savepoints.push((name.clone(), sub));
5416                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
5417                            }
5418                            None => (
5419                                "savepoint",
5420                                "SAVEPOINT outside transaction — no-op".to_string(),
5421                            ),
5422                        }
5423                    }
5424                    TxnControl::ReleaseSavepoint(name) => {
5425                        let mut guard = self.inner.tx_contexts.write();
5426                        match guard.get_mut(&conn_id) {
5427                            Some(ctx) => {
5428                                let pos = ctx
5429                                    .savepoints
5430                                    .iter()
5431                                    .position(|(n, _)| n == name)
5432                                    .ok_or_else(|| {
5433                                        RedDBError::Internal(format!(
5434                                            "savepoint {name} does not exist"
5435                                        ))
5436                                    })?;
5437                                // RELEASE pops the named savepoint and
5438                                // any nested ones. Their sub-xids move
5439                                // to `released_sub_xids` so they commit
5440                                // (or roll back) alongside the parent
5441                                // xid — PG semantics: released
5442                                // savepoints still contribute their
5443                                // work, but their names are gone.
5444                                let released = ctx.savepoints.len() - pos;
5445                                let popped: Vec<Xid> = ctx
5446                                    .savepoints
5447                                    .split_off(pos)
5448                                    .into_iter()
5449                                    .map(|(_, x)| x)
5450                                    .collect();
5451                                ctx.released_sub_xids.extend(popped);
5452                                (
5453                                    "release_savepoint",
5454                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
5455                                )
5456                            }
5457                            None => (
5458                                "release_savepoint",
5459                                "RELEASE outside transaction — no-op".to_string(),
5460                            ),
5461                        }
5462                    }
5463                    TxnControl::RollbackToSavepoint(name) => {
5464                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5465                        // Splice out the savepoint + nested ones under
5466                        // a narrow lock, then run the snapshot-manager
5467                        // + tombstone side-effects without the tx map
5468                        // held so nothing re-enters.
5469                        let drop_result: Option<(Xid, Vec<Xid>)> = {
5470                            let mut guard = self.inner.tx_contexts.write();
5471                            if let Some(ctx) = guard.get_mut(&conn_id) {
5472                                let pos = ctx
5473                                    .savepoints
5474                                    .iter()
5475                                    .position(|(n, _)| n == name)
5476                                    .ok_or_else(|| {
5477                                        RedDBError::Internal(format!(
5478                                            "savepoint {name} does not exist"
5479                                        ))
5480                                    })?;
5481                                let savepoint_xid = ctx.savepoints[pos].1;
5482                                let aborted: Vec<Xid> = ctx
5483                                    .savepoints
5484                                    .split_off(pos)
5485                                    .into_iter()
5486                                    .map(|(_, x)| x)
5487                                    .collect();
5488                                Some((savepoint_xid, aborted))
5489                            } else {
5490                                None
5491                            }
5492                        };
5493
5494                        match drop_result {
5495                            Some((savepoint_xid, aborted)) => {
5496                                for x in &aborted {
5497                                    mgr.rollback(*x);
5498                                }
5499                                let reverted_updates =
5500                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
5501                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
5502                                (
5503                                    "rollback_to_savepoint",
5504                                    format!(
5505                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
5506                                        aborted.len(),
5507                                    ),
5508                                )
5509                            }
5510                            None => (
5511                                "rollback_to_savepoint",
5512                                "ROLLBACK TO outside transaction — no-op".to_string(),
5513                            ),
5514                        }
5515                    }
5516                };
5517                Ok(RuntimeQueryResult::ok_message(
5518                    query.to_string(),
5519                    &msg,
5520                    kind,
5521                ))
5522            }
5523            // Schema + Sequence DDL (Phase 1.3 PG parity).
5524            //
5525            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5526            // just registers the name in `red_config` under `schema.{name}`.
5527            // Table lookups still happen by collection name; clients using
5528            // `schema.table` qualified names collapse to collection `schema.table`.
5529            //
5530            // Sequences persist a 64-bit counter + metadata (start, increment)
5531            // in `red_config` under `sequence.{name}.*`. Scalar callers
5532            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
5533            // once we have a proper mutating-function dispatch path; for now the
5534            // DDL just establishes the catalog entry so clients don't error.
5535            QueryExpr::CreateSchema(ref q) => {
5536                let store = self.inner.db.store();
5537                let key = format!("schema.{}", q.name);
5538                if store.get_config(&key).is_some() {
5539                    if q.if_not_exists {
5540                        return Ok(RuntimeQueryResult::ok_message(
5541                            query.to_string(),
5542                            &format!("schema {} already exists — skipped", q.name),
5543                            "create_schema",
5544                        ));
5545                    }
5546                    return Err(RedDBError::Internal(format!(
5547                        "schema {} already exists",
5548                        q.name
5549                    )));
5550                }
5551                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5552                Ok(RuntimeQueryResult::ok_message(
5553                    query.to_string(),
5554                    &format!("schema {} created", q.name),
5555                    "create_schema",
5556                ))
5557            }
5558            QueryExpr::DropSchema(ref q) => {
5559                let store = self.inner.db.store();
5560                let key = format!("schema.{}", q.name);
5561                let existed = store.get_config(&key).is_some();
5562                if !existed && !q.if_exists {
5563                    return Err(RedDBError::Internal(format!(
5564                        "schema {} does not exist",
5565                        q.name
5566                    )));
5567                }
5568                // Remove marker from red_config via set to null.
5569                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5570                let suffix = if q.cascade {
5571                    " (CASCADE accepted — tables untouched)"
5572                } else {
5573                    ""
5574                };
5575                Ok(RuntimeQueryResult::ok_message(
5576                    query.to_string(),
5577                    &format!("schema {} dropped{}", q.name, suffix),
5578                    "drop_schema",
5579                ))
5580            }
5581            QueryExpr::CreateSequence(ref q) => {
5582                let store = self.inner.db.store();
5583                let base = format!("sequence.{}", q.name);
5584                let start_key = format!("{base}.start");
5585                let incr_key = format!("{base}.increment");
5586                let curr_key = format!("{base}.current");
5587                if store.get_config(&start_key).is_some() {
5588                    if q.if_not_exists {
5589                        return Ok(RuntimeQueryResult::ok_message(
5590                            query.to_string(),
5591                            &format!("sequence {} already exists — skipped", q.name),
5592                            "create_sequence",
5593                        ));
5594                    }
5595                    return Err(RedDBError::Internal(format!(
5596                        "sequence {} already exists",
5597                        q.name
5598                    )));
5599                }
5600                // Persist start + increment, and set current so the first
5601                // nextval returns `start`.
5602                let initial_current = q.start - q.increment;
5603                store.set_config_tree(
5604                    &start_key,
5605                    &crate::serde_json::Value::Number(q.start as f64),
5606                );
5607                store.set_config_tree(
5608                    &incr_key,
5609                    &crate::serde_json::Value::Number(q.increment as f64),
5610                );
5611                store.set_config_tree(
5612                    &curr_key,
5613                    &crate::serde_json::Value::Number(initial_current as f64),
5614                );
5615                Ok(RuntimeQueryResult::ok_message(
5616                    query.to_string(),
5617                    &format!(
5618                        "sequence {} created (start={}, increment={})",
5619                        q.name, q.start, q.increment
5620                    ),
5621                    "create_sequence",
5622                ))
5623            }
5624            QueryExpr::DropSequence(ref q) => {
5625                let store = self.inner.db.store();
5626                let base = format!("sequence.{}", q.name);
5627                let existed = store.get_config(&format!("{base}.start")).is_some();
5628                if !existed && !q.if_exists {
5629                    return Err(RedDBError::Internal(format!(
5630                        "sequence {} does not exist",
5631                        q.name
5632                    )));
5633                }
5634                for k in ["start", "increment", "current"] {
5635                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5636                }
5637                Ok(RuntimeQueryResult::ok_message(
5638                    query.to_string(),
5639                    &format!("sequence {} dropped", q.name),
5640                    "drop_sequence",
5641                ))
5642            }
5643            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5644            //
5645            // The view definition is stored in-memory on RuntimeInner (not
5646            // persisted). SELECTs that reference the view name will substitute
5647            // the stored `QueryExpr` via `resolve_view_reference` during
5648            // planning (same entry point used by table-name resolution).
5649            //
5650            // Materialized views additionally allocate a slot in
5651            // `MaterializedViewCache`; a REFRESH repopulates that slot.
5652            QueryExpr::CreateView(ref q) => {
5653                let mut views = self.inner.views.write();
5654                if views.contains_key(&q.name) && !q.or_replace {
5655                    if q.if_not_exists {
5656                        return Ok(RuntimeQueryResult::ok_message(
5657                            query.to_string(),
5658                            &format!("view {} already exists — skipped", q.name),
5659                            "create_view",
5660                        ));
5661                    }
5662                    return Err(RedDBError::Internal(format!(
5663                        "view {} already exists",
5664                        q.name
5665                    )));
5666                }
5667                views.insert(q.name.clone(), Arc::new(q.clone()));
5668                drop(views);
5669
5670                // Materialized view: register cache slot (data is empty until REFRESH).
5671                if q.materialized {
5672                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5673                    let refresh = match q.refresh_every_ms {
5674                        Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
5675                        None => RefreshPolicy::Manual,
5676                    };
5677                    let dependencies = collect_table_refs(&q.query);
5678                    let def = MaterializedViewDef {
5679                        name: q.name.clone(),
5680                        query: format!("<parsed view {}>", q.name),
5681                        dependencies: dependencies.clone(),
5682                        refresh,
5683                        retention_duration_ms: q.retention_duration_ms,
5684                    };
5685                    self.inner.materialized_views.write().register(def);
5686
5687                    // Issue #593 slice 9a — persist the descriptor to
5688                    // the system catalog so the definition survives a
5689                    // restart. Upsert semantics (delete-then-insert by
5690                    // name) keep the catalog free of duplicate rows
5691                    // across `CREATE OR REPLACE` churn.
5692                    let descriptor =
5693                        crate::runtime::continuous_materialized_view::MaterializedViewDescriptor {
5694                            name: q.name.clone(),
5695                            source_sql: query.to_string(),
5696                            source_collections: dependencies,
5697                            refresh_every_ms: q.refresh_every_ms,
5698                            retention_duration_ms: q.retention_duration_ms,
5699                        };
5700                    let store = self.inner.db.store();
5701                    crate::runtime::continuous_materialized_view::persist_descriptor(
5702                        store.as_ref(),
5703                        &descriptor,
5704                    )?;
5705
5706                    // Issue #594 slice 9b — provision a Table-shaped
5707                    // backing collection named after the view. The
5708                    // rewriter skips materialized views (see
5709                    // `rewrite_view_refs_inner`) so `SELECT FROM v`
5710                    // resolves to this collection directly. Empty
5711                    // until REFRESH wires through it in 9c.
5712                    self.ensure_materialized_view_backing(&q.name)?;
5713                }
5714                // Plan cache may have cached a plan that didn't know about this
5715                // view — invalidate so future references pick up the new binding.
5716                // Result cache gets flushed too: OR REPLACE must not serve a
5717                // prior execution of the obsolete body.
5718                self.invalidate_plan_cache();
5719                self.invalidate_result_cache();
5720
5721                Ok(RuntimeQueryResult::ok_message(
5722                    query.to_string(),
5723                    &format!(
5724                        "{}view {} created",
5725                        if q.materialized { "materialized " } else { "" },
5726                        q.name
5727                    ),
5728                    "create_view",
5729                ))
5730            }
5731            QueryExpr::DropView(ref q) => {
5732                let mut views = self.inner.views.write();
5733                let removed = views.remove(&q.name);
5734                let existed = removed.is_some();
5735                let removed_materialized =
5736                    removed.as_ref().map(|v| v.materialized).unwrap_or(false);
5737                drop(views);
5738                if q.materialized || existed {
5739                    // Try the materialised cache too — silent if absent.
5740                    self.inner.materialized_views.write().remove(&q.name);
5741                    // Issue #593 slice 9a — remove any persisted
5742                    // catalog row. Idempotent: a no-op when the view
5743                    // was never materialized (no row was ever written).
5744                    let store = self.inner.db.store();
5745                    crate::runtime::continuous_materialized_view::remove_by_name(
5746                        store.as_ref(),
5747                        &q.name,
5748                    )?;
5749                }
5750                // Issue #594 slice 9b — drop the backing collection
5751                // that was provisioned at CREATE time. Only mat views
5752                // ever had one; regular views never did.
5753                if removed_materialized || q.materialized {
5754                    self.drop_materialized_view_backing(&q.name)?;
5755                }
5756                // Drop any plan / result cache entries that baked the
5757                // view body into their QueryExpr.
5758                self.invalidate_plan_cache();
5759                self.invalidate_result_cache();
5760                if !existed && !q.if_exists {
5761                    return Err(RedDBError::Internal(format!(
5762                        "view {} does not exist",
5763                        q.name
5764                    )));
5765                }
5766                self.invalidate_plan_cache();
5767                Ok(RuntimeQueryResult::ok_message(
5768                    query.to_string(),
5769                    &format!("view {} dropped", q.name),
5770                    "drop_view",
5771                ))
5772            }
5773            QueryExpr::RefreshMaterializedView(ref q) => {
5774                // Look up the view definition, execute its underlying query,
5775                // and stash the serialized result in the materialised cache.
5776                let view = {
5777                    let views = self.inner.views.read();
5778                    views.get(&q.name).cloned()
5779                };
5780                let view = match view {
5781                    Some(v) => v,
5782                    None => {
5783                        return Err(RedDBError::Internal(format!(
5784                            "view {} does not exist",
5785                            q.name
5786                        )))
5787                    }
5788                };
5789                if !view.materialized {
5790                    return Err(RedDBError::Internal(format!(
5791                        "view {} is not materialized — REFRESH requires \
5792                         CREATE MATERIALIZED VIEW",
5793                        q.name
5794                    )));
5795                }
5796                // Execute the underlying query fresh.
5797                let started = std::time::Instant::now();
5798                let now_ms = std::time::SystemTime::now()
5799                    .duration_since(std::time::UNIX_EPOCH)
5800                    .map(|d| d.as_millis() as u64)
5801                    .unwrap_or(0);
5802                match self.execute_query_expr((*view.query).clone()) {
5803                    Ok(inner_result) => {
5804                        // Issue #595 slice 9c — atomically replace the
5805                        // backing collection's contents under a single
5806                        // WAL group. Concurrent SELECT from the view
5807                        // sees either the prior or new contents, never
5808                        // partial. A crash before the WAL commit lands
5809                        // leaves the prior contents intact on recovery.
5810                        let entities =
5811                            view_records_to_entities(&q.name, &inner_result.result.records);
5812                        let row_count = entities.len() as u64;
5813                        let store = self.inner.db.store();
5814                        let serialized_records = match store.refresh_collection(&q.name, entities) {
5815                            Ok(records) => records,
5816                            Err(err) => {
5817                                let duration_ms = started.elapsed().as_millis() as u64;
5818                                let msg = err.to_string();
5819                                self.inner
5820                                    .materialized_views
5821                                    .write()
5822                                    .record_refresh_failure(
5823                                        &q.name,
5824                                        msg.clone(),
5825                                        duration_ms,
5826                                        now_ms,
5827                                    );
5828                                return Err(RedDBError::Internal(format!(
5829                                    "REFRESH MATERIALIZED VIEW {}: {msg}",
5830                                    q.name
5831                                )));
5832                            }
5833                        };
5834
5835                        // Issue #596 slice 9d — emit a Refresh
5836                        // ChangeRecord into the logical-WAL spool so
5837                        // replicas deterministically replay the same
5838                        // backing-collection contents via
5839                        // `LogicalChangeApplier::apply_record`.
5840                        if let Some(ref primary) = self.inner.db.replication {
5841                            let lsn = self.inner.cdc.emit(
5842                                crate::replication::cdc::ChangeOperation::Refresh,
5843                                &q.name,
5844                                0,
5845                                "refresh",
5846                            );
5847                            self.invalidate_result_cache_for_table(&q.name);
5848                            let timestamp = std::time::SystemTime::now()
5849                                .duration_since(std::time::UNIX_EPOCH)
5850                                .unwrap_or_default()
5851                                .as_millis() as u64;
5852                            let record = ChangeRecord::for_refresh(
5853                                lsn,
5854                                timestamp,
5855                                q.name.clone(),
5856                                serialized_records,
5857                            )
5858                            .with_term(self.current_replication_term());
5859                            let encoded = record.encode();
5860                            primary.append_logical_record(record.lsn, encoded);
5861                        }
5862
5863                        let duration_ms = started.elapsed().as_millis() as u64;
5864                        let serialized = format!("{:?}", inner_result.result);
5865                        self.inner
5866                            .materialized_views
5867                            .write()
5868                            .record_refresh_success(
5869                                &q.name,
5870                                serialized.into_bytes(),
5871                                row_count,
5872                                duration_ms,
5873                                now_ms,
5874                            );
5875                        // SELECT FROM v now reads through the rewriter
5876                        // skip into the backing collection — drop the
5877                        // result cache so prior empty-backing reads
5878                        // don't shadow the new contents.
5879                        self.invalidate_result_cache();
5880                        Ok(RuntimeQueryResult::ok_message(
5881                            query.to_string(),
5882                            &format!("materialized view {} refreshed", q.name),
5883                            "refresh_materialized_view",
5884                        ))
5885                    }
5886                    Err(err) => {
5887                        let duration_ms = started.elapsed().as_millis() as u64;
5888                        let msg = err.to_string();
5889                        self.inner
5890                            .materialized_views
5891                            .write()
5892                            .record_refresh_failure(&q.name, msg.clone(), duration_ms, now_ms);
5893                        Err(err)
5894                    }
5895                }
5896            }
5897            // Row Level Security (Phase 2.5 PG parity).
5898            //
5899            // Policies live in an in-memory registry keyed by (table, name).
5900            // Enforcement (AND-ing the policy's USING clause into every
5901            // query's WHERE for the table) arrives in Phase 2.5.2 via the
5902            // filter compiler; this dispatch only manages the catalog.
5903            QueryExpr::CreatePolicy(ref q) => {
5904                let key = (q.table.clone(), q.name.clone());
5905                self.inner
5906                    .rls_policies
5907                    .write()
5908                    .insert(key, Arc::new(q.clone()));
5909                self.invalidate_plan_cache();
5910                // Issue #120 — surface policy names in the
5911                // schema-vocabulary so AskPipeline (#121) can resolve
5912                // a policy reference back to its table.
5913                self.schema_vocabulary_apply(
5914                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
5915                        collection: q.table.clone(),
5916                        policy: q.name.clone(),
5917                    },
5918                );
5919                Ok(RuntimeQueryResult::ok_message(
5920                    query.to_string(),
5921                    &format!("policy {} on {} created", q.name, q.table),
5922                    "create_policy",
5923                ))
5924            }
5925            QueryExpr::DropPolicy(ref q) => {
5926                let removed = self
5927                    .inner
5928                    .rls_policies
5929                    .write()
5930                    .remove(&(q.table.clone(), q.name.clone()))
5931                    .is_some();
5932                if !removed && !q.if_exists {
5933                    return Err(RedDBError::Internal(format!(
5934                        "policy {} on {} does not exist",
5935                        q.name, q.table
5936                    )));
5937                }
5938                self.invalidate_plan_cache();
5939                // Issue #120 — keep the schema-vocabulary policy
5940                // entry in sync.
5941                self.schema_vocabulary_apply(
5942                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
5943                        collection: q.table.clone(),
5944                        policy: q.name.clone(),
5945                    },
5946                );
5947                Ok(RuntimeQueryResult::ok_message(
5948                    query.to_string(),
5949                    &format!("policy {} on {} dropped", q.name, q.table),
5950                    "drop_policy",
5951                ))
5952            }
5953            // Foreign Data Wrappers (Phase 3.2 PG parity).
5954            //
5955            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
5956            // `ForeignTableRegistry`. The read path consults that registry
5957            // before dispatching a SELECT — when the table name matches a
5958            // registered foreign table, we forward the scan to the wrapper
5959            // and skip the normal collection lookup.
5960            //
5961            // Phase 3.2 is in-memory only; persistence across restarts is a
5962            // 3.2.2 follow-up that mirrors the view registry pattern.
5963            QueryExpr::CreateServer(ref q) => {
5964                use crate::storage::fdw::FdwOptions;
5965                let registry = Arc::clone(&self.inner.foreign_tables);
5966                if registry.server(&q.name).is_some() {
5967                    if q.if_not_exists {
5968                        return Ok(RuntimeQueryResult::ok_message(
5969                            query.to_string(),
5970                            &format!("server {} already exists — skipped", q.name),
5971                            "create_server",
5972                        ));
5973                    }
5974                    return Err(RedDBError::Internal(format!(
5975                        "server {} already exists",
5976                        q.name
5977                    )));
5978                }
5979                let mut opts = FdwOptions::new();
5980                for (k, v) in &q.options {
5981                    opts.values.insert(k.clone(), v.clone());
5982                }
5983                registry
5984                    .create_server(&q.name, &q.wrapper, opts)
5985                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5986                Ok(RuntimeQueryResult::ok_message(
5987                    query.to_string(),
5988                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
5989                    "create_server",
5990                ))
5991            }
5992            QueryExpr::DropServer(ref q) => {
5993                let existed = self.inner.foreign_tables.drop_server(&q.name);
5994                if !existed && !q.if_exists {
5995                    return Err(RedDBError::Internal(format!(
5996                        "server {} does not exist",
5997                        q.name
5998                    )));
5999                }
6000                Ok(RuntimeQueryResult::ok_message(
6001                    query.to_string(),
6002                    &format!(
6003                        "server {} dropped{}",
6004                        q.name,
6005                        if q.cascade { " (cascade)" } else { "" }
6006                    ),
6007                    "drop_server",
6008                ))
6009            }
6010            QueryExpr::CreateForeignTable(ref q) => {
6011                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
6012                let registry = Arc::clone(&self.inner.foreign_tables);
6013                if registry.foreign_table(&q.name).is_some() {
6014                    if q.if_not_exists {
6015                        return Ok(RuntimeQueryResult::ok_message(
6016                            query.to_string(),
6017                            &format!("foreign table {} already exists — skipped", q.name),
6018                            "create_foreign_table",
6019                        ));
6020                    }
6021                    return Err(RedDBError::Internal(format!(
6022                        "foreign table {} already exists",
6023                        q.name
6024                    )));
6025                }
6026                let mut opts = FdwOptions::new();
6027                for (k, v) in &q.options {
6028                    opts.values.insert(k.clone(), v.clone());
6029                }
6030                let columns: Vec<ForeignColumn> = q
6031                    .columns
6032                    .iter()
6033                    .map(|c| ForeignColumn {
6034                        name: c.name.clone(),
6035                        data_type: c.data_type.clone(),
6036                        not_null: c.not_null,
6037                    })
6038                    .collect();
6039                registry
6040                    .create_foreign_table(ForeignTable {
6041                        name: q.name.clone(),
6042                        server_name: q.server.clone(),
6043                        columns,
6044                        options: opts,
6045                    })
6046                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
6047                self.invalidate_plan_cache();
6048                Ok(RuntimeQueryResult::ok_message(
6049                    query.to_string(),
6050                    &format!("foreign table {} created (server {})", q.name, q.server),
6051                    "create_foreign_table",
6052                ))
6053            }
6054            QueryExpr::DropForeignTable(ref q) => {
6055                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
6056                if !existed && !q.if_exists {
6057                    return Err(RedDBError::Internal(format!(
6058                        "foreign table {} does not exist",
6059                        q.name
6060                    )));
6061                }
6062                self.invalidate_plan_cache();
6063                Ok(RuntimeQueryResult::ok_message(
6064                    query.to_string(),
6065                    &format!("foreign table {} dropped", q.name),
6066                    "drop_foreign_table",
6067                ))
6068            }
6069            // COPY table FROM 'path' (Phase 1.5 PG parity).
6070            //
6071            // Stream CSV rows through the shared `CsvImporter`. The collection
6072            // is auto-created on first insert (via `insert_auto`-style path);
6073            // VACUUM/ANALYZE afterwards is up to the caller.
6074            QueryExpr::CopyFrom(ref q) => {
6075                use crate::storage::import::{CsvConfig, CsvImporter};
6076                let store = self.inner.db.store();
6077                let cfg = CsvConfig {
6078                    collection: q.table.clone(),
6079                    has_header: q.has_header,
6080                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
6081                    ..CsvConfig::default()
6082                };
6083                let importer = CsvImporter::new(cfg);
6084                let stats = importer
6085                    .import_file(&q.path, store.as_ref())
6086                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
6087                // Tables are written → invalidate cached plans / result cache.
6088                self.note_table_write(&q.table);
6089                Ok(RuntimeQueryResult::ok_message(
6090                    query.to_string(),
6091                    &format!(
6092                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
6093                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
6094                    ),
6095                    "copy_from",
6096                ))
6097            }
6098            // Maintenance commands (Phase 1.2 PG parity).
6099            //
6100            // - VACUUM [FULL] [table]: refreshes planner stats for the target
6101            //   collection(s) and — when FULL — triggers a full pager persist
6102            //   (flushes dirty pages + fsync). Also invalidates the result cache
6103            //   so subsequent reads re-execute against the freshly compacted
6104            //   storage. RedDB's segment/btree GC runs continuously via the
6105            //   background lifecycle; explicit space reclamation for sealed
6106            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
6107            // - ANALYZE [table]: reruns `analyze_collection` +
6108            //   `persist_table_stats` via `refresh_table_planner_stats` so the
6109            //   planner has fresh histograms, distinct estimates, null counts.
6110            //
6111            // Both commands accept an optional target; omitting the target
6112            // iterates every collection in the store.
6113            QueryExpr::MaintenanceCommand(ref cmd) => {
6114                use crate::storage::query::ast::MaintenanceCommand as Mc;
6115                let store = self.inner.db.store();
6116                let (kind, msg) = match cmd {
6117                    Mc::Analyze { target } => {
6118                        let targets: Vec<String> = match target {
6119                            Some(t) => vec![t.clone()],
6120                            None => store.list_collections(),
6121                        };
6122                        for t in &targets {
6123                            self.refresh_table_planner_stats(t);
6124                        }
6125                        (
6126                            "analyze",
6127                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
6128                        )
6129                    }
6130                    Mc::Vacuum { target, full } => {
6131                        let targets: Vec<String> = match target {
6132                            Some(t) => vec![t.clone()],
6133                            None => store.list_collections(),
6134                        };
6135                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
6136                        let mut vacuum_stats =
6137                            crate::storage::unified::store::MvccVacuumStats::default();
6138                        for t in &targets {
6139                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
6140                                RedDBError::Internal(format!(
6141                                    "VACUUM MVCC history failed for {t}: {e}"
6142                                ))
6143                            })?;
6144                            if stats.reclaimed_versions > 0 {
6145                                self.rebuild_runtime_indexes_for_table(t)?;
6146                            }
6147                            vacuum_stats.add(&stats);
6148                        }
6149                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
6150                        // Stats refresh covers every target (same as ANALYZE).
6151                        for t in &targets {
6152                            self.refresh_table_planner_stats(t);
6153                        }
6154                        // FULL forces a pager persist (dirty-page flush + fsync).
6155                        // Regular VACUUM relies on the background writer / segment
6156                        // lifecycle so the command is non-blocking.
6157                        let persisted = if *full {
6158                            match store.persist() {
6159                                Ok(()) => true,
6160                                Err(e) => {
6161                                    return Err(RedDBError::Internal(format!(
6162                                        "VACUUM FULL persist failed: {e:?}"
6163                                    )));
6164                                }
6165                            }
6166                        } else {
6167                            false
6168                        };
6169                        // Result cache depended on pre-vacuum state.
6170                        self.invalidate_result_cache();
6171                        (
6172                            "vacuum",
6173                            format!(
6174                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
6175                                if *full { " FULL" } else { "" },
6176                                targets.len(),
6177                                vacuum_stats.scanned_versions,
6178                                vacuum_stats.retained_versions,
6179                                vacuum_stats.reclaimed_versions,
6180                                vacuum_stats.retained_history_versions,
6181                                vacuum_stats.reclaimed_history_versions,
6182                                vacuum_stats.retained_tombstones,
6183                                vacuum_stats.reclaimed_tombstones,
6184                                if persisted {
6185                                    " (pages flushed to disk)"
6186                                } else {
6187                                    ""
6188                                }
6189                            ),
6190                        )
6191                    }
6192                };
6193                Ok(RuntimeQueryResult::ok_message(
6194                    query.to_string(),
6195                    &msg,
6196                    kind,
6197                ))
6198            }
6199            // GRANT / REVOKE / ALTER USER (RBAC milestone).
6200            //
6201            // These hit the AuthStore directly. The statement frame /
6202            // privilege gate has already decided whether the caller may
6203            // even run the statement; here we just translate the AST into
6204            // AuthStore calls.
6205            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
6206            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
6207            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
6208            QueryExpr::CreateUser(ref u) => self.execute_create_user_statement(query, u),
6209            QueryExpr::CreateIamPolicy { ref id, ref json } => {
6210                self.execute_create_iam_policy(query, id, json)
6211            }
6212            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
6213            QueryExpr::AttachPolicy {
6214                ref policy_id,
6215                ref principal,
6216            } => self.execute_attach_policy(query, policy_id, principal),
6217            QueryExpr::DetachPolicy {
6218                ref policy_id,
6219                ref principal,
6220            } => self.execute_detach_policy(query, policy_id, principal),
6221            QueryExpr::ShowPolicies { ref filter } => {
6222                self.execute_show_policies(query, filter.as_ref())
6223            }
6224            QueryExpr::ShowEffectivePermissions {
6225                ref user,
6226                ref resource,
6227            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
6228            QueryExpr::SimulatePolicy {
6229                ref user,
6230                ref action,
6231                ref resource,
6232            } => self.execute_simulate_policy(query, user, action, resource),
6233            QueryExpr::LintPolicy { ref source } => self.execute_lint_policy(query, source),
6234            QueryExpr::MigratePolicyMode {
6235                ref target,
6236                dry_run,
6237            } => self.execute_migrate_policy_mode(query, target, dry_run),
6238            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
6239            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
6240            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
6241            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
6242        };
6243
6244        if !control_event_specs.is_empty() {
6245            let (outcome, reason) = match &query_result {
6246                Ok(_) => (crate::runtime::control_events::Outcome::Allowed, None),
6247                Err(err) => (control_event_outcome_for_error(err), Some(err.to_string())),
6248            };
6249            for spec in &control_event_specs {
6250                self.emit_control_event(
6251                    spec.kind,
6252                    outcome,
6253                    spec.action,
6254                    spec.resource.clone(),
6255                    reason.clone(),
6256                    spec.fields.clone(),
6257                )?;
6258            }
6259        }
6260
6261        if let (Some(plan), Ok(result)) = (&query_audit_plan, &query_result) {
6262            self.emit_query_audit(
6263                query,
6264                plan,
6265                query_audit_started.elapsed().as_millis() as u64,
6266                result,
6267            );
6268        }
6269
6270        // Decrypt Value::Secret columns in-place before caching, so
6271        // cached results match the post-decrypt shape and repeat
6272        // queries skip the per-row AES-GCM pass.
6273        let mut query_result = query_result;
6274        if let Ok(ref mut result) = query_result {
6275            if result.statement_type == "select" {
6276                self.apply_secret_decryption(result);
6277            }
6278        }
6279
6280        // Cache SELECT results for 30s.
6281        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
6282        // Large multi-row results (range scans, filtered scans) are rarely
6283        // repeated with the same literal values so the cache hit rate is near
6284        // zero while the clone cost (100 records × ~16 fields each) is high.
6285        // Aggregations (1 row) and point lookups (1 row) still benefit.
6286        if let Ok(ref result) = query_result {
6287            frame.write_result_cache(self, result, result_cache_scopes);
6288        }
6289
6290        query_result
6291    }
6292
6293    /// Snapshot of every registered materialized view's runtime
6294    /// state — feeds the `red.materialized_views` virtual table.
6295    /// Issue #583 slice 10.
6296    pub fn materialized_view_metadata(
6297        &self,
6298    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
6299        // Issue #595 slice 9c — `current_row_count` is now scraped
6300        // live from the backing collection rather than read from the
6301        // cache slot. Mirrors the slice-10 invariant on
6302        // `queue_pending_gauge` in #527: the live store is the source
6303        // of truth, the cache slot only carries last-refresh telemetry
6304        // (timing, error, refresh cadence).
6305        let store = self.inner.db.store();
6306        let mut entries = self.inner.materialized_views.read().metadata();
6307        for entry in &mut entries {
6308            if let Some(manager) = store.get_collection(&entry.name) {
6309                entry.current_row_count = manager.count() as u64;
6310            }
6311        }
6312        entries
6313    }
6314
6315    /// Drive scheduled refreshes for materialized views with a
6316    /// `REFRESH EVERY <duration>` clause. Called from the background
6317    /// scheduler thread (and from unit tests with a fake clock via
6318    /// `claim_due_at`). Each invocation atomically claims the set of
6319    /// due views (so two concurrent ticks never double-fire the same
6320    /// view) and runs each refresh through the standard execution
6321    /// path — failures are captured in `last_error` and the prior
6322    /// content stays intact. Issue #583 slice 10.
6323    /// Snapshot of every tracked retention sweeper state — feeds the
6324    /// three extra columns on `red.retention`. Issue #584 slice 12.
6325    pub(crate) fn retention_sweeper_snapshot(
6326        &self,
6327    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
6328        self.inner.retention_sweeper.read().snapshot()
6329    }
6330
6331    /// Drive one tick of the retention sweeper. Iterates collections
6332    /// with a retention policy set, physically deletes at most
6333    /// `batch_size` expired rows per collection, and records the
6334    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
6335    /// `red.retention` exposes. Called from the background sweeper
6336    /// thread; safe to invoke directly from tests with a small batch
6337    /// size to drain rows deterministically. Issue #584 slice 12.
6338    ///
6339    /// Deletes are issued as `DELETE FROM <collection> WHERE
6340    /// <ts_column> < <cutoff>` through the standard `execute_query`
6341    /// chokepoint so WAL participation and snapshot guards apply
6342    /// exactly as for a user-issued DELETE — replicas replay the
6343    /// sweeper's deletes via the same WAL stream with no special
6344    /// handling on the replication side.
6345    ///
6346    /// Batching is enforced by tightening the cutoff: if more than
6347    /// `batch_size` rows are expired, the cutoff is dropped to the
6348    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
6349    /// matches roughly `batch_size` rows; the remainder is reported
6350    /// as `current_rows_pending_sweep_estimate` and drained on the
6351    /// next tick.
6352    pub fn sweep_retention_tick(&self, batch_size: usize) {
6353        if batch_size == 0 {
6354            return;
6355        }
6356        let now_ms = std::time::SystemTime::now()
6357            .duration_since(std::time::UNIX_EPOCH)
6358            .map(|d| d.as_millis() as u64)
6359            .unwrap_or(0);
6360
6361        let store = self.inner.db.store();
6362        let collections = store.list_collections();
6363        for name in collections {
6364            let Some(contract) = self.inner.db.collection_contract(&name) else {
6365                continue;
6366            };
6367            let Some(retention_ms) = contract.retention_duration_ms else {
6368                continue;
6369            };
6370            let Some(ts_column) =
6371                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
6372            else {
6373                continue;
6374            };
6375            let Some(manager) = store.get_collection(&name) else {
6376                continue;
6377            };
6378            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
6379
6380            // Single pass: collect expired timestamps. We keep the
6381            // full Vec rather than a bounded heap because the partial
6382            // sort below is the simplest correct way to find the
6383            // batch-th oldest; for the slice's "1000-row default
6384            // batch" target this is bounded enough for production
6385            // operation, and the alternative (in-place heap of size
6386            // batch+1) is a follow-up optimisation.
6387            let mut expired_ts: Vec<i64> = Vec::new();
6388            manager.for_each_entity(|entity| {
6389                let ts = match ts_column.as_str() {
6390                    "created_at" => Some(entity.created_at as i64),
6391                    "updated_at" => Some(entity.updated_at as i64),
6392                    other => entity
6393                        .data
6394                        .as_row()
6395                        .and_then(|row| row.get_field(other))
6396                        .and_then(|v| match v {
6397                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
6398                            crate::storage::schema::Value::Timestamp(t) => {
6399                                Some(t.saturating_mul(1_000))
6400                            }
6401                            crate::storage::schema::Value::BigInt(t) => Some(*t),
6402                            crate::storage::schema::Value::UnsignedInteger(t) => {
6403                                i64::try_from(*t).ok()
6404                            }
6405                            crate::storage::schema::Value::Integer(t) => Some(*t),
6406                            _ => None,
6407                        }),
6408                };
6409                if let Some(t) = ts {
6410                    if t < cutoff {
6411                        expired_ts.push(t);
6412                    }
6413                }
6414                true
6415            });
6416
6417            let total_expired = expired_ts.len() as u64;
6418            if total_expired == 0 {
6419                self.inner
6420                    .retention_sweeper
6421                    .write()
6422                    .record_tick(&name, 0, 0, now_ms);
6423                continue;
6424            }
6425
6426            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
6427                (cutoff, 0u64)
6428            } else {
6429                // Tighten the cutoff to the (batch_size)-th oldest
6430                // expired timestamp + 1 so DELETE matches roughly
6431                // `batch_size` rows.
6432                expired_ts.sort_unstable();
6433                let nth = expired_ts[batch_size - 1];
6434                (
6435                    nth.saturating_add(1),
6436                    total_expired.saturating_sub(batch_size as u64),
6437                )
6438            };
6439
6440            let stmt = format!(
6441                "DELETE FROM {} WHERE {} < {}",
6442                name, ts_column, effective_cutoff
6443            );
6444            let deleted = match self.execute_query(&stmt) {
6445                Ok(r) => r.affected_rows,
6446                Err(_) => 0,
6447            };
6448
6449            self.inner
6450                .retention_sweeper
6451                .write()
6452                .record_tick(&name, deleted, pending, now_ms);
6453        }
6454    }
6455
6456    pub fn refresh_due_materialized_views(&self) {
6457        let due = {
6458            let mut cache = self.inner.materialized_views.write();
6459            cache.claim_due_at(std::time::Instant::now())
6460        };
6461        for name in due {
6462            // Round-trip through `execute_query` (rather than the
6463            // prepared-statement `execute_query_expr` fast path, which
6464            // explicitly rejects DDL/maintenance statements). Failures
6465            // are captured inside the RefreshMaterializedView handler
6466            // via `record_refresh_failure`; the scheduler ignores the
6467            // Result so one bad view doesn't halt the loop.
6468            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
6469            let _ = self.execute_query(&stmt);
6470        }
6471    }
6472
6473    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
6474    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
6475    /// calls pay zero parse + cache overhead.
6476    ///
6477    /// Applies secret decryption on SELECT results, identical to `execute_query`.
6478    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
6479        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
6480        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
6481        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
6482        // whose `tq.table` matches a registered view with the view's
6483        // underlying query. Safe to call even when no views are registered.
6484        let expr = self.rewrite_view_refs(expr);
6485
6486        self.validate_model_operations_before_auth(&expr)?;
6487        // Granular RBAC privilege check. Runs before dispatch so a
6488        // denied caller never reaches storage. Fail-closed: any error
6489        // resolving the action / resource produces PermissionDenied.
6490        if let Err(err) = self.check_query_privilege(&expr) {
6491            return Err(RedDBError::Query(format!("permission denied: {err}")));
6492        }
6493
6494        let statement = query_expr_name(&expr);
6495        let mode = detect_mode(statement);
6496        let query_str = statement;
6497
6498        let result = self.dispatch_expr(expr, query_str, mode)?;
6499        let mut r = result;
6500        if r.statement_type == "select" {
6501            self.apply_secret_decryption(&mut r);
6502        }
6503        Ok(r)
6504    }
6505
6506    pub(super) fn validate_model_operations_before_auth(
6507        &self,
6508        expr: &QueryExpr,
6509    ) -> RedDBResult<()> {
6510        use crate::catalog::CollectionModel;
6511        use crate::runtime::ddl::polymorphic_resolver;
6512        use crate::storage::query::ast::KvCommand;
6513
6514        let system_schema_target = match expr {
6515            QueryExpr::DropTable(q) => Some(q.name.as_str()),
6516            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
6517            QueryExpr::DropVector(q) => Some(q.name.as_str()),
6518            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
6519            QueryExpr::DropKv(q) => Some(q.name.as_str()),
6520            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
6521            QueryExpr::Truncate(q) => Some(q.name.as_str()),
6522            _ => None,
6523        };
6524        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
6525            return Err(RedDBError::Query("system schema is read-only".to_string()));
6526        }
6527
6528        let expected = match expr {
6529            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
6530            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
6531            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
6532            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
6533            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
6534            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
6535            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
6536            QueryExpr::KvCommand(cmd) => {
6537                let (collection, model) = match cmd {
6538                    KvCommand::Put {
6539                        collection, model, ..
6540                    }
6541                    | KvCommand::Get {
6542                        collection, model, ..
6543                    }
6544                    | KvCommand::Incr {
6545                        collection, model, ..
6546                    }
6547                    | KvCommand::Cas {
6548                        collection, model, ..
6549                    }
6550                    | KvCommand::List {
6551                        collection, model, ..
6552                    }
6553                    | KvCommand::Delete {
6554                        collection, model, ..
6555                    } => (collection.as_str(), *model),
6556                    KvCommand::Rotate { collection, .. }
6557                    | KvCommand::History { collection, .. }
6558                    | KvCommand::Purge { collection, .. } => {
6559                        (collection.as_str(), CollectionModel::Vault)
6560                    }
6561                    KvCommand::InvalidateTags { collection, .. } => {
6562                        (collection.as_str(), CollectionModel::Kv)
6563                    }
6564                    KvCommand::Watch {
6565                        collection, model, ..
6566                    } => (collection.as_str(), *model),
6567                    KvCommand::Unseal { collection, .. } => {
6568                        (collection.as_str(), CollectionModel::Vault)
6569                    }
6570                };
6571                Some((collection, model))
6572            }
6573            QueryExpr::ConfigCommand(cmd) => {
6574                self.validate_config_command_before_auth(cmd)?;
6575                None
6576            }
6577            _ => None,
6578        };
6579
6580        let Some((name, expected_model)) = expected else {
6581            return Ok(());
6582        };
6583        let snapshot = self.inner.db.catalog_model_snapshot();
6584        let Some(actual_model) = snapshot
6585            .collections
6586            .iter()
6587            .find(|collection| collection.name == name)
6588            .map(|collection| collection.declared_model.unwrap_or(collection.model))
6589        else {
6590            return Ok(());
6591        };
6592        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
6593    }
6594
6595    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
6596    /// `tq.table` matches a registered view name with the view's stored
6597    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
6598    /// resolves correctly. Pure operation — no side effects.
6599    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
6600        // Fast path: no views registered → return original expression.
6601        if self.inner.views.read().is_empty() {
6602            return expr;
6603        }
6604        self.rewrite_view_refs_inner(expr)
6605    }
6606
6607    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
6608        use crate::storage::query::ast::{Filter, TableSource};
6609        match expr {
6610            QueryExpr::Table(mut tq) => {
6611                // 1. If the TableSource is a subquery, recurse into it so
6612                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
6613                //    The legacy `table` field (set to a synthetic
6614                //    "__subq_NNNN" sentinel) stays as-is so callers that
6615                //    read it keep compiling.
6616                if let Some(TableSource::Subquery(body)) = tq.source.take() {
6617                    tq.source = Some(TableSource::Subquery(Box::new(
6618                        self.rewrite_view_refs_inner(*body),
6619                    )));
6620                    return QueryExpr::Table(tq);
6621                }
6622
6623                // 2. Restore the source field (took it above for match).
6624                // When the source was `None` or `TableSource::Name(_)`, the
6625                // real lookup key is `tq.table` — check the view registry.
6626                let maybe_view = {
6627                    let views = self.inner.views.read();
6628                    views.get(&tq.table).cloned()
6629                };
6630                let Some(view) = maybe_view else {
6631                    return QueryExpr::Table(tq);
6632                };
6633
6634                // Issue #594 slice 9b — materialized views are read
6635                // from their backing collection, not by substituting
6636                // the body. Returning the TableQuery as-is lets the
6637                // normal table-read path resolve `SELECT FROM v`
6638                // against the collection provisioned at CREATE time.
6639                if view.materialized {
6640                    return QueryExpr::Table(tq);
6641                }
6642
6643                // Recurse into the view body — views may reference other
6644                // views. The recursion yields the final QueryExpr we need
6645                // to merge the outer's filter / limit / offset into.
6646                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
6647
6648                // Phase 5: when the body is a Table we merge the outer
6649                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
6650                // views filter recursively. Non-table bodies (Search,
6651                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
6652                // with an outer Table query today — return the body
6653                // verbatim; outer predicates are lost. Full projection
6654                // merge lands in Phase 5.2.
6655                match inner_expr {
6656                    QueryExpr::Table(mut inner_tq) => {
6657                        if let Some(outer_filter) = tq.filter.take() {
6658                            inner_tq.filter = Some(match inner_tq.filter.take() {
6659                                Some(existing) => {
6660                                    Filter::And(Box::new(existing), Box::new(outer_filter))
6661                                }
6662                                None => outer_filter,
6663                            });
6664                            // Keep the `Expr` form in lock-step with the
6665                            // merged `Filter`. The executor prefers
6666                            // `where_expr` and nulls `filter` when it is
6667                            // present (see `execute_query_inner`), so a
6668                            // stacked view whose outer predicate was only
6669                            // merged into `filter` would silently drop that
6670                            // predicate at eval time (#635).
6671                            inner_tq.where_expr = inner_tq
6672                                .filter
6673                                .as_ref()
6674                                .map(crate::storage::query::sql_lowering::filter_to_expr);
6675                        }
6676                        if let Some(outer_limit) = tq.limit {
6677                            inner_tq.limit = Some(match inner_tq.limit {
6678                                Some(existing) => existing.min(outer_limit),
6679                                None => outer_limit,
6680                            });
6681                        }
6682                        if let Some(outer_offset) = tq.offset {
6683                            inner_tq.offset = Some(match inner_tq.offset {
6684                                Some(existing) => existing + outer_offset,
6685                                None => outer_offset,
6686                            });
6687                        }
6688                        QueryExpr::Table(inner_tq)
6689                    }
6690                    other => other,
6691                }
6692            }
6693            QueryExpr::Join(mut jq) => {
6694                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
6695                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
6696                QueryExpr::Join(jq)
6697            }
6698            // Other variants don't carry nested QueryExpr that can reference
6699            // a view by table name. Return as-is.
6700            other => other,
6701        }
6702    }
6703
6704    /// Apply table-level read authorization and RLS rewriting for a
6705    /// relational SELECT leaf.
6706    fn authorize_relational_table_select(
6707        &self,
6708        mut table: TableQuery,
6709        frame: &dyn super::statement_frame::ReadFrame,
6710    ) -> RedDBResult<Option<TableQuery>> {
6711        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6712            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6713            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6714            return Ok(Some(table));
6715        }
6716
6717        self.check_table_column_projection_authz(&table, frame)?;
6718
6719        if self.inner.rls_enabled_tables.read().contains(&table.table) {
6720            return Ok(inject_rls_filters(self, frame, table));
6721        }
6722
6723        Ok(Some(table))
6724    }
6725
6726    fn authorize_relational_join_select(
6727        &self,
6728        mut join: JoinQuery,
6729        frame: &dyn super::statement_frame::ReadFrame,
6730    ) -> RedDBResult<Option<JoinQuery>> {
6731        self.check_join_column_projection_authz(&join, frame)?;
6732        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
6733        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
6734        Ok(inject_rls_into_join(self, frame, join))
6735    }
6736
6737    fn authorize_relational_join_child(
6738        &self,
6739        expr: QueryExpr,
6740        frame: &dyn super::statement_frame::ReadFrame,
6741    ) -> RedDBResult<QueryExpr> {
6742        match expr {
6743            QueryExpr::Table(mut table) => {
6744                if let Some(TableSource::Subquery(inner)) = table.source.take() {
6745                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6746                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6747                }
6748                Ok(QueryExpr::Table(table))
6749            }
6750            QueryExpr::Join(join) => self
6751                .authorize_relational_join_select(join, frame)?
6752                .map(QueryExpr::Join)
6753                .ok_or_else(|| {
6754                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6755                }),
6756            other => Ok(other),
6757        }
6758    }
6759
6760    fn authorize_relational_select_expr(
6761        &self,
6762        expr: QueryExpr,
6763        frame: &dyn super::statement_frame::ReadFrame,
6764    ) -> RedDBResult<QueryExpr> {
6765        match expr {
6766            QueryExpr::Table(table) => self
6767                .authorize_relational_table_select(table, frame)?
6768                .map(QueryExpr::Table)
6769                .ok_or_else(|| {
6770                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6771                }),
6772            QueryExpr::Join(join) => self
6773                .authorize_relational_join_select(join, frame)?
6774                .map(QueryExpr::Join)
6775                .ok_or_else(|| {
6776                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6777                }),
6778            other => Ok(other),
6779        }
6780    }
6781
6782    fn check_table_column_projection_authz(
6783        &self,
6784        table: &TableQuery,
6785        frame: &dyn super::statement_frame::ReadFrame,
6786    ) -> RedDBResult<()> {
6787        let Some((username, role)) = frame.identity() else {
6788            return Ok(());
6789        };
6790        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6791            return Ok(());
6792        };
6793
6794        let columns = self.resolved_table_projection_columns(table)?;
6795        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6796        let principal = UserId::from_parts(frame.effective_scope(), username);
6797        let ctx = runtime_iam_context(role, frame.effective_scope());
6798        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6799        if outcome.allowed() {
6800            return Ok(());
6801        }
6802
6803        if let Some(denied) = outcome.first_denied_column() {
6804            return Err(RedDBError::Query(format!(
6805                "permission denied: principal=`{username}` cannot select column `{}`",
6806                denied.resource.name
6807            )));
6808        }
6809        Err(RedDBError::Query(format!(
6810            "permission denied: principal=`{username}` cannot select table `{}`",
6811            table.table
6812        )))
6813    }
6814
6815    fn check_join_column_projection_authz(
6816        &self,
6817        join: &JoinQuery,
6818        frame: &dyn super::statement_frame::ReadFrame,
6819    ) -> RedDBResult<()> {
6820        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6821        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6822        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6823
6824        for (table, columns) in by_table {
6825            let query = TableQuery {
6826                table,
6827                source: None,
6828                alias: None,
6829                select_items: Vec::new(),
6830                columns: columns.into_iter().map(Projection::Column).collect(),
6831                where_expr: None,
6832                filter: None,
6833                group_by_exprs: Vec::new(),
6834                group_by: Vec::new(),
6835                having_expr: None,
6836                having: None,
6837                order_by: Vec::new(),
6838                limit: None,
6839                limit_param: None,
6840                offset: None,
6841                offset_param: None,
6842                expand: None,
6843                as_of: None,
6844                sessionize: None,
6845                distinct: false,
6846            };
6847            self.check_table_column_projection_authz(&query, frame)?;
6848        }
6849        Ok(())
6850    }
6851
6852    fn collect_join_projection_columns(
6853        &self,
6854        join: &JoinQuery,
6855        projections: &[Projection],
6856        out: &mut HashMap<String, BTreeSet<String>>,
6857    ) -> RedDBResult<()> {
6858        let left = table_side_context(join.left.as_ref());
6859        let right = table_side_context(join.right.as_ref());
6860
6861        if projections
6862            .iter()
6863            .any(|projection| matches!(projection, Projection::All))
6864        {
6865            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6866                out.entry(side.table.clone())
6867                    .or_default()
6868                    .extend(self.table_all_projection_columns(&side.table)?);
6869            }
6870            return Ok(());
6871        }
6872
6873        for projection in projections {
6874            collect_projection_columns_for_join_side(
6875                projection,
6876                left.as_ref(),
6877                right.as_ref(),
6878                out,
6879            )?;
6880        }
6881        Ok(())
6882    }
6883
6884    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6885        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6886        if projections
6887            .iter()
6888            .any(|projection| matches!(projection, Projection::All))
6889        {
6890            return self.table_all_projection_columns(&table.table);
6891        }
6892
6893        let mut columns = BTreeSet::new();
6894        for projection in &projections {
6895            collect_projection_columns_for_table(
6896                projection,
6897                &table.table,
6898                table.alias.as_deref(),
6899                &mut columns,
6900            );
6901        }
6902        Ok(columns.into_iter().collect())
6903    }
6904
6905    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
6906        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
6907            let columns: Vec<String> = contract
6908                .declared_columns
6909                .iter()
6910                .map(|column| column.name.clone())
6911                .collect();
6912            if !columns.is_empty() {
6913                return Ok(columns);
6914            }
6915        }
6916
6917        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
6918        Ok(records
6919            .first()
6920            .map(|record| {
6921                record
6922                    .column_names()
6923                    .into_iter()
6924                    .map(|column| column.to_string())
6925                    .collect()
6926            })
6927            .unwrap_or_default())
6928    }
6929
6930    fn resolve_table_expr_subqueries(
6931        &self,
6932        mut table: TableQuery,
6933        frame: &dyn super::statement_frame::ReadFrame,
6934    ) -> RedDBResult<TableQuery> {
6935        // Only a `Subquery` source needs recursive resolution. `.take()`
6936        // would otherwise drop a `Name` / `Function` source on the floor
6937        // (the `if let` skips the body but the take already cleared it),
6938        // which silently broke `SELECT * FROM components(g)` — the TVF
6939        // dispatch downstream keys off `TableSource::Function` and never
6940        // fired. Restore any non-subquery source unchanged (issue #795).
6941        match table.source.take() {
6942            Some(TableSource::Subquery(inner)) => {
6943                let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
6944                table.source = Some(TableSource::Subquery(Box::new(inner)));
6945            }
6946            other => table.source = other,
6947        }
6948
6949        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
6950        for item in &mut table.select_items {
6951            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
6952                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6953            }
6954        }
6955        if let Some(where_expr) = table.where_expr.take() {
6956            table.where_expr =
6957                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
6958            table.filter = None;
6959        }
6960        if let Some(having_expr) = table.having_expr.take() {
6961            table.having_expr =
6962                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
6963            table.having = None;
6964        }
6965        for expr in &mut table.group_by_exprs {
6966            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6967        }
6968        for clause in &mut table.order_by {
6969            if let Some(expr) = clause.expr.take() {
6970                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
6971            }
6972        }
6973        Ok(table)
6974    }
6975
6976    fn resolve_select_expr_subqueries(
6977        &self,
6978        expr: QueryExpr,
6979        frame: &dyn super::statement_frame::ReadFrame,
6980    ) -> RedDBResult<QueryExpr> {
6981        match expr {
6982            QueryExpr::Table(table) => self
6983                .resolve_table_expr_subqueries(table, frame)
6984                .map(QueryExpr::Table),
6985            QueryExpr::Join(mut join) => {
6986                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
6987                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
6988                Ok(QueryExpr::Join(join))
6989            }
6990            other => Ok(other),
6991        }
6992    }
6993
6994    fn resolve_expr_subqueries(
6995        &self,
6996        expr: crate::storage::query::ast::Expr,
6997        outer_scopes: &[String],
6998        frame: &dyn super::statement_frame::ReadFrame,
6999    ) -> RedDBResult<crate::storage::query::ast::Expr> {
7000        use crate::storage::query::ast::Expr;
7001
7002        match expr {
7003            Expr::Subquery { query, span } => {
7004                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
7005                if values.len() > 1 {
7006                    return Err(RedDBError::Query(
7007                        "scalar subquery returned more than one row".to_string(),
7008                    ));
7009                }
7010                Ok(Expr::Literal {
7011                    value: values.into_iter().next().unwrap_or(Value::Null),
7012                    span,
7013                })
7014            }
7015            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
7016                op,
7017                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
7018                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
7019                span,
7020            }),
7021            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
7022                op,
7023                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
7024                span,
7025            }),
7026            Expr::Cast {
7027                inner,
7028                target,
7029                span,
7030            } => Ok(Expr::Cast {
7031                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
7032                target,
7033                span,
7034            }),
7035            Expr::FunctionCall { name, args, span } => {
7036                let args = args
7037                    .into_iter()
7038                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
7039                    .collect::<RedDBResult<Vec<_>>>()?;
7040                Ok(Expr::FunctionCall { name, args, span })
7041            }
7042            Expr::Case {
7043                branches,
7044                else_,
7045                span,
7046            } => {
7047                let branches = branches
7048                    .into_iter()
7049                    .map(|(cond, value)| {
7050                        Ok((
7051                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
7052                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
7053                        ))
7054                    })
7055                    .collect::<RedDBResult<Vec<_>>>()?;
7056                let else_ = else_
7057                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
7058                    .transpose()?
7059                    .map(Box::new);
7060                Ok(Expr::Case {
7061                    branches,
7062                    else_,
7063                    span,
7064                })
7065            }
7066            Expr::IsNull {
7067                operand,
7068                negated,
7069                span,
7070            } => Ok(Expr::IsNull {
7071                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
7072                negated,
7073                span,
7074            }),
7075            Expr::InList {
7076                target,
7077                values,
7078                negated,
7079                span,
7080            } => {
7081                let target =
7082                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
7083                let mut resolved = Vec::new();
7084                for value in values {
7085                    if let Expr::Subquery { query, .. } = value {
7086                        resolved.extend(
7087                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
7088                                .into_iter()
7089                                .map(Expr::lit),
7090                        );
7091                    } else {
7092                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
7093                    }
7094                }
7095                Ok(Expr::InList {
7096                    target,
7097                    values: resolved,
7098                    negated,
7099                    span,
7100                })
7101            }
7102            Expr::Between {
7103                target,
7104                low,
7105                high,
7106                negated,
7107                span,
7108            } => Ok(Expr::Between {
7109                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
7110                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
7111                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
7112                negated,
7113                span,
7114            }),
7115            other => Ok(other),
7116        }
7117    }
7118
7119    fn execute_expr_subquery_values(
7120        &self,
7121        subquery: crate::storage::query::ast::ExprSubquery,
7122        outer_scopes: &[String],
7123        frame: &dyn super::statement_frame::ReadFrame,
7124    ) -> RedDBResult<Vec<Value>> {
7125        let query = *subquery.query;
7126        if query_references_outer_scope(&query, outer_scopes) {
7127            return Err(RedDBError::Query(
7128                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
7129            ));
7130        }
7131        let query = self.rewrite_view_refs(query);
7132        let query = self.resolve_select_expr_subqueries(query, frame)?;
7133        let query = self.authorize_relational_select_expr(query, frame)?;
7134        let result = match query {
7135            QueryExpr::Table(table) => {
7136                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
7137            }
7138            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
7139            other => {
7140                return Err(RedDBError::Query(format!(
7141                    "expression subquery must be a SELECT query, got {}",
7142                    query_expr_name(&other)
7143                )))
7144            }
7145        };
7146        first_column_values(result)
7147    }
7148
7149    fn dispatch_expr(
7150        &self,
7151        expr: QueryExpr,
7152        query_str: &str,
7153        mode: QueryMode,
7154    ) -> RedDBResult<RuntimeQueryResult> {
7155        let statement = query_expr_name(&expr);
7156        match expr {
7157            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
7158                // Graph queries are not cacheable as prepared statements.
7159                Err(RedDBError::Query(
7160                    "graph queries cannot be used as prepared statements".to_string(),
7161                ))
7162            }
7163            QueryExpr::Table(table) => {
7164                let scope = self.ai_scope();
7165                let table = self.resolve_table_expr_subqueries(
7166                    table,
7167                    &scope as &dyn super::statement_frame::ReadFrame,
7168                )?;
7169                // Table-valued functions (e.g. components(g)) dispatch to a
7170                // read-only executor before any catalog/virtual-table routing
7171                // (issue #795).
7172                if let Some(TableSource::Function {
7173                    name,
7174                    args,
7175                    named_args,
7176                }) = table.source.clone()
7177                {
7178                    return Ok(RuntimeQueryResult {
7179                        query: query_str.to_string(),
7180                        mode,
7181                        statement,
7182                        engine: "runtime-graph-tvf",
7183                        result: self.execute_table_function(&name, &args, &named_args)?,
7184                        affected_rows: 0,
7185                        statement_type: "select",
7186                        bookmark: None,
7187                    });
7188                }
7189                // Inline-graph TVF (issue #799) on the prepared-statement /
7190                // direct-expr path. Result caching is wired on the
7191                // `execute_query_inner` path; here we just compute and return.
7192                if let Some(TableSource::InlineGraphFunction {
7193                    name,
7194                    nodes,
7195                    edges,
7196                    named_args,
7197                }) = table.source.clone()
7198                {
7199                    return Ok(RuntimeQueryResult {
7200                        query: query_str.to_string(),
7201                        mode,
7202                        statement,
7203                        engine: "runtime-graph-tvf-inline",
7204                        result: self.execute_inline_graph_function(
7205                            &name,
7206                            &nodes,
7207                            &edges,
7208                            &named_args,
7209                        )?,
7210                        affected_rows: 0,
7211                        statement_type: "select",
7212                        bookmark: None,
7213                    });
7214                }
7215                if super::red_schema::is_virtual_table(&table.table) {
7216                    return Ok(RuntimeQueryResult {
7217                        query: query_str.to_string(),
7218                        mode,
7219                        statement,
7220                        engine: "runtime-red-schema",
7221                        result: super::red_schema::red_query(
7222                            self,
7223                            &table.table,
7224                            &table,
7225                            &scope as &dyn super::statement_frame::ReadFrame,
7226                        )?,
7227                        affected_rows: 0,
7228                        statement_type: "select",
7229                        bookmark: None,
7230                    });
7231                }
7232                // `<graph>.<output>` analytics virtual view (issue #800).
7233                if let Some(view_result) = self.try_resolve_analytics_view(
7234                    &table,
7235                    &scope as &dyn super::statement_frame::ReadFrame,
7236                )? {
7237                    return Ok(RuntimeQueryResult {
7238                        query: query_str.to_string(),
7239                        mode,
7240                        statement,
7241                        engine: "runtime-graph-analytics-view",
7242                        result: view_result,
7243                        affected_rows: 0,
7244                        statement_type: "select",
7245                        bookmark: None,
7246                    });
7247                }
7248                let Some(table_with_rls) = self.authorize_relational_table_select(
7249                    table,
7250                    &scope as &dyn super::statement_frame::ReadFrame,
7251                )?
7252                else {
7253                    return Ok(RuntimeQueryResult {
7254                        query: query_str.to_string(),
7255                        mode,
7256                        statement,
7257                        engine: "runtime-table-rls",
7258                        result: crate::storage::query::unified::UnifiedResult::empty(),
7259                        affected_rows: 0,
7260                        statement_type: "select",
7261                        bookmark: None,
7262                    });
7263                };
7264                Ok(RuntimeQueryResult {
7265                    query: query_str.to_string(),
7266                    mode,
7267                    statement,
7268                    engine: "runtime-table",
7269                    result: execute_runtime_table_query(
7270                        &self.inner.db,
7271                        &table_with_rls,
7272                        Some(&self.inner.index_store),
7273                    )?,
7274                    affected_rows: 0,
7275                    statement_type: "select",
7276                    bookmark: None,
7277                })
7278            }
7279            QueryExpr::Join(join) => {
7280                let scope = self.ai_scope();
7281                let Some(join_with_rls) = self.authorize_relational_join_select(
7282                    join,
7283                    &scope as &dyn super::statement_frame::ReadFrame,
7284                )?
7285                else {
7286                    return Ok(RuntimeQueryResult {
7287                        query: query_str.to_string(),
7288                        mode,
7289                        statement,
7290                        engine: "runtime-join-rls",
7291                        result: crate::storage::query::unified::UnifiedResult::empty(),
7292                        affected_rows: 0,
7293                        statement_type: "select",
7294                        bookmark: None,
7295                    });
7296                };
7297                Ok(RuntimeQueryResult {
7298                    query: query_str.to_string(),
7299                    mode,
7300                    statement,
7301                    engine: "runtime-join",
7302                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
7303                    affected_rows: 0,
7304                    statement_type: "select",
7305                    bookmark: None,
7306                })
7307            }
7308            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
7309                query: query_str.to_string(),
7310                mode,
7311                statement,
7312                engine: "runtime-vector",
7313                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
7314                affected_rows: 0,
7315                statement_type: "select",
7316                bookmark: None,
7317            }),
7318            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
7319                query: query_str.to_string(),
7320                mode,
7321                statement,
7322                engine: "runtime-hybrid",
7323                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
7324                affected_rows: 0,
7325                statement_type: "select",
7326                bookmark: None,
7327            }),
7328            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
7329                Err(RedDBError::Query(
7330                    super::red_schema::READ_ONLY_ERROR.to_string(),
7331                ))
7332            }
7333            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
7334                Err(RedDBError::Query(
7335                    super::red_schema::READ_ONLY_ERROR.to_string(),
7336                ))
7337            }
7338            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
7339                Err(RedDBError::Query(
7340                    super::red_schema::READ_ONLY_ERROR.to_string(),
7341                ))
7342            }
7343            QueryExpr::Insert(ref insert) => self
7344                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
7345                    self.execute_insert(query_str, insert)
7346                }),
7347            QueryExpr::Update(ref update) => self
7348                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
7349                    self.execute_update(query_str, update)
7350                }),
7351            QueryExpr::Delete(ref delete) => self
7352                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
7353                    self.execute_delete(query_str, delete)
7354                }),
7355            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
7356            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
7357            _ => Err(RedDBError::Query(format!(
7358                "prepared-statement execution does not support {statement} statements"
7359            ))),
7360        }
7361    }
7362
7363    /// Dispatch a graph-collection table-valued function call in FROM
7364    /// position (e.g. `SELECT * FROM components(g)`).
7365    ///
7366    /// Validates the function name and arity here, materializes the whole
7367    /// active graph read-only, then runs the algorithm via the shared
7368    /// `dispatch_graph_algorithm` path. Never mutates the catalog or store.
7369    fn execute_table_function(
7370        &self,
7371        name: &str,
7372        args: &[String],
7373        named_args: &[(String, f64)],
7374    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7375        if !is_graph_tvf_name(name) {
7376            return Err(RedDBError::Query(format!("unknown table function: {name}")));
7377        }
7378        // Every graph-collection TVF takes exactly one graph argument.
7379        if args.len() != 1 {
7380            return Err(RedDBError::Query(format!(
7381                "table function '{name}' takes exactly 1 graph argument, got {}",
7382                args.len()
7383            )));
7384        }
7385
7386        // Read-only materialization of the full active graph. Passing `None`
7387        // for the projection uses the full graph store. Like #795/#796, the
7388        // v0 form runs over the whole graph store regardless of the collection
7389        // argument value. Materialization never mutates any store.
7390        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
7391        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
7392    }
7393
7394    /// Dispatch an inline-graph table-valued function call in FROM position
7395    /// (e.g. `SELECT * FROM components(nodes => (…), edges => (…))`, issue
7396    /// #799).
7397    ///
7398    /// Materializes the two subqueries through the normal read path (so RLS,
7399    /// column authz, and MVCC visibility all apply), constructs the abstract
7400    /// graph — the first column of `nodes` is the node id; the first two-or-
7401    /// three columns of `edges` are `(source, target [, weight])` — then runs
7402    /// the same algorithm path used by the graph-collection form. Read-only.
7403    fn execute_inline_graph_function(
7404        &self,
7405        name: &str,
7406        nodes_query: &QueryExpr,
7407        edges_query: &QueryExpr,
7408        named_args: &[(String, f64)],
7409    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7410        if !is_graph_tvf_name(name) {
7411            return Err(RedDBError::Query(format!("unknown table function: {name}")));
7412        }
7413
7414        let node_result = self.execute_query_expr(nodes_query.clone())?.result;
7415        let nodes = inline_node_ids(name, &node_result)?;
7416
7417        let edge_result = self.execute_query_expr(edges_query.clone())?.result;
7418        let edges = inline_edges(name, &edge_result)?;
7419
7420        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
7421    }
7422
7423    /// Materialize the whole active graph read-only into the abstract
7424    /// `(nodes, edges)` inputs the pure graph algorithms consume.
7425    fn materialize_whole_graph_abstract(
7426        &self,
7427    ) -> RedDBResult<(
7428        Vec<String>,
7429        Vec<(
7430            String,
7431            String,
7432            crate::storage::engine::graph_algorithms::Weight,
7433        )>,
7434    )> {
7435        use crate::storage::engine::graph_algorithms;
7436
7437        let graph = super::graph_dsl::materialize_graph_with_projection(
7438            self.inner.db.store().as_ref(),
7439            None,
7440        )?;
7441        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7442        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7443            .iter_all_edges()
7444            .into_iter()
7445            .map(|e| (e.source_id, e.target_id, e.weight))
7446            .collect();
7447        Ok((nodes, edges))
7448    }
7449
7450    /// Resolve a `<graph>.<output>` analytics virtual view (issue #800).
7451    ///
7452    /// Returns `Ok(None)` when `table` is not an analytics view — either the
7453    /// name is not dotted, a real collection of that exact name exists (a real
7454    /// collection always wins; no shadowing), the suffix is not a recognised
7455    /// analytics output, or the parent is not a graph. Returns `Ok(Some(_))`
7456    /// with the freshly computed result when it does resolve, and an error when
7457    /// the parent graph exists but the output is not enabled, a declared
7458    /// algorithm is unsupported, or the parent collection's policy denies the
7459    /// read.
7460    ///
7461    /// The view is recomputed on every call (no result-cache write) so it
7462    /// always reflects the current graph data, satisfying the on-demand
7463    /// recompute contract for this slice.
7464    fn try_resolve_analytics_view(
7465        &self,
7466        table: &TableQuery,
7467        frame: &dyn super::statement_frame::ReadFrame,
7468    ) -> RedDBResult<Option<crate::storage::query::unified::UnifiedResult>> {
7469        let full = table.table.as_str();
7470        let Some(dot) = full.rfind('.') else {
7471            return Ok(None);
7472        };
7473        // A real collection literally named `g.communities` always wins.
7474        if self.inner.db.store().get_collection(full).is_some() {
7475            return Ok(None);
7476        }
7477        let graph_name = &full[..dot];
7478        let output_name = &full[dot + 1..];
7479        let Some(output) = crate::catalog::AnalyticsOutput::from_str(output_name) else {
7480            return Ok(None);
7481        };
7482
7483        let contracts = self.inner.db.collection_contracts();
7484        let Some(contract) = contracts.iter().find(|c| c.name == graph_name) else {
7485            return Ok(None);
7486        };
7487        if contract.declared_model != crate::catalog::CollectionModel::Graph {
7488            return Ok(None);
7489        }
7490        let Some(view) = contract
7491            .analytics_config
7492            .iter()
7493            .find(|view| view.output == output)
7494        else {
7495            // The parent graph exists but this output was not declared — a
7496            // clear error beats the misleading "collection not found".
7497            return Err(RedDBError::Query(format!(
7498                "analytics output '{output_name}' is not enabled on graph '{graph_name}'; declare it with WITH ANALYTICS (...)"
7499            )));
7500        };
7501
7502        // Policy inheritance (AC5): route through the parent graph collection's
7503        // read authorization. A policy or RLS rule that denies the parent
7504        // denies its analytics views transitively.
7505        let parent_query = TableQuery::new(graph_name);
7506        if self
7507            .authorize_relational_table_select(parent_query, frame)?
7508            .is_none()
7509        {
7510            return Err(RedDBError::Query(format!(
7511                "permission denied: policy on graph '{graph_name}' denies analytics view '{output_name}'"
7512            )));
7513        }
7514
7515        let (algorithm, named_args) = analytics_view_algorithm(graph_name, view)?;
7516        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
7517        let result = self.dispatch_graph_algorithm(&algorithm, nodes, edges, &named_args)?;
7518        Ok(Some(result))
7519    }
7520
7521    /// Shared algorithm dispatch over abstract `(nodes, edges)` inputs.
7522    ///
7523    /// Both the graph-collection form and the inline-graph form route here so
7524    /// named-argument validation and the projected row shape stay identical
7525    /// across the two signatures (issue #799). Projects each algorithm's
7526    /// native output shape.
7527    fn dispatch_graph_algorithm(
7528        &self,
7529        name: &str,
7530        nodes: Vec<String>,
7531        edges: Vec<(
7532            String,
7533            String,
7534            crate::storage::engine::graph_algorithms::Weight,
7535        )>,
7536        named_args: &[(String, f64)],
7537    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7538        use crate::storage::engine::graph_algorithms;
7539        use crate::storage::query::unified::UnifiedResult;
7540        use crate::storage::schema::Value;
7541
7542        if name.eq_ignore_ascii_case("components") {
7543            reject_named_args(name, named_args)?;
7544            let assignment = graph_algorithms::connected_components(&nodes, &edges);
7545            let mut result =
7546                UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
7547            for (node_id, island_id) in assignment {
7548                let mut record = UnifiedRecord::new();
7549                record.set("node_id", Value::text(node_id));
7550                record.set("island_id", Value::Integer(island_id as i64));
7551                result.push(record);
7552            }
7553            return Ok(result);
7554        }
7555
7556        if name.eq_ignore_ascii_case("louvain") {
7557            // The only supported named argument is `resolution` (γ). It
7558            // defaults to 1.0 (classic modularity) and must be a finite,
7559            // strictly positive number — a non-positive (or NaN/inf)
7560            // resolution has no sensible meaning.
7561            let resolution = louvain_resolution(named_args)?;
7562            let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
7563            let mut result =
7564                UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
7565            for (node_id, community_id) in assignment {
7566                let mut record = UnifiedRecord::new();
7567                record.set("node_id", Value::text(node_id));
7568                record.set("community_id", Value::Integer(community_id as i64));
7569                result.push(record);
7570            }
7571            return Ok(result);
7572        }
7573
7574        if name.eq_ignore_ascii_case("degree_centrality") {
7575            reject_named_args(name, named_args)?;
7576            let assignment = abstract_degree_centrality(&nodes, &edges);
7577            let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "degree".into()]);
7578            for (node_id, degree) in assignment {
7579                let mut record = UnifiedRecord::new();
7580                record.set("node_id", Value::text(node_id));
7581                record.set("degree", Value::Integer(degree as i64));
7582                result.push(record);
7583            }
7584            return Ok(result);
7585        }
7586
7587        if name.eq_ignore_ascii_case("shortest_path") {
7588            // Scalar named arguments: `src` and `dst` are required node ids,
7589            // `max_hops` is an optional non-negative edge-count cap. Node ids
7590            // in the graph store are integer entity ids rendered as strings, so
7591            // each id arg must be a non-negative whole number; reject anything
7592            // else (fractional, negative, NaN/inf) with a clear message.
7593            let mut src: Option<String> = None;
7594            let mut dst: Option<String> = None;
7595            let mut max_hops: Option<usize> = None;
7596            let as_node_id = |key: &str, value: f64| -> RedDBResult<String> {
7597                if !value.is_finite() || value < 0.0 || value.fract() != 0.0 {
7598                    return Err(RedDBError::Query(format!(
7599                        "table function 'shortest_path' argument '{key}' must be a non-negative integer node id, got {value}"
7600                    )));
7601                }
7602                Ok((value as i64).to_string())
7603            };
7604            for (key, value) in named_args {
7605                if key.eq_ignore_ascii_case("src") {
7606                    src = Some(as_node_id("src", *value)?);
7607                } else if key.eq_ignore_ascii_case("dst") {
7608                    dst = Some(as_node_id("dst", *value)?);
7609                } else if key.eq_ignore_ascii_case("max_hops") {
7610                    if !value.is_finite() || *value < 0.0 || value.fract() != 0.0 {
7611                        return Err(RedDBError::Query(format!(
7612                            "table function 'shortest_path' max_hops must be a non-negative integer, got {value}"
7613                        )));
7614                    }
7615                    max_hops = Some(*value as usize);
7616                } else {
7617                    return Err(RedDBError::Query(format!(
7618                        "table function 'shortest_path' has no named argument '{key}' (expected 'src', 'dst', 'max_hops')"
7619                    )));
7620                }
7621            }
7622            let src = src.ok_or_else(|| {
7623                RedDBError::Query(
7624                    "table function 'shortest_path' requires named argument 'src'".to_string(),
7625                )
7626            })?;
7627            let dst = dst.ok_or_else(|| {
7628                RedDBError::Query(
7629                    "table function 'shortest_path' requires named argument 'dst'".to_string(),
7630                )
7631            })?;
7632
7633            // Columns are always present; an unreachable pair (within the
7634            // optional `max_hops` budget) simply yields zero rows — never an
7635            // error. `hop` is the 0-based index from the source;
7636            // `cumulative_weight` is the running path weight (0 at the source,
7637            // the total at the destination). Edges are treated as undirected,
7638            // consistent with `components` / `louvain`.
7639            let mut result = UnifiedResult::with_columns(vec![
7640                "hop".into(),
7641                "node_id".into(),
7642                "cumulative_weight".into(),
7643            ]);
7644            if let Some(path) =
7645                graph_algorithms::shortest_path(&nodes, &edges, &src, &dst, max_hops)
7646            {
7647                for (hop, (node_id, cumulative_weight)) in path.into_iter().enumerate() {
7648                    let mut record = UnifiedRecord::new();
7649                    record.set("hop", Value::Integer(hop as i64));
7650                    record.set("node_id", Value::text(node_id));
7651                    record.set("cumulative_weight", Value::Float(cumulative_weight));
7652                    result.push(record);
7653                }
7654            }
7655            return Ok(result);
7656        }
7657        // ── Centrality family (issue #797): each returns rows `(node_id,
7658        // score)` over the abstract `(nodes, edges)` graph. Like the other
7659        // graph TVFs the graph is treated as undirected and scores are
7660        // deterministic; the inline-graph form shares this dispatch. ──
7661        if name.eq_ignore_ascii_case("betweenness") {
7662            reject_named_args(name, named_args)?;
7663            return Ok(Self::centrality_result(graph_algorithms::betweenness(
7664                &nodes, &edges,
7665            )));
7666        }
7667        if name.eq_ignore_ascii_case("eigenvector") {
7668            // Optional `max_iterations` (positive integer, default 100) and
7669            // `tolerance` (finite, strictly positive, default 1e-6).
7670            let mut max_iterations = 100_usize;
7671            let mut tolerance = 1e-6_f64;
7672            for (key, value) in named_args {
7673                if key.eq_ignore_ascii_case("max_iterations") {
7674                    max_iterations = parse_positive_iterations("eigenvector", value)?;
7675                } else if key.eq_ignore_ascii_case("tolerance") {
7676                    if !value.is_finite() || *value <= 0.0 {
7677                        return Err(RedDBError::Query(format!(
7678                            "table function 'eigenvector' tolerance must be > 0, got {value}"
7679                        )));
7680                    }
7681                    tolerance = *value;
7682                } else {
7683                    return Err(RedDBError::Query(format!(
7684                        "table function 'eigenvector' has no named argument '{key}' (expected 'max_iterations' or 'tolerance')"
7685                    )));
7686                }
7687            }
7688            return Ok(Self::centrality_result(graph_algorithms::eigenvector(
7689                &nodes,
7690                &edges,
7691                max_iterations,
7692                tolerance,
7693            )));
7694        }
7695        if name.eq_ignore_ascii_case("pagerank") {
7696            // Optional `damping` (in (0, 1), default 0.85) and `max_iterations`
7697            // (positive integer, default 100).
7698            let mut damping = 0.85_f64;
7699            let mut max_iterations = 100_usize;
7700            for (key, value) in named_args {
7701                if key.eq_ignore_ascii_case("damping") {
7702                    if !value.is_finite() || *value <= 0.0 || *value >= 1.0 {
7703                        return Err(RedDBError::Query(format!(
7704                            "table function 'pagerank' damping must be in (0, 1), got {value}"
7705                        )));
7706                    }
7707                    damping = *value;
7708                } else if key.eq_ignore_ascii_case("max_iterations") {
7709                    max_iterations = parse_positive_iterations("pagerank", value)?;
7710                } else {
7711                    return Err(RedDBError::Query(format!(
7712                        "table function 'pagerank' has no named argument '{key}' (expected 'damping' or 'max_iterations')"
7713                    )));
7714                }
7715            }
7716            return Ok(Self::centrality_result(graph_algorithms::pagerank(
7717                &nodes,
7718                &edges,
7719                damping,
7720                max_iterations,
7721            )));
7722        }
7723        Err(RedDBError::Query(format!("unknown table function: {name}")))
7724    }
7725
7726    /// `components(<graph_collection>)` — returns rows `(node_id, island_id)`.
7727    ///
7728    /// Materializes the active graph (nodes + weighted edges) read-only and
7729    /// runs the pure `graph_algorithms::connected_components`. Edges are
7730    /// treated as undirected; island ids are deterministic (ascending order of
7731    /// each component's smallest node).
7732    fn execute_components_tvf(
7733        &self,
7734        _collection: &str,
7735    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7736        use crate::storage::engine::graph_algorithms;
7737        use crate::storage::query::unified::UnifiedResult;
7738        use crate::storage::schema::Value;
7739
7740        // Read-only materialization of the full active graph. The named
7741        // collection identifies the active graph scope; passing `None` for the
7742        // projection uses the full graph store (the same result
7743        // `active_graph_projection` yields when no projection is registered).
7744        // Materialization never mutates any store.
7745        let graph = super::graph_dsl::materialize_graph_with_projection(
7746            self.inner.db.store().as_ref(),
7747            None,
7748        )?;
7749
7750        // Materialize abstract inputs for the pure algorithm.
7751        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7752        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7753            .iter_all_edges()
7754            .into_iter()
7755            .map(|e| (e.source_id, e.target_id, e.weight))
7756            .collect();
7757
7758        let assignment = graph_algorithms::connected_components(&nodes, &edges);
7759
7760        // Project into a UnifiedResult with columns ["node_id", "island_id"].
7761        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
7762        for (node_id, island_id) in assignment {
7763            let mut record = UnifiedRecord::new();
7764            record.set("node_id", Value::text(node_id));
7765            record.set("island_id", Value::Integer(island_id as i64));
7766            result.push(record);
7767        }
7768        Ok(result)
7769    }
7770
7771    /// `louvain(<graph> [, resolution => <f64>])` — returns rows
7772    /// `(node_id, community_id)` (issue #796).
7773    ///
7774    /// Materializes the active graph (nodes + weighted edges) read-only and
7775    /// runs the pure, deterministic `graph_algorithms::louvain`. Edges are
7776    /// treated as undirected; community ids are assigned in ascending order of
7777    /// each community's smallest node, so identical input + resolution always
7778    /// yields identical rows. Like `components`, the v0 form runs over the
7779    /// whole graph store regardless of the collection argument value.
7780    fn execute_louvain_tvf(
7781        &self,
7782        _collection: &str,
7783        resolution: f64,
7784    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7785        use crate::storage::engine::graph_algorithms;
7786        use crate::storage::query::unified::UnifiedResult;
7787        use crate::storage::schema::Value;
7788
7789        let graph = super::graph_dsl::materialize_graph_with_projection(
7790            self.inner.db.store().as_ref(),
7791            None,
7792        )?;
7793
7794        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7795        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7796            .iter_all_edges()
7797            .into_iter()
7798            .map(|e| (e.source_id, e.target_id, e.weight))
7799            .collect();
7800
7801        let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
7802
7803        // Project into a UnifiedResult with columns ["node_id", "community_id"].
7804        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
7805        for (node_id, community_id) in assignment {
7806            let mut record = UnifiedRecord::new();
7807            record.set("node_id", Value::text(node_id));
7808            record.set("community_id", Value::Integer(community_id as i64));
7809            result.push(record);
7810        }
7811        Ok(result)
7812    }
7813
7814    /// Project `(node_id, score)` centrality rows into a `UnifiedResult` with
7815    /// columns `["node_id", "score"]`; scores are `Value::Float`.
7816    fn centrality_result(
7817        rows: Vec<(String, f64)>,
7818    ) -> crate::storage::query::unified::UnifiedResult {
7819        use crate::storage::query::unified::UnifiedResult;
7820        use crate::storage::schema::Value;
7821        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "score".into()]);
7822        for (node_id, score) in rows {
7823            let mut record = UnifiedRecord::new();
7824            record.set("node_id", Value::text(node_id));
7825            record.set("score", Value::Float(score));
7826            result.push(record);
7827        }
7828        result
7829    }
7830
7831    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
7832    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
7833    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
7834        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
7835        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
7836        let q = query.trim();
7837        if !q.starts_with("SELECT") && !q.starts_with("select") {
7838            return None;
7839        }
7840
7841        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
7842        let where_pos = q
7843            .find("WHERE _entity_id")
7844            .or_else(|| q.find("where _entity_id"))?;
7845        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
7846        let after_eq = after_field.strip_prefix('=')?.trim_start();
7847
7848        // Parse the entity ID number
7849        let id_str = after_eq.trim();
7850        let entity_id: u64 = id_str.parse().ok()?;
7851
7852        // Extract table name: between "FROM " and " WHERE"
7853        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
7854        let table = q[from_pos..where_pos].trim();
7855        if table.is_empty()
7856            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
7857        {
7858            return None; // complex query, fall through
7859        }
7860        let table_name = table.split_whitespace().next()?;
7861
7862        // Direct entity lookup — skips SQL parse, plan cache, result
7863        // cache, view rewriter, RLS gate. Safe because the gating in
7864        // `execute_query` guarantees no scope override / no
7865        // transaction context is active. MVCC visibility is still
7866        // honoured against the current snapshot.
7867        let store = self.inner.db.store();
7868        let entity = store
7869            .get(
7870                table_name,
7871                crate::storage::unified::EntityId::new(entity_id),
7872            )
7873            .filter(entity_visible_under_current_snapshot)
7874            .filter(|entity| {
7875                self.inner
7876                    .db
7877                    .replica_allows_entity_at_read(table_name, entity)
7878            });
7879
7880        let count = if entity.is_some() { 1u64 } else { 0 };
7881
7882        // Materialize a record so downstream consumers that walk
7883        // `result.records` (embedded runtime API, decrypt pass, CLI)
7884        // see the row. Previously only `pre_serialized_json` was
7885        // filled, which caused those consumers to see zero rows and
7886        // skewed benchmarks.
7887        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
7888            .as_ref()
7889            .and_then(|e| runtime_table_record_from_entity(e.clone()))
7890            .into_iter()
7891            .collect();
7892
7893        let json = match entity {
7894            Some(ref e) => execute_runtime_serialize_single_entity(e),
7895            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
7896                .to_string(),
7897        };
7898
7899        Some(Ok(RuntimeQueryResult {
7900            query: query.to_string(),
7901            mode: crate::storage::query::modes::QueryMode::Sql,
7902            statement: "select",
7903            engine: "fast-entity-lookup",
7904            result: crate::storage::query::unified::UnifiedResult {
7905                columns: Vec::new(),
7906                records,
7907                stats: crate::storage::query::unified::QueryStats {
7908                    rows_scanned: count,
7909                    ..Default::default()
7910                },
7911                pre_serialized_json: Some(json),
7912            },
7913            affected_rows: 0,
7914            statement_type: "select",
7915            bookmark: None,
7916        }))
7917    }
7918
7919    pub(crate) fn invalidate_plan_cache(&self) {
7920        self.inner.query_cache.write().clear();
7921        self.inner
7922            .ddl_epoch
7923            .fetch_add(1, std::sync::atomic::Ordering::Release);
7924    }
7925
7926    /// Read the monotonic DDL epoch counter. Bumped by every
7927    /// `invalidate_plan_cache` call so prepared-statement holders can
7928    /// detect schema drift between PREPARE and EXECUTE.
7929    pub fn ddl_epoch(&self) -> u64 {
7930        self.inner
7931            .ddl_epoch
7932            .load(std::sync::atomic::Ordering::Acquire)
7933    }
7934
7935    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
7936        let store = self.inner.db.store();
7937        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7938        self.invalidate_plan_cache();
7939    }
7940
7941    /// Replay `tenant_tables.*.column` keys from red_config at boot so
7942    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
7943    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
7944    /// collection, picks the keys matching the tenant-marker shape,
7945    /// and calls `register_tenant_table` for each.
7946    ///
7947    /// Safe no-op when `red_config` doesn't exist (first boot on a
7948    /// fresh datadir).
7949    pub(crate) fn rehydrate_tenant_tables(&self) {
7950        let store = self.inner.db.store();
7951        let Some(manager) = store.get_collection("red_config") else {
7952            return;
7953        };
7954        // Replay in insertion order (SegmentManager iteration). Multiple
7955        // toggles on the same table leave several rows behind — the
7956        // last one processed wins because each register/unregister
7957        // call overwrites the in-memory state.
7958        for entity in manager.query_all(|_| true) {
7959            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
7960                continue;
7961            };
7962            let Some(named) = &row.named else { continue };
7963            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
7964                continue;
7965            };
7966            // Shape: tenant_tables.{table}.column
7967            let Some(rest) = key.strip_prefix("tenant_tables.") else {
7968                continue;
7969            };
7970            let Some((table, suffix)) = rest.rsplit_once('.') else {
7971                // Issue #205 — a `tenant_tables.*` row that doesn't
7972                // split cleanly is a schema-shape regression: the
7973                // metadata writer must always emit the `.column`
7974                // suffix, so reaching this branch means an upgrade
7975                // with incompatible state or external tampering.
7976                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7977                    collection: "red_config".to_string(),
7978                    detail: format!("malformed tenant_tables key: {key}"),
7979                }
7980                .emit_global();
7981                continue;
7982            };
7983            if suffix != "column" {
7984                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7985                    collection: "red_config".to_string(),
7986                    detail: format!("unexpected tenant_tables suffix: {key}"),
7987                }
7988                .emit_global();
7989                continue;
7990            }
7991            match named.get("value") {
7992                Some(crate::storage::schema::Value::Text(column)) => {
7993                    self.register_tenant_table(table, column);
7994                }
7995                // Null / missing value = DISABLE TENANCY marker.
7996                Some(crate::storage::schema::Value::Null) | None => {
7997                    self.unregister_tenant_table(table);
7998                }
7999                _ => {}
8000            }
8001        }
8002    }
8003
8004    /// Replay every persisted `MaterializedViewDescriptor` from the
8005    /// `red_materialized_view_defs` system collection (issue #593
8006    /// slice 9a). For each descriptor, re-parse the original SQL,
8007    /// extract the `QueryExpr::CreateView` it produced, and populate
8008    /// the in-memory registries (`inner.views` and
8009    /// `inner.materialized_views`) directly — no write paths run, so
8010    /// rehydrate does not re-persist what it just read.
8011    ///
8012    /// Malformed rows (missing `name`/`source_sql`, parse errors) are
8013    /// skipped with a `SchemaCorruption` operator event so a single
8014    /// bad entry does not block startup.
8015    pub(crate) fn rehydrate_materialized_view_descriptors(&self) {
8016        let store = self.inner.db.store();
8017        let descriptors = crate::runtime::continuous_materialized_view::load_all(store.as_ref());
8018        for descriptor in descriptors {
8019            let parsed = match crate::storage::query::parser::parse(&descriptor.source_sql) {
8020                Ok(qc) => qc,
8021                Err(err) => {
8022                    crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
8023                        collection:
8024                            crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
8025                                .to_string(),
8026                        detail: format!(
8027                            "failed to re-parse materialized-view source for {}: {err}",
8028                            descriptor.name
8029                        ),
8030                    }
8031                    .emit_global();
8032                    continue;
8033                }
8034            };
8035            let crate::storage::query::ast::QueryExpr::CreateView(create) = parsed.query else {
8036                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
8037                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
8038                        .to_string(),
8039                    detail: format!(
8040                        "materialized-view source for {} did not re-parse as CREATE VIEW",
8041                        descriptor.name
8042                    ),
8043                }
8044                .emit_global();
8045                continue;
8046            };
8047            // Populate in-memory view registry.
8048            let view_name = create.name.clone();
8049            self.inner
8050                .views
8051                .write()
8052                .insert(view_name.clone(), Arc::new(create));
8053            // Materialized cache slot (data empty until next REFRESH).
8054            use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
8055            let refresh = match descriptor.refresh_every_ms {
8056                Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
8057                None => RefreshPolicy::Manual,
8058            };
8059            let def = MaterializedViewDef {
8060                name: view_name.clone(),
8061                query: format!("<parsed view {}>", view_name),
8062                dependencies: descriptor.source_collections.clone(),
8063                refresh,
8064                retention_duration_ms: descriptor.retention_duration_ms,
8065            };
8066            self.inner.materialized_views.write().register(def);
8067            if let Err(err) = self.ensure_materialized_view_backing(&view_name) {
8068                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
8069                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
8070                        .to_string(),
8071                    detail: format!(
8072                        "failed to rehydrate backing collection for materialized view {view_name}: {err}"
8073                    ),
8074                }
8075                .emit_global();
8076            }
8077        }
8078        // A rehydrated view shape may differ from any plans the cache
8079        // bootstrapped before this method ran — flush to be safe.
8080        self.invalidate_plan_cache();
8081    }
8082
8083    pub(crate) fn rehydrate_declared_column_schemas(&self) {
8084        let store = self.inner.db.store();
8085        for contract in self.inner.db.collection_contracts() {
8086            let columns: Vec<String> = contract
8087                .declared_columns
8088                .iter()
8089                .map(|column| column.name.clone())
8090                .collect();
8091            let Some(manager) = store.get_collection(&contract.name) else {
8092                continue;
8093            };
8094            manager.set_column_schema_if_empty(columns);
8095        }
8096    }
8097
8098    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
8099    /// in-memory column mapping, the implicit RLS policy, and enables
8100    /// row-level security on the table. Idempotent — re-registering
8101    /// the same `(table, column)` replaces the prior auto-policy.
8102    pub fn register_tenant_table(&self, table: &str, column: &str) {
8103        use crate::storage::query::ast::{
8104            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
8105        };
8106        self.inner
8107            .tenant_tables
8108            .write()
8109            .insert(table.to_string(), column.to_string());
8110
8111        // Build the policy: col = CURRENT_TENANT()
8112        // Uses CompareExpr so the comparison happens at runtime against
8113        // the thread-local tenant value read by the CURRENT_TENANT
8114        // scalar. Spans are synthetic — there's no source location for
8115        // an auto-generated policy.
8116        let lhs = Expr::Column {
8117            field: FieldRef::TableColumn {
8118                table: table.to_string(),
8119                column: column.to_string(),
8120            },
8121            span: Span::synthetic(),
8122        };
8123        let rhs = Expr::FunctionCall {
8124            name: "CURRENT_TENANT".to_string(),
8125            args: Vec::new(),
8126            span: Span::synthetic(),
8127        };
8128        let policy_filter = Filter::CompareExpr {
8129            lhs,
8130            op: CompareOp::Eq,
8131            rhs,
8132        };
8133
8134        let policy = CreatePolicyQuery {
8135            name: "__tenant_iso".to_string(),
8136            table: table.to_string(),
8137            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
8138            role: None,   // None = every role
8139            using: Box::new(policy_filter),
8140            // Auto-tenancy defaults to Table targets. Collections of
8141            // other kinds (graph / vector / queue / timeseries) that
8142            // opt in via `ALTER ... ENABLE TENANCY` should use the
8143            // matching kind — but for now we keep the auto-policy
8144            // kind-agnostic so the evaluator can apply it to any
8145            // entity living in the collection.
8146            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
8147        };
8148
8149        // Replace any prior auto-policy for this table (column rename).
8150        self.inner.rls_policies.write().insert(
8151            (table.to_string(), "__tenant_iso".to_string()),
8152            Arc::new(policy),
8153        );
8154        self.inner
8155            .rls_enabled_tables
8156            .write()
8157            .insert(table.to_string());
8158
8159        // Auto-build a hash index on the tenant column. Every read/write
8160        // against a tenant-scoped table carries an implicit
8161        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
8162        // index on that column is on the hot path of every query. Without
8163        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
8164        self.ensure_tenant_index(table, column);
8165    }
8166
8167    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
8168    /// Skipped when:
8169    ///   * the column is dotted (nested path — flat secondary indices
8170    ///     don't cover those today; RLS still works via the policy)
8171    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
8172    ///   * the user already registered an index whose first column matches
8173    ///     (avoids redundant duplicates of a user-defined composite)
8174    fn ensure_tenant_index(&self, table: &str, column: &str) {
8175        if column.contains('.') {
8176            return;
8177        }
8178        let index_name = format!("__tenant_idx_{table}");
8179        let registry = self.inner.index_store.list_indices(table);
8180        if registry.iter().any(|idx| idx.name == index_name) {
8181            return;
8182        }
8183        if registry
8184            .iter()
8185            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
8186        {
8187            return;
8188        }
8189
8190        let store = self.inner.db.store();
8191        let Some(manager) = store.get_collection(table) else {
8192            return;
8193        };
8194        let entities = manager.query_all(|_| true);
8195        let entity_fields: Vec<(
8196            crate::storage::unified::EntityId,
8197            Vec<(String, crate::storage::schema::Value)>,
8198        )> = entities
8199            .iter()
8200            .map(|e| {
8201                let fields = match &e.data {
8202                    crate::storage::EntityData::Row(row) => {
8203                        if let Some(ref named) = row.named {
8204                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
8205                        } else if let Some(ref schema) = row.schema {
8206                            schema
8207                                .iter()
8208                                .zip(row.columns.iter())
8209                                .map(|(k, v)| (k.clone(), v.clone()))
8210                                .collect()
8211                        } else {
8212                            Vec::new()
8213                        }
8214                    }
8215                    crate::storage::EntityData::Node(node) => node
8216                        .properties
8217                        .iter()
8218                        .map(|(k, v)| (k.clone(), v.clone()))
8219                        .collect(),
8220                    _ => Vec::new(),
8221                };
8222                (e.id, fields)
8223            })
8224            .collect();
8225
8226        let columns = vec![column.to_string()];
8227        if self
8228            .inner
8229            .index_store
8230            .create_index(
8231                &index_name,
8232                table,
8233                &columns,
8234                super::index_store::IndexMethodKind::Hash,
8235                false,
8236                &entity_fields,
8237            )
8238            .is_err()
8239        {
8240            return;
8241        }
8242        self.inner
8243            .index_store
8244            .register(super::index_store::RegisteredIndex {
8245                name: index_name,
8246                collection: table.to_string(),
8247                columns,
8248                method: super::index_store::IndexMethodKind::Hash,
8249                unique: false,
8250            });
8251        self.invalidate_plan_cache();
8252    }
8253
8254    /// Drop the auto-generated tenant index, if one exists. Called from
8255    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
8256    fn drop_tenant_index(&self, table: &str) {
8257        let index_name = format!("__tenant_idx_{table}");
8258        self.inner.index_store.drop_index(&index_name, table);
8259    }
8260
8261    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
8262    /// Used by the INSERT auto-fill path to know which column to
8263    /// populate with `current_tenant()` when the user didn't name it.
8264    pub fn tenant_column(&self, table: &str) -> Option<String> {
8265        self.inner.tenant_tables.read().get(table).cloned()
8266    }
8267
8268    /// Remove a table's tenant registration (Phase 2.5.4). Called by
8269    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
8270    /// but leaves any user-installed explicit policies intact.
8271    pub fn unregister_tenant_table(&self, table: &str) {
8272        self.inner.tenant_tables.write().remove(table);
8273        self.inner
8274            .rls_policies
8275            .write()
8276            .remove(&(table.to_string(), "__tenant_iso".to_string()));
8277        self.drop_tenant_index(table);
8278        // Only clear RLS enablement if no other policies remain.
8279        let has_other_policies = self
8280            .inner
8281            .rls_policies
8282            .read()
8283            .keys()
8284            .any(|(t, _)| t == table);
8285        if !has_other_policies {
8286            self.inner.rls_enabled_tables.write().remove(table);
8287        }
8288    }
8289
8290    /// Record that the running transaction has marked `id` in `collection`
8291    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
8292    /// xid that was written into `xmax` — either the parent txn xid or
8293    /// the innermost savepoint sub-xid. Savepoint rollback filters by
8294    /// this xid to revive only its own tombstones.
8295    pub(crate) fn record_pending_tombstone(
8296        &self,
8297        conn_id: u64,
8298        collection: &str,
8299        id: crate::storage::unified::entity::EntityId,
8300        stamper_xid: crate::storage::transaction::snapshot::Xid,
8301        previous_xmax: crate::storage::transaction::snapshot::Xid,
8302    ) {
8303        self.inner
8304            .pending_tombstones
8305            .write()
8306            .entry(conn_id)
8307            .or_default()
8308            .push((collection.to_string(), id, stamper_xid, previous_xmax));
8309    }
8310
8311    pub(crate) fn record_pending_versioned_update(
8312        &self,
8313        conn_id: u64,
8314        collection: &str,
8315        old_id: crate::storage::unified::entity::EntityId,
8316        new_id: crate::storage::unified::entity::EntityId,
8317        stamper_xid: crate::storage::transaction::snapshot::Xid,
8318        previous_xmax: crate::storage::transaction::snapshot::Xid,
8319    ) {
8320        self.inner
8321            .pending_versioned_updates
8322            .write()
8323            .entry(conn_id)
8324            .or_default()
8325            .push((
8326                collection.to_string(),
8327                old_id,
8328                new_id,
8329                stamper_xid,
8330                previous_xmax,
8331            ));
8332    }
8333
8334    fn with_deferred_store_wal_if_transaction<T>(
8335        &self,
8336        f: impl FnOnce() -> RedDBResult<T>,
8337    ) -> RedDBResult<T> {
8338        let conn_id = current_connection_id();
8339        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
8340            return f();
8341        }
8342
8343        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8344        let result = f();
8345        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8346        match result {
8347            Ok(value) => {
8348                self.record_pending_store_wal_actions(conn_id, captured);
8349                Ok(value)
8350            }
8351            Err(err) => Err(err),
8352        }
8353    }
8354
8355    fn with_deferred_store_wal_for_dml<T>(
8356        &self,
8357        capture_autocommit_events: bool,
8358        f: impl FnOnce() -> RedDBResult<T>,
8359    ) -> RedDBResult<T> {
8360        let conn_id = current_connection_id();
8361        if self.inner.tx_contexts.read().contains_key(&conn_id) {
8362            return self.with_deferred_store_wal_if_transaction(f);
8363        }
8364        if !capture_autocommit_events {
8365            return f();
8366        }
8367
8368        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8369        let result = f();
8370        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8371        self.inner
8372            .db
8373            .store()
8374            .append_deferred_store_wal_actions(captured)
8375            .map_err(|err| RedDBError::Internal(err.to_string()))?;
8376        result
8377    }
8378
8379    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
8380        !query.suppress_events
8381            && self.collection_has_event_subscriptions_for_operation(
8382                &query.table,
8383                crate::catalog::SubscriptionOperation::Insert,
8384            )
8385    }
8386
8387    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
8388        !query.suppress_events
8389            && self.collection_has_event_subscriptions_for_operation(
8390                &query.table,
8391                crate::catalog::SubscriptionOperation::Update,
8392            )
8393    }
8394
8395    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
8396        !query.suppress_events
8397            && self.collection_has_event_subscriptions_for_operation(
8398                &query.table,
8399                crate::catalog::SubscriptionOperation::Delete,
8400            )
8401    }
8402
8403    fn collection_has_event_subscriptions_for_operation(
8404        &self,
8405        collection: &str,
8406        operation: crate::catalog::SubscriptionOperation,
8407    ) -> bool {
8408        let Some(contract) = self.db().collection_contract_arc(collection) else {
8409            return false;
8410        };
8411        contract.subscriptions.iter().any(|subscription| {
8412            subscription.enabled
8413                && (subscription.ops_filter.is_empty()
8414                    || subscription.ops_filter.contains(&operation))
8415        })
8416    }
8417
8418    fn record_pending_store_wal_actions(
8419        &self,
8420        conn_id: u64,
8421        actions: crate::storage::unified::DeferredStoreWalActions,
8422    ) {
8423        if actions.is_empty() {
8424            return;
8425        }
8426        let mut guard = self.inner.pending_store_wal_actions.write();
8427        guard.entry(conn_id).or_default().extend(actions);
8428    }
8429
8430    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
8431        let Some(actions) = self
8432            .inner
8433            .pending_store_wal_actions
8434            .write()
8435            .remove(&conn_id)
8436        else {
8437            return Ok(());
8438        };
8439        self.inner
8440            .db
8441            .store()
8442            .append_deferred_store_wal_actions(actions)
8443            .map_err(|err| RedDBError::Internal(err.to_string()))
8444    }
8445
8446    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
8447        self.inner
8448            .pending_store_wal_actions
8449            .write()
8450            .remove(&conn_id);
8451    }
8452
8453    fn xid_conflicts_with_snapshot(
8454        &self,
8455        xid: crate::storage::transaction::snapshot::Xid,
8456        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8457        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8458    ) -> bool {
8459        xid != 0
8460            && !own_xids.contains(&xid)
8461            && !self.inner.snapshot_manager.is_aborted(xid)
8462            && !self.inner.snapshot_manager.is_active(xid)
8463            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
8464    }
8465
8466    fn conflict_error(
8467        collection: &str,
8468        logical_id: crate::storage::unified::entity::EntityId,
8469        xid: crate::storage::transaction::snapshot::Xid,
8470    ) -> RedDBError {
8471        RedDBError::Query(format!(
8472            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
8473            logical_id.raw()
8474        ))
8475    }
8476
8477    fn check_logical_row_conflict(
8478        &self,
8479        collection: &str,
8480        logical_id: crate::storage::unified::entity::EntityId,
8481        excluded_ids: &[crate::storage::unified::entity::EntityId],
8482        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8483        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8484    ) -> RedDBResult<()> {
8485        let store = self.inner.db.store();
8486        let Some(manager) = store.get_collection(collection) else {
8487            return Ok(());
8488        };
8489
8490        for candidate in manager.query_all(|_| true) {
8491            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
8492                continue;
8493            }
8494            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
8495                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
8496            }
8497            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
8498                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
8499            }
8500        }
8501        Ok(())
8502    }
8503
8504    pub(crate) fn check_table_row_write_conflicts(
8505        &self,
8506        conn_id: u64,
8507        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8508        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8509    ) -> RedDBResult<()> {
8510        let versioned_updates = self
8511            .inner
8512            .pending_versioned_updates
8513            .read()
8514            .get(&conn_id)
8515            .cloned()
8516            .unwrap_or_default();
8517        let tombstones = self
8518            .inner
8519            .pending_tombstones
8520            .read()
8521            .get(&conn_id)
8522            .cloned()
8523            .unwrap_or_default();
8524
8525        let store = self.inner.db.store();
8526        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
8527            let Some(manager) = store.get_collection(&collection) else {
8528                continue;
8529            };
8530            let Some(old) = manager.get(old_id) else {
8531                continue;
8532            };
8533            let logical_id = old.logical_id();
8534            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8535                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8536            }
8537            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
8538                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
8539            }
8540            self.check_logical_row_conflict(
8541                &collection,
8542                logical_id,
8543                &[old_id, new_id],
8544                snapshot,
8545                own_xids,
8546            )?;
8547        }
8548
8549        for (collection, id, xid, previous_xmax) in tombstones {
8550            let Some(manager) = store.get_collection(&collection) else {
8551                continue;
8552            };
8553            let Some(entity) = manager.get(id) else {
8554                continue;
8555            };
8556            let logical_id = entity.logical_id();
8557            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8558                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8559            }
8560            if entity.xmax != xid
8561                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
8562            {
8563                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
8564            }
8565            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
8566        }
8567
8568        Ok(())
8569    }
8570
8571    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
8572        let versioned_updates = self
8573            .inner
8574            .pending_versioned_updates
8575            .read()
8576            .get(&conn_id)
8577            .cloned()
8578            .unwrap_or_default();
8579        let tombstones = self
8580            .inner
8581            .pending_tombstones
8582            .read()
8583            .get(&conn_id)
8584            .cloned()
8585            .unwrap_or_default();
8586
8587        let store = self.inner.db.store();
8588        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
8589            if let Some(manager) = store.get_collection(&collection) {
8590                if let Some(mut entity) = manager.get(old_id) {
8591                    entity.set_xmax(xid);
8592                    let _ = manager.update(entity);
8593                }
8594            }
8595        }
8596        for (collection, id, xid, _previous_xmax) in tombstones {
8597            if let Some(manager) = store.get_collection(&collection) {
8598                if let Some(mut entity) = manager.get(id) {
8599                    entity.set_xmax(xid);
8600                    let _ = manager.update(entity);
8601                }
8602            }
8603        }
8604    }
8605
8606    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
8607        self.inner
8608            .pending_versioned_updates
8609            .write()
8610            .remove(&conn_id);
8611    }
8612
8613    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
8614        let Some(pending) = self
8615            .inner
8616            .pending_versioned_updates
8617            .write()
8618            .remove(&conn_id)
8619        else {
8620            return;
8621        };
8622
8623        let store = self.inner.db.store();
8624        for (collection, old_id, new_id, xid, previous_xmax) in pending {
8625            if let Some(manager) = store.get_collection(&collection) {
8626                if let Some(mut old) = manager.get(old_id) {
8627                    if old.xmax == xid {
8628                        old.set_xmax(previous_xmax);
8629                        let _ = manager.update(old);
8630                    }
8631                }
8632            }
8633            let _ = store.delete_batch(&collection, &[new_id]);
8634        }
8635    }
8636
8637    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8638        let mut guard = self.inner.pending_versioned_updates.write();
8639        let Some(pending) = guard.get_mut(&conn_id) else {
8640            return 0;
8641        };
8642
8643        let store = self.inner.db.store();
8644        let mut reverted = 0usize;
8645        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
8646            if *xid < stamper_xid {
8647                return true;
8648            }
8649            if let Some(manager) = store.get_collection(collection) {
8650                if let Some(mut old) = manager.get(*old_id) {
8651                    if old.xmax == *xid {
8652                        old.set_xmax(*previous_xmax);
8653                        let _ = manager.update(old);
8654                    }
8655                }
8656            }
8657            let _ = store.delete_batch(collection, &[*new_id]);
8658            reverted += 1;
8659            false
8660        });
8661        if pending.is_empty() {
8662            guard.remove(&conn_id);
8663        }
8664        reverted
8665    }
8666
8667    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
8668    /// delete marker; commit only drops the rollback journal and emits
8669    /// side effects. Physical reclamation is left for VACUUM so old
8670    /// snapshots can still resolve the pre-delete row version.
8671    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
8672        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8673            return;
8674        };
8675        if pending.is_empty() {
8676            return;
8677        }
8678
8679        let store = self.inner.db.store();
8680        for (collection, id, _xid, _previous_xmax) in pending {
8681            store.context_index().remove_entity(id);
8682            self.cdc_emit(
8683                crate::replication::cdc::ChangeOperation::Delete,
8684                &collection,
8685                id.raw(),
8686                "entity",
8687            );
8688        }
8689    }
8690
8691    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
8692    /// become visible again to future snapshots. Best-effort: a row
8693    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
8694    /// never reclaims tuples whose xmax is still referenced by any
8695    /// active snapshot, so this case is only reachable via external
8696    /// storage corruption.
8697    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
8698        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8699            return;
8700        };
8701
8702        let store = self.inner.db.store();
8703        for (collection, id, xid, previous_xmax) in pending {
8704            let Some(manager) = store.get_collection(&collection) else {
8705                continue;
8706            };
8707            if let Some(mut entity) = manager.get(id) {
8708                if entity.xmax == xid {
8709                    entity.set_xmax(previous_xmax);
8710                    let _ = manager.update(entity);
8711                }
8712            }
8713        }
8714    }
8715
8716    /// Slice C of PRD #718 — accessor for the local wait registry.
8717    pub fn queue_wait_registry(
8718        &self,
8719    ) -> std::sync::Arc<crate::runtime::queue_wait_registry::QueueWaitRegistry> {
8720        self.inner.queue_wait_registry.clone()
8721    }
8722
8723    /// Buffer a `(scope, queue)` wake on the current connection so it
8724    /// fires post-COMMIT, or notify immediately if no transaction is
8725    /// open (autocommit path). The wait registry only ever observes
8726    /// notifies for committed work — rollback drops the buffer.
8727    pub(crate) fn record_queue_wake(&self, scope: &str, queue: &str) {
8728        if self.current_xid().is_some() {
8729            let conn_id = current_connection_id();
8730            self.inner
8731                .pending_queue_wakes
8732                .write()
8733                .entry(conn_id)
8734                .or_default()
8735                .push((scope.to_string(), queue.to_string()));
8736            return;
8737        }
8738        self.inner.queue_wait_registry.notify(scope, queue);
8739    }
8740
8741    pub(crate) fn finalize_pending_queue_wakes(&self, conn_id: u64) {
8742        let Some(pending) = self.inner.pending_queue_wakes.write().remove(&conn_id) else {
8743            return;
8744        };
8745        for (scope, queue) in pending {
8746            self.inner.queue_wait_registry.notify(&scope, &queue);
8747        }
8748    }
8749
8750    pub(crate) fn discard_pending_queue_wakes(&self, conn_id: u64) {
8751        self.inner.pending_queue_wakes.write().remove(&conn_id);
8752    }
8753
8754    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
8755        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
8756            return;
8757        };
8758        for event in pending {
8759            self.cdc_emit_kv(
8760                event.op,
8761                &event.collection,
8762                &event.key,
8763                0,
8764                event.before,
8765                event.after,
8766            );
8767        }
8768    }
8769
8770    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
8771        self.inner.pending_kv_watch_events.write().remove(&conn_id);
8772    }
8773
8774    /// Materialise the entire graph store while applying MVCC visibility
8775    /// AND per-collection RLS to each candidate node and edge. Mirrors
8776    /// `materialize_graph` but routes every entity through the same
8777    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
8778    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
8779    /// edges). Returns the filtered `GraphStore` plus the
8780    /// `node_id → properties` map the executor needs for `RETURN n.*`
8781    /// projections.
8782    fn materialize_graph_with_rls(
8783        &self,
8784    ) -> RedDBResult<(
8785        crate::storage::engine::GraphStore,
8786        std::collections::HashMap<
8787            String,
8788            std::collections::HashMap<String, crate::storage::schema::Value>,
8789        >,
8790        crate::storage::query::unified::EdgeProperties,
8791    )> {
8792        use crate::storage::engine::GraphStore;
8793        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
8794        use crate::storage::unified::entity::{EntityData, EntityKind};
8795        use std::collections::{HashMap, HashSet};
8796
8797        let store = self.inner.db.store();
8798        let snap_ctx = capture_current_snapshot();
8799        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
8800
8801        let graph = GraphStore::new();
8802        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
8803            HashMap::new();
8804        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
8805        let mut allowed_nodes: HashSet<String> = HashSet::new();
8806
8807        // Per-collection cached compiled filters — Nodes-kind for
8808        // first pass, Edges-kind for the second. None entries mean
8809        // "RLS enabled, zero matching policy → deny all of this kind".
8810        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8811            HashMap::new();
8812        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8813            HashMap::new();
8814
8815        let collections = store.list_collections();
8816
8817        // First pass — gather nodes.
8818        for collection in &collections {
8819            let Some(manager) = store.get_collection(collection) else {
8820                continue;
8821            };
8822            let entities = manager.query_all(|_| true);
8823            for entity in entities {
8824                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8825                    continue;
8826                }
8827                let EntityKind::GraphNode(ref node) = entity.kind else {
8828                    continue;
8829                };
8830                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
8831                    continue;
8832                }
8833                let id_str = entity.id.raw().to_string();
8834                graph
8835                    .add_node_with_label(
8836                        &id_str,
8837                        &node.label,
8838                        &super::graph_node_label(&node.node_type),
8839                    )
8840                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8841                allowed_nodes.insert(id_str.clone());
8842                if let EntityData::Node(node_data) = &entity.data {
8843                    node_properties.insert(id_str, node_data.properties.clone());
8844                }
8845            }
8846        }
8847
8848        // Second pass — gather edges. An edge appears only when both
8849        // endpoint nodes survived the RLS pass AND the edge itself
8850        // passes its own RLS gate.
8851        for collection in &collections {
8852            let Some(manager) = store.get_collection(collection) else {
8853                continue;
8854            };
8855            let entities = manager.query_all(|_| true);
8856            for entity in entities {
8857                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8858                    continue;
8859                }
8860                let EntityKind::GraphEdge(ref edge) = entity.kind else {
8861                    continue;
8862                };
8863                if !allowed_nodes.contains(&edge.from_node)
8864                    || !allowed_nodes.contains(&edge.to_node)
8865                {
8866                    continue;
8867                }
8868                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
8869                    continue;
8870                }
8871                let weight = match &entity.data {
8872                    EntityData::Edge(e) => e.weight,
8873                    _ => edge.weight as f32 / 1000.0,
8874                };
8875                let edge_label = super::graph_edge_label(&edge.label);
8876                graph
8877                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
8878                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8879                if let EntityData::Edge(edge_data) = &entity.data {
8880                    edge_properties.insert(
8881                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
8882                        edge_data.properties.clone(),
8883                    );
8884                }
8885            }
8886        }
8887
8888        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
8889        // are used inside the helper closures via the per-kind helpers
8890        // declared at the bottom of this file.
8891        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
8892
8893        Ok((graph, node_properties, edge_properties))
8894    }
8895
8896    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
8897    /// freshly-inserted entity when the current connection holds an
8898    /// open transaction. Used by graph / vector / queue / timeseries
8899    /// write paths that go through the DevX builder API (`db.node(...)
8900    /// .save()` and friends) — those live in the storage crate and
8901    /// can't reach `current_xid()` without crossing layers, so the
8902    /// application layer calls this helper right after `save()` to
8903    /// finalise the MVCC stamp.
8904    ///
8905    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
8906    /// write, so the non-transactional hot path stays untouched.
8907    ///
8908    /// Best-effort: if the collection or entity disappears between
8909    /// the save and the stamp (concurrent DROP), we silently skip.
8910    pub(crate) fn stamp_xmin_if_in_txn(
8911        &self,
8912        collection: &str,
8913        id: crate::storage::unified::entity::EntityId,
8914    ) {
8915        let Some(xid) = self.current_xid() else {
8916            return;
8917        };
8918        let store = self.inner.db.store();
8919        let Some(manager) = store.get_collection(collection) else {
8920            return;
8921        };
8922        if let Some(mut entity) = manager.get(id) {
8923            entity.set_xmin(xid);
8924            let _ = manager.update(entity);
8925        }
8926    }
8927
8928    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
8929    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
8930    /// pending entries with `xid < stamper_xid` stay queued because
8931    /// they belong to the enclosing scope — they'll either flush on
8932    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
8933    ///
8934    /// Returns the number of tuples whose `xmax` was wiped back to 0.
8935    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8936        let mut guard = self.inner.pending_tombstones.write();
8937        let Some(pending) = guard.get_mut(&conn_id) else {
8938            return 0;
8939        };
8940
8941        let store = self.inner.db.store();
8942        let mut revived = 0usize;
8943        pending.retain(|(collection, id, xid, previous_xmax)| {
8944            if *xid < stamper_xid {
8945                // Stamped before the savepoint — keep in queue.
8946                return true;
8947            }
8948            if let Some(manager) = store.get_collection(collection) {
8949                if let Some(mut entity) = manager.get(*id) {
8950                    if entity.xmax == *xid {
8951                        entity.set_xmax(*previous_xmax);
8952                        let _ = manager.update(entity);
8953                        revived += 1;
8954                    }
8955                }
8956            }
8957            false
8958        });
8959        if pending.is_empty() {
8960            guard.remove(&conn_id);
8961        }
8962        revived
8963    }
8964
8965    /// Return the snapshot the current connection should use for visibility
8966    /// checks (Phase 2.3 PG parity).
8967    ///
8968    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
8969    ///   the snapshot stored in its `TxnContext`.
8970    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
8971    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
8972    ///   visible so this degrades to "see everything committed".
8973    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
8974        let conn_id = current_connection_id();
8975        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
8976            return ctx.snapshot;
8977        }
8978        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
8979        // every already-committed xid (which is strictly less) passes the
8980        // `xmin <= snap.xid` gate, while concurrently-active xids land in
8981        // the `in_progress` set and stay hidden until they commit. Using
8982        // xid=0 would incorrectly hide every MVCC-stamped tuple.
8983        let high_water = self.inner.snapshot_manager.peek_next_xid();
8984        self.inner.snapshot_manager.snapshot(high_water)
8985    }
8986
8987    /// Xid of the current connection's active transaction, or `None` when
8988    /// running outside a BEGIN/COMMIT block. Write paths call this to
8989    /// decide whether to stamp `xmin`/`xmax` on tuples.
8990    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
8991    /// sub-xid so new writes can be selectively rolled back. Otherwise
8992    /// the parent txn's xid is returned, matching pre-savepoint
8993    /// behaviour. Callers that need the enclosing *transaction* xid
8994    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
8995    /// directly.
8996    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
8997        let conn_id = current_connection_id();
8998        self.inner
8999            .tx_contexts
9000            .read()
9001            .get(&conn_id)
9002            .map(|ctx| ctx.writer_xid())
9003    }
9004
9005    /// `true` when the given connection id has an open `BEGIN`. Issue
9006    /// #760 — `OpenStream` consults this to refuse output streams that
9007    /// would otherwise collide with an interactive transaction (see
9008    /// ADR 0029 "Transaction interaction"). HTTP requests pre-dating the
9009    /// connection-id plumbing run with id `0`, which never carries a
9010    /// transaction context, so this returns `false` on those paths.
9011    pub fn connection_in_transaction(&self, conn_id: u64) -> bool {
9012        self.inner.tx_contexts.read().contains_key(&conn_id)
9013    }
9014
9015    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
9016    /// the oldest-active xid when reclaiming dead tuples.
9017    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
9018        Arc::clone(&self.inner.snapshot_manager)
9019    }
9020
9021    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
9022        let manager = &self.inner.snapshot_manager;
9023        let next_xid = manager.peek_next_xid();
9024        let mut cutoff = next_xid;
9025        if let Some(oldest_active) = manager.oldest_active_xid() {
9026            cutoff = cutoff.min(oldest_active);
9027        }
9028        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
9029            cutoff = cutoff.min(oldest_pinned);
9030        }
9031        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
9032        if retention_xids > 0 {
9033            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
9034        }
9035        cutoff
9036    }
9037
9038    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
9039        let registered = self.inner.index_store.list_indices(table);
9040        if registered.is_empty() {
9041            return Ok(());
9042        }
9043        let store = self.inner.db.store();
9044        let Some(manager) = store.get_collection(table) else {
9045            return Ok(());
9046        };
9047        let entity_fields = manager
9048            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
9049            .into_iter()
9050            .map(|entity| (entity.id, table_row_index_fields(&entity)))
9051            .collect::<Vec<_>>();
9052
9053        for index in registered {
9054            self.inner.index_store.drop_index(&index.name, table);
9055            self.inner
9056                .index_store
9057                .create_index(
9058                    &index.name,
9059                    table,
9060                    &index.columns,
9061                    index.method,
9062                    index.unique,
9063                    &entity_fields,
9064                )
9065                .map_err(RedDBError::Internal)?;
9066            self.inner.index_store.register(index);
9067        }
9068        self.invalidate_plan_cache();
9069        Ok(())
9070    }
9071
9072    pub(crate) fn persist_runtime_index_descriptor(
9073        &self,
9074        index: super::index_store::RegisteredIndex,
9075    ) -> RedDBResult<()> {
9076        let store = self.inner.db.store();
9077        let _ = store.get_or_create_collection(RUNTIME_INDEX_REGISTRY_COLLECTION);
9078        let entity = crate::storage::UnifiedEntity::new(
9079            crate::storage::EntityId::new(0),
9080            crate::storage::EntityKind::TableRow {
9081                table: std::sync::Arc::from(RUNTIME_INDEX_REGISTRY_COLLECTION),
9082                row_id: 0,
9083            },
9084            crate::storage::EntityData::Row(crate::storage::RowData {
9085                columns: Vec::new(),
9086                named: Some(
9087                    [
9088                        (
9089                            "collection".to_string(),
9090                            crate::storage::schema::Value::text(index.collection.clone()),
9091                        ),
9092                        (
9093                            "name".to_string(),
9094                            crate::storage::schema::Value::text(index.name.clone()),
9095                        ),
9096                        (
9097                            "columns".to_string(),
9098                            crate::storage::schema::Value::text(index.columns.join("\u{1f}")),
9099                        ),
9100                        (
9101                            "method".to_string(),
9102                            crate::storage::schema::Value::text(index_method_kind_as_str(
9103                                index.method,
9104                            )),
9105                        ),
9106                        (
9107                            "unique".to_string(),
9108                            crate::storage::schema::Value::Boolean(index.unique),
9109                        ),
9110                        (
9111                            "dropped".to_string(),
9112                            crate::storage::schema::Value::Boolean(false),
9113                        ),
9114                    ]
9115                    .into_iter()
9116                    .collect(),
9117                ),
9118                schema: None,
9119            }),
9120        );
9121        store
9122            .insert_auto(RUNTIME_INDEX_REGISTRY_COLLECTION, entity)
9123            .map(|_| ())
9124            .map_err(|err| RedDBError::Internal(format!("{err:?}")))
9125    }
9126
9127    pub(crate) fn persist_runtime_index_drop(
9128        &self,
9129        collection: &str,
9130        name: &str,
9131    ) -> RedDBResult<()> {
9132        let store = self.inner.db.store();
9133        let _ = store.get_or_create_collection(RUNTIME_INDEX_REGISTRY_COLLECTION);
9134        let entity = crate::storage::UnifiedEntity::new(
9135            crate::storage::EntityId::new(0),
9136            crate::storage::EntityKind::TableRow {
9137                table: std::sync::Arc::from(RUNTIME_INDEX_REGISTRY_COLLECTION),
9138                row_id: 0,
9139            },
9140            crate::storage::EntityData::Row(crate::storage::RowData {
9141                columns: Vec::new(),
9142                named: Some(
9143                    [
9144                        (
9145                            "collection".to_string(),
9146                            crate::storage::schema::Value::text(collection.to_string()),
9147                        ),
9148                        (
9149                            "name".to_string(),
9150                            crate::storage::schema::Value::text(name.to_string()),
9151                        ),
9152                        (
9153                            "dropped".to_string(),
9154                            crate::storage::schema::Value::Boolean(true),
9155                        ),
9156                    ]
9157                    .into_iter()
9158                    .collect(),
9159                ),
9160                schema: None,
9161            }),
9162        );
9163        store
9164            .insert_auto(RUNTIME_INDEX_REGISTRY_COLLECTION, entity)
9165            .map(|_| ())
9166            .map_err(|err| RedDBError::Internal(format!("{err:?}")))
9167    }
9168
9169    fn rehydrate_runtime_index_registry(&self) -> RedDBResult<()> {
9170        let store = self.inner.db.store();
9171        let Some(manager) = store.get_collection(RUNTIME_INDEX_REGISTRY_COLLECTION) else {
9172            return Ok(());
9173        };
9174        let mut rows = manager.query_all(|_| true);
9175        rows.sort_by_key(|entity| entity.id.raw());
9176
9177        let mut latest = std::collections::HashMap::<
9178            (String, String),
9179            Option<super::index_store::RegisteredIndex>,
9180        >::new();
9181        for entity in rows {
9182            let crate::storage::EntityData::Row(row) = &entity.data else {
9183                continue;
9184            };
9185            let Some(named) = &row.named else {
9186                continue;
9187            };
9188            let Some(collection) = named_text(named, "collection") else {
9189                continue;
9190            };
9191            let Some(name) = named_text(named, "name") else {
9192                continue;
9193            };
9194            let dropped = named_bool(named, "dropped").unwrap_or(false);
9195            let key = (collection.clone(), name.clone());
9196            if dropped {
9197                latest.insert(key, None);
9198                continue;
9199            }
9200            let columns = named_text(named, "columns")
9201                .map(|raw| {
9202                    raw.split('\u{1f}')
9203                        .filter(|part| !part.is_empty())
9204                        .map(str::to_string)
9205                        .collect::<Vec<_>>()
9206                })
9207                .unwrap_or_default();
9208            let Some(method) =
9209                named_text(named, "method").and_then(|raw| index_method_kind_from_str(&raw))
9210            else {
9211                continue;
9212            };
9213            latest.insert(
9214                key,
9215                Some(super::index_store::RegisteredIndex {
9216                    name,
9217                    collection,
9218                    columns,
9219                    method,
9220                    unique: named_bool(named, "unique").unwrap_or(false),
9221                }),
9222            );
9223        }
9224
9225        for index in latest.into_values().flatten() {
9226            let Some(manager) = store.get_collection(&index.collection) else {
9227                continue;
9228            };
9229            let entity_fields = manager
9230                .query_all(|entity| {
9231                    matches!(entity.kind, crate::storage::EntityKind::TableRow { .. })
9232                })
9233                .into_iter()
9234                .map(|entity| (entity.id, table_row_index_fields(&entity)))
9235                .collect::<Vec<_>>();
9236            self.inner
9237                .index_store
9238                .create_index(
9239                    &index.name,
9240                    &index.collection,
9241                    &index.columns,
9242                    index.method,
9243                    index.unique,
9244                    &entity_fields,
9245                )
9246                .map_err(RedDBError::Internal)?;
9247            self.inner.index_store.register(index);
9248        }
9249        self.invalidate_plan_cache();
9250        Ok(())
9251    }
9252
9253    /// Own-tx xids (parent + open/released savepoints) for the current
9254    /// connection. Transports + tests that build a `SnapshotContext`
9255    /// manually (outside the `execute_query` scope) need this set so
9256    /// the writer's own uncommitted tuples stay visible to self.
9257    pub fn current_txn_own_xids(
9258        &self,
9259    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
9260        let mut set = std::collections::HashSet::new();
9261        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
9262            set.insert(ctx.xid);
9263            for (_, sub) in &ctx.savepoints {
9264                set.insert(*sub);
9265            }
9266            for sub in &ctx.released_sub_xids {
9267                set.insert(*sub);
9268            }
9269        }
9270        set
9271    }
9272
9273    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
9274    ///
9275    /// Callers use this to check whether a table name is a registered
9276    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
9277    /// scan it (`registry.scan(name)`). The read-path rewriter consults
9278    /// this before dispatching into native-collection lookup.
9279    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
9280        Arc::clone(&self.inner.foreign_tables)
9281    }
9282
9283    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
9284    pub fn is_rls_enabled(&self, table: &str) -> bool {
9285        self.inner.rls_enabled_tables.read().contains(table)
9286    }
9287
9288    /// Collect the USING predicates that apply to this `(table, role, action)`.
9289    ///
9290    /// Returned filters should be OR-combined (a row passes RLS when *any*
9291    /// matching policy accepts it) and then AND-ed into the query's WHERE.
9292    /// When the table has RLS disabled this returns an empty Vec — callers
9293    /// can fast-path back to the unfiltered read.
9294    pub fn matching_rls_policies(
9295        &self,
9296        table: &str,
9297        role: Option<&str>,
9298        action: crate::storage::query::ast::PolicyAction,
9299    ) -> Vec<crate::storage::query::ast::Filter> {
9300        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
9301        // callers that don't name a kind only see Table-scoped
9302        // policies (which is what execute SELECT / UPDATE / DELETE
9303        // expect).
9304        self.matching_rls_policies_for_kind(
9305            table,
9306            role,
9307            action,
9308            crate::storage::query::ast::PolicyTargetKind::Table,
9309        )
9310    }
9311
9312    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
9313    ///
9314    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
9315    /// `Vectors`, queue consumers request `Messages`, and timeseries
9316    /// range scans request `Points`. Policies tagged with a
9317    /// different kind are skipped so a graph-scoped policy doesn't
9318    /// accidentally gate a table SELECT on the same collection.
9319    pub fn matching_rls_policies_for_kind(
9320        &self,
9321        table: &str,
9322        role: Option<&str>,
9323        action: crate::storage::query::ast::PolicyAction,
9324        kind: crate::storage::query::ast::PolicyTargetKind,
9325    ) -> Vec<crate::storage::query::ast::Filter> {
9326        if !self.is_rls_enabled(table) {
9327            return Vec::new();
9328        }
9329        let policies = self.inner.rls_policies.read();
9330        policies
9331            .iter()
9332            .filter_map(|((t, _), p)| {
9333                if t != table {
9334                    return None;
9335                }
9336                // Kind gate — Table policies also apply to every
9337                // other kind *iff* the policy predicate evaluates
9338                // against entity fields that exist uniformly; the
9339                // caller's kind filter is the stricter check, so
9340                // match literally. Auto-tenancy policies stamp
9341                // Table and the caller passes the concrete kind —
9342                // we allow Table policies to apply cross-kind for
9343                // backwards compat.
9344                if p.target_kind != kind
9345                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
9346                {
9347                    return None;
9348                }
9349                // Action gate — `None` means "ALL" actions.
9350                if let Some(a) = p.action {
9351                    if a != action {
9352                        return None;
9353                    }
9354                }
9355                // Role gate — `None` means "any role".
9356                if let Some(p_role) = p.role.as_deref() {
9357                    match role {
9358                        Some(r) if r == p_role => {}
9359                        _ => return None,
9360                    }
9361                }
9362                Some((*p.using).clone())
9363            })
9364            .collect()
9365    }
9366
9367    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
9368        let store = self.inner.db.store();
9369        if let Some(stats) =
9370            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
9371        {
9372            crate::storage::query::planner::stats_catalog::persist_table_stats(
9373                store.as_ref(),
9374                &stats,
9375            );
9376        } else {
9377            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
9378        }
9379        self.invalidate_plan_cache();
9380    }
9381
9382    pub(crate) fn note_table_write(&self, table: &str) {
9383        // Skip the write lock when the table is already marked
9384        // dirty. With single-row UPDATEs in a loop this used to
9385        // grab the planner_dirty_tables write lock N times even
9386        // though the first call already flipped the flag.
9387        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
9388        if !already_dirty {
9389            self.inner
9390                .planner_dirty_tables
9391                .write()
9392                .insert(table.to_string());
9393        }
9394        self.invalidate_result_cache_for_table(table);
9395    }
9396
9397    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
9398    /// `RuntimeQueryResult` so callers over the SQL interface see the
9399    /// plan tree in the same shape a SELECT produces.
9400    ///
9401    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
9402    /// Nodes are walked depth-first; `depth` counts from 0 at the
9403    /// root so a text renderer can indent without re-walking.
9404    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
9405        let explain = self.explain_query(inner_sql)?;
9406
9407        let columns = vec![
9408            "op".to_string(),
9409            "source".to_string(),
9410            "est_rows".to_string(),
9411            "est_cost".to_string(),
9412            "depth".to_string(),
9413        ];
9414
9415        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
9416
9417        // Prepend `CteScan` markers when the query carried a leading
9418        // WITH clause. The CTE bodies are already inlined into the
9419        // main plan tree, but operators reading EXPLAIN need to see
9420        // which named CTEs were resolved — without this row the plan
9421        // would look indistinguishable from a hand-inlined query.
9422        for name in &explain.cte_materializations {
9423            use std::sync::Arc;
9424            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
9425            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
9426            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
9427            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
9428            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
9429            rec.set_arc(Arc::from("depth"), Value::Integer(0));
9430            records.push(rec);
9431        }
9432
9433        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
9434
9435        let result = crate::storage::query::unified::UnifiedResult {
9436            columns,
9437            records,
9438            stats: Default::default(),
9439            pre_serialized_json: None,
9440        };
9441
9442        Ok(RuntimeQueryResult {
9443            query: raw_query.to_string(),
9444            mode: explain.mode,
9445            statement: "explain",
9446            engine: "runtime-explain",
9447            result,
9448            affected_rows: 0,
9449            statement_type: "select",
9450            bookmark: None,
9451        })
9452    }
9453
9454    // -----------------------------------------------------------------
9455    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
9456    // -----------------------------------------------------------------
9457
9458    /// Project a `QueryExpr` to the (action, resource) pair the
9459    /// privilege engine cares about. Returns `Ok(())` for statements
9460    /// that don't touch user data (transaction control, SHOW, SET, etc.).
9461    pub(crate) fn check_query_privilege(
9462        &self,
9463        expr: &crate::storage::query::ast::QueryExpr,
9464    ) -> Result<(), String> {
9465        use crate::auth::privileges::{Action, AuthzContext, Resource};
9466        use crate::auth::UserId;
9467        use crate::storage::query::ast::QueryExpr;
9468
9469        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
9470        // The bootstrap path itself goes through `execute_query` so this
9471        // is the only sensible default; once auth is wired, the gate
9472        // becomes active.
9473        let auth_store = match self.inner.auth_store.read().clone() {
9474            Some(s) => s,
9475            None => return Ok(()),
9476        };
9477
9478        // Resolve principal + role from the thread-local identity.
9479        // Anonymous (no identity) is allowed to read the bootstrap path
9480        // only when auth_store says so; we treat missing identity as
9481        // platform-admin-equivalent here so embedded test harnesses
9482        // continue to work without setting an identity.
9483        let (username, role) = match current_auth_identity() {
9484            Some(p) => p,
9485            None => return Ok(()),
9486        };
9487        let tenant = current_tenant();
9488
9489        let ctx = AuthzContext {
9490            principal: &username,
9491            effective_role: role,
9492            tenant: tenant.as_deref(),
9493        };
9494        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
9495
9496        // Map QueryExpr → (Action, Resource).
9497        let (action, resource) = match expr {
9498            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
9499            QueryExpr::RankOf(_) | QueryExpr::ApproxRankOf(_) | QueryExpr::RankRange(_) => {
9500                (Action::Select, Resource::Database)
9501            }
9502            QueryExpr::QueueSelect(q) => {
9503                return self.check_queue_op_privilege(
9504                    &auth_store,
9505                    &principal_id,
9506                    role,
9507                    tenant.as_deref(),
9508                    "queue:peek",
9509                    &q.queue,
9510                );
9511            }
9512            QueryExpr::QueueCommand(cmd) => {
9513                use crate::storage::query::ast::QueueCommand;
9514                let (queue, action_verb) = match cmd {
9515                    QueueCommand::Push { queue, .. } => (queue.as_str(), "queue:enqueue"),
9516                    QueueCommand::Pop { queue, .. }
9517                    | QueueCommand::GroupRead { queue, .. }
9518                    | QueueCommand::Claim { queue, .. } => (queue.as_str(), "queue:read"),
9519                    QueueCommand::Peek { queue, .. }
9520                    | QueueCommand::Len { queue }
9521                    | QueueCommand::Pending { queue, .. } => (queue.as_str(), "queue:peek"),
9522                    QueueCommand::Ack { queue, .. } => (queue.as_str(), "queue:ack"),
9523                    QueueCommand::Nack {
9524                        queue, delay_ms, ..
9525                    } => {
9526                        // Per-failure retry overrides re-shape retry
9527                        // behaviour for everyone draining the queue and
9528                        // gate on the dedicated `queue:retry` verb so
9529                        // operators can grant base NACK without granting
9530                        // the override capability.
9531                        let verb = if delay_ms.is_some() {
9532                            "queue:retry"
9533                        } else {
9534                            "queue:nack"
9535                        };
9536                        (queue.as_str(), verb)
9537                    }
9538                    QueueCommand::Purge { queue } => (queue.as_str(), "queue:purge"),
9539                    // `GroupCreate` is part of the consumer-setup
9540                    // surface — read-side, never destructive.
9541                    QueueCommand::GroupCreate { queue, .. } => (queue.as_str(), "queue:read"),
9542                    QueueCommand::Move { source, .. } => (source.as_str(), "queue:dlq:move"),
9543                };
9544                return self.check_queue_op_privilege(
9545                    &auth_store,
9546                    &principal_id,
9547                    role,
9548                    tenant.as_deref(),
9549                    action_verb,
9550                    queue,
9551                );
9552            }
9553            QueryExpr::Graph(g) => {
9554                // MATCH … RETURN is the explorer's pattern-traversal
9555                // surface — gate on `graph:traverse` (#757).
9556                self.check_graph_op_privilege(
9557                    &auth_store,
9558                    &principal_id,
9559                    role,
9560                    tenant.as_deref(),
9561                    "graph:traverse",
9562                )?;
9563                if auth_store.iam_authorization_enabled() {
9564                    self.check_graph_property_projection_privilege(
9565                        &auth_store,
9566                        &principal_id,
9567                        role,
9568                        tenant.as_deref(),
9569                        g,
9570                    )?;
9571                    return Ok(());
9572                }
9573                return Ok(());
9574            }
9575            QueryExpr::Path(_) => {
9576                // PATH FROM … TO … is a path-traversal query — gates
9577                // on `graph:traverse` like neighborhood/shortest-path
9578                // (#757).
9579                return self.check_graph_op_privilege(
9580                    &auth_store,
9581                    &principal_id,
9582                    role,
9583                    tenant.as_deref(),
9584                    "graph:traverse",
9585                );
9586            }
9587            QueryExpr::GraphCommand(cmd) => {
9588                use crate::storage::query::ast::GraphCommand;
9589                let action_verb = match cmd {
9590                    // Metadata / property reads.
9591                    GraphCommand::Properties { .. } => "graph:read",
9592                    // Traversal / pattern-walk surface.
9593                    GraphCommand::Neighborhood { .. }
9594                    | GraphCommand::Traverse { .. }
9595                    | GraphCommand::ShortestPath { .. } => "graph:traverse",
9596                    // Analytics algorithms — expensive enough that Red
9597                    // UI needs to gate the runner independently of
9598                    // ordinary traversal.
9599                    GraphCommand::Centrality { .. }
9600                    | GraphCommand::Community { .. }
9601                    | GraphCommand::Components { .. }
9602                    | GraphCommand::Cycles { .. }
9603                    | GraphCommand::Clustering
9604                    | GraphCommand::TopologicalSort => "graph:algorithm:run",
9605                };
9606                return self.check_graph_op_privilege(
9607                    &auth_store,
9608                    &principal_id,
9609                    role,
9610                    tenant.as_deref(),
9611                    action_verb,
9612                );
9613            }
9614            QueryExpr::Vector(v) => {
9615                if auth_store.iam_authorization_enabled() {
9616                    self.check_vector_op_privilege(
9617                        &auth_store,
9618                        &principal_id,
9619                        role,
9620                        tenant.as_deref(),
9621                        "vector:search",
9622                        &v.collection,
9623                    )?;
9624                    self.check_table_like_column_projection_privilege(
9625                        &auth_store,
9626                        &principal_id,
9627                        role,
9628                        tenant.as_deref(),
9629                        &v.collection,
9630                        &["content".to_string()],
9631                    )?;
9632                    return Ok(());
9633                }
9634                return Ok(());
9635            }
9636            QueryExpr::SearchCommand(cmd) => {
9637                use crate::storage::query::ast::SearchCommand;
9638                if auth_store.iam_authorization_enabled() {
9639                    // `SEARCH SIMILAR [..] COLLECTION <c>` and `SEARCH
9640                    // HYBRID ... COLLECTION <c>` are the same UI
9641                    // affordances as `VECTOR SEARCH` / hybrid joins —
9642                    // Red UI must see the same `vector:search` envelope
9643                    // so a single toolbar grant is sufficient.
9644                    let collection = match cmd {
9645                        SearchCommand::Similar { collection, .. }
9646                        | SearchCommand::Hybrid { collection, .. } => Some(collection.as_str()),
9647                        _ => None,
9648                    };
9649                    if let Some(c) = collection {
9650                        self.check_vector_op_privilege(
9651                            &auth_store,
9652                            &principal_id,
9653                            role,
9654                            tenant.as_deref(),
9655                            "vector:search",
9656                            c,
9657                        )?;
9658                        return Ok(());
9659                    }
9660                }
9661                return Ok(());
9662            }
9663            QueryExpr::Hybrid(h) => {
9664                if auth_store.iam_authorization_enabled() {
9665                    // The vector half of a hybrid search is gated under
9666                    // the same `vector:search` verb as a standalone
9667                    // VECTOR SEARCH — Red UI's hybrid-search toolbar
9668                    // must surface the same UI-safe denial envelope
9669                    // when the principal lacks the grant. The
9670                    // structured half is dispatched to its own gate via
9671                    // the inner query during execution.
9672                    self.check_vector_op_privilege(
9673                        &auth_store,
9674                        &principal_id,
9675                        role,
9676                        tenant.as_deref(),
9677                        "vector:search",
9678                        &h.vector.collection,
9679                    )?;
9680                    return Ok(());
9681                }
9682                return Ok(());
9683            }
9684            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
9685            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
9686            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
9687            // Joins inherit the read privilege from any constituent
9688            // table — for now we emit a single Select on the database
9689            // (admins bypass; non-admins need a Database/Schema grant).
9690            QueryExpr::Join(_) => (Action::Select, Resource::Database),
9691            // GRANT / REVOKE / USER DDL are authority statements;
9692            // require Admin (the helper methods enforce).
9693            QueryExpr::Grant(_)
9694            | QueryExpr::Revoke(_)
9695            | QueryExpr::AlterUser(_)
9696            | QueryExpr::CreateUser(_) => {
9697                return if role == crate::auth::Role::Admin {
9698                    Ok(())
9699                } else {
9700                    Err(format!(
9701                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9702                        username, role
9703                    ))
9704                };
9705            }
9706            QueryExpr::CreateIamPolicy { id, .. } => {
9707                return self.check_policy_management_privilege(
9708                    &auth_store,
9709                    &principal_id,
9710                    role,
9711                    tenant.as_deref(),
9712                    "policy:put",
9713                    "policy",
9714                    id,
9715                );
9716            }
9717            QueryExpr::DropIamPolicy { id } => {
9718                return self.check_policy_management_privilege(
9719                    &auth_store,
9720                    &principal_id,
9721                    role,
9722                    tenant.as_deref(),
9723                    "policy:drop",
9724                    "policy",
9725                    id,
9726                );
9727            }
9728            QueryExpr::AttachPolicy { policy_id, .. } => {
9729                return self.check_policy_management_privilege(
9730                    &auth_store,
9731                    &principal_id,
9732                    role,
9733                    tenant.as_deref(),
9734                    "policy:attach",
9735                    "policy",
9736                    policy_id,
9737                );
9738            }
9739            QueryExpr::DetachPolicy { policy_id, .. } => {
9740                return self.check_policy_management_privilege(
9741                    &auth_store,
9742                    &principal_id,
9743                    role,
9744                    tenant.as_deref(),
9745                    "policy:detach",
9746                    "policy",
9747                    policy_id,
9748                );
9749            }
9750            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
9751                return Ok(());
9752            }
9753            QueryExpr::SimulatePolicy { .. } => {
9754                return self.check_policy_management_privilege(
9755                    &auth_store,
9756                    &principal_id,
9757                    role,
9758                    tenant.as_deref(),
9759                    "policy:simulate",
9760                    "policy",
9761                    "*",
9762                );
9763            }
9764            QueryExpr::LintPolicy { .. } => {
9765                // Linting is a read-only inspection — gate it like
9766                // simulate (policy management role).
9767                return self.check_policy_management_privilege(
9768                    &auth_store,
9769                    &principal_id,
9770                    role,
9771                    tenant.as_deref(),
9772                    "policy:simulate",
9773                    "policy",
9774                    "*",
9775                );
9776            }
9777            QueryExpr::MigratePolicyMode { dry_run, .. } => {
9778                // DRY RUN is a pre-flight inspection (policy:simulate).
9779                // The actual mode flip is a privileged mutation under
9780                // the policy:put action (it persists a new enforcement
9781                // mode to the vault KV through `set_enforcement_mode`).
9782                let action = if *dry_run {
9783                    "policy:simulate"
9784                } else {
9785                    "policy:put"
9786                };
9787                return self.check_policy_management_privilege(
9788                    &auth_store,
9789                    &principal_id,
9790                    role,
9791                    tenant.as_deref(),
9792                    action,
9793                    "policy",
9794                    "*",
9795                );
9796            }
9797            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
9798            // when IAM mode is active. Other DDL stays role-only for now.
9799            QueryExpr::DropTable(q) => {
9800                return self.check_ddl_collection_privilege(
9801                    &auth_store,
9802                    &principal_id,
9803                    role,
9804                    tenant.as_deref(),
9805                    &username,
9806                    "drop",
9807                    &q.name,
9808                );
9809            }
9810            QueryExpr::DropGraph(q) => {
9811                return self.check_ddl_collection_privilege(
9812                    &auth_store,
9813                    &principal_id,
9814                    role,
9815                    tenant.as_deref(),
9816                    &username,
9817                    "drop",
9818                    &q.name,
9819                );
9820            }
9821            QueryExpr::DropVector(q) => {
9822                return self.check_ddl_collection_privilege(
9823                    &auth_store,
9824                    &principal_id,
9825                    role,
9826                    tenant.as_deref(),
9827                    &username,
9828                    "drop",
9829                    &q.name,
9830                );
9831            }
9832            QueryExpr::DropDocument(q) => {
9833                return self.check_ddl_collection_privilege(
9834                    &auth_store,
9835                    &principal_id,
9836                    role,
9837                    tenant.as_deref(),
9838                    &username,
9839                    "drop",
9840                    &q.name,
9841                );
9842            }
9843            QueryExpr::DropKv(q) => {
9844                return self.check_ddl_collection_privilege(
9845                    &auth_store,
9846                    &principal_id,
9847                    role,
9848                    tenant.as_deref(),
9849                    &username,
9850                    "drop",
9851                    &q.name,
9852                );
9853            }
9854            QueryExpr::DropCollection(q) => {
9855                return self.check_ddl_collection_privilege(
9856                    &auth_store,
9857                    &principal_id,
9858                    role,
9859                    tenant.as_deref(),
9860                    &username,
9861                    "drop",
9862                    &q.name,
9863                );
9864            }
9865            QueryExpr::Truncate(q) => {
9866                return self.check_ddl_collection_privilege(
9867                    &auth_store,
9868                    &principal_id,
9869                    role,
9870                    tenant.as_deref(),
9871                    &username,
9872                    "truncate",
9873                    &q.name,
9874                );
9875            }
9876            // Remaining DDL (#753) — hybrid policy-aware gate. Specific
9877            // create/alter/drop verbs gate operations with a clear
9878            // per-collection target so Red UI can author fine-grained
9879            // policies (`create on collection:users`). Namespace-level
9880            // and grouped DDL fall back to broader `schema:admin` /
9881            // `schema:write` verbs against a `schema:<name>` resource.
9882            // All branches share the [`check_ddl_object_privilege`]
9883            // helper so allows / denies produce the same structured
9884            // "principal=… action=… resource=<kind>:<name> denied by
9885            // IAM policy" reason the Red UI security read contracts
9886            // (#740) already render.
9887            QueryExpr::CreateTable(q) => {
9888                return self.check_ddl_object_privilege(
9889                    &auth_store,
9890                    &principal_id,
9891                    role,
9892                    tenant.as_deref(),
9893                    &username,
9894                    "create",
9895                    "collection",
9896                    &q.name,
9897                    crate::auth::Role::Write,
9898                );
9899            }
9900            QueryExpr::CreateCollection(q) => {
9901                return self.check_ddl_object_privilege(
9902                    &auth_store,
9903                    &principal_id,
9904                    role,
9905                    tenant.as_deref(),
9906                    &username,
9907                    "create",
9908                    "collection",
9909                    &q.name,
9910                    crate::auth::Role::Write,
9911                );
9912            }
9913            QueryExpr::CreateVector(q) => {
9914                return self.check_ddl_object_privilege(
9915                    &auth_store,
9916                    &principal_id,
9917                    role,
9918                    tenant.as_deref(),
9919                    &username,
9920                    "create",
9921                    "collection",
9922                    &q.name,
9923                    crate::auth::Role::Write,
9924                );
9925            }
9926            QueryExpr::AlterTable(q) => {
9927                return self.check_ddl_object_privilege(
9928                    &auth_store,
9929                    &principal_id,
9930                    role,
9931                    tenant.as_deref(),
9932                    &username,
9933                    "alter",
9934                    "collection",
9935                    &q.name,
9936                    crate::auth::Role::Write,
9937                );
9938            }
9939            QueryExpr::CreateIndex(q) => {
9940                return self.check_ddl_object_privilege(
9941                    &auth_store,
9942                    &principal_id,
9943                    role,
9944                    tenant.as_deref(),
9945                    &username,
9946                    "create",
9947                    "collection",
9948                    &q.table,
9949                    crate::auth::Role::Write,
9950                );
9951            }
9952            QueryExpr::DropIndex(q) => {
9953                return self.check_ddl_object_privilege(
9954                    &auth_store,
9955                    &principal_id,
9956                    role,
9957                    tenant.as_deref(),
9958                    &username,
9959                    "drop",
9960                    "collection",
9961                    &q.table,
9962                    crate::auth::Role::Write,
9963                );
9964            }
9965            QueryExpr::CreateSchema(q) => {
9966                return self.check_ddl_object_privilege(
9967                    &auth_store,
9968                    &principal_id,
9969                    role,
9970                    tenant.as_deref(),
9971                    &username,
9972                    "schema:admin",
9973                    "schema",
9974                    &q.name,
9975                    crate::auth::Role::Admin,
9976                );
9977            }
9978            QueryExpr::DropSchema(q) => {
9979                return self.check_ddl_object_privilege(
9980                    &auth_store,
9981                    &principal_id,
9982                    role,
9983                    tenant.as_deref(),
9984                    &username,
9985                    "schema:admin",
9986                    "schema",
9987                    &q.name,
9988                    crate::auth::Role::Admin,
9989                );
9990            }
9991            QueryExpr::CreateSequence(q) => {
9992                return self.check_ddl_object_privilege(
9993                    &auth_store,
9994                    &principal_id,
9995                    role,
9996                    tenant.as_deref(),
9997                    &username,
9998                    "create",
9999                    "collection",
10000                    &q.name,
10001                    crate::auth::Role::Write,
10002                );
10003            }
10004            QueryExpr::DropSequence(q) => {
10005                return self.check_ddl_object_privilege(
10006                    &auth_store,
10007                    &principal_id,
10008                    role,
10009                    tenant.as_deref(),
10010                    &username,
10011                    "drop",
10012                    "collection",
10013                    &q.name,
10014                    crate::auth::Role::Write,
10015                );
10016            }
10017            QueryExpr::CreateView(q) => {
10018                return self.check_ddl_object_privilege(
10019                    &auth_store,
10020                    &principal_id,
10021                    role,
10022                    tenant.as_deref(),
10023                    &username,
10024                    "create",
10025                    "collection",
10026                    &q.name,
10027                    crate::auth::Role::Write,
10028                );
10029            }
10030            QueryExpr::DropView(q) => {
10031                return self.check_ddl_object_privilege(
10032                    &auth_store,
10033                    &principal_id,
10034                    role,
10035                    tenant.as_deref(),
10036                    &username,
10037                    "drop",
10038                    "collection",
10039                    &q.name,
10040                    crate::auth::Role::Write,
10041                );
10042            }
10043            QueryExpr::RefreshMaterializedView(q) => {
10044                return self.check_ddl_object_privilege(
10045                    &auth_store,
10046                    &principal_id,
10047                    role,
10048                    tenant.as_deref(),
10049                    &username,
10050                    "alter",
10051                    "collection",
10052                    &q.name,
10053                    crate::auth::Role::Write,
10054                );
10055            }
10056            QueryExpr::CreatePolicy(q) => {
10057                return self.check_ddl_object_privilege(
10058                    &auth_store,
10059                    &principal_id,
10060                    role,
10061                    tenant.as_deref(),
10062                    &username,
10063                    "create",
10064                    "collection",
10065                    &q.table,
10066                    crate::auth::Role::Write,
10067                );
10068            }
10069            QueryExpr::DropPolicy(q) => {
10070                return self.check_ddl_object_privilege(
10071                    &auth_store,
10072                    &principal_id,
10073                    role,
10074                    tenant.as_deref(),
10075                    &username,
10076                    "drop",
10077                    "collection",
10078                    &q.table,
10079                    crate::auth::Role::Write,
10080                );
10081            }
10082            QueryExpr::CreateServer(q) => {
10083                return self.check_ddl_object_privilege(
10084                    &auth_store,
10085                    &principal_id,
10086                    role,
10087                    tenant.as_deref(),
10088                    &username,
10089                    "schema:admin",
10090                    "schema",
10091                    &q.name,
10092                    crate::auth::Role::Admin,
10093                );
10094            }
10095            QueryExpr::DropServer(q) => {
10096                return self.check_ddl_object_privilege(
10097                    &auth_store,
10098                    &principal_id,
10099                    role,
10100                    tenant.as_deref(),
10101                    &username,
10102                    "schema:admin",
10103                    "schema",
10104                    &q.name,
10105                    crate::auth::Role::Admin,
10106                );
10107            }
10108            QueryExpr::CreateForeignTable(q) => {
10109                return self.check_ddl_object_privilege(
10110                    &auth_store,
10111                    &principal_id,
10112                    role,
10113                    tenant.as_deref(),
10114                    &username,
10115                    "schema:write",
10116                    "schema",
10117                    &q.name,
10118                    crate::auth::Role::Write,
10119                );
10120            }
10121            QueryExpr::DropForeignTable(q) => {
10122                return self.check_ddl_object_privilege(
10123                    &auth_store,
10124                    &principal_id,
10125                    role,
10126                    tenant.as_deref(),
10127                    &username,
10128                    "schema:write",
10129                    "schema",
10130                    &q.name,
10131                    crate::auth::Role::Write,
10132                );
10133            }
10134            QueryExpr::CreateTimeSeries(q) => {
10135                return self.check_ddl_object_privilege(
10136                    &auth_store,
10137                    &principal_id,
10138                    role,
10139                    tenant.as_deref(),
10140                    &username,
10141                    "create",
10142                    "collection",
10143                    &q.name,
10144                    crate::auth::Role::Write,
10145                );
10146            }
10147            QueryExpr::CreateMetric(q) => {
10148                return self.check_ddl_object_privilege(
10149                    &auth_store,
10150                    &principal_id,
10151                    role,
10152                    tenant.as_deref(),
10153                    &username,
10154                    "create",
10155                    "collection",
10156                    &q.path,
10157                    crate::auth::Role::Write,
10158                );
10159            }
10160            QueryExpr::AlterMetric(q) => {
10161                return self.check_ddl_object_privilege(
10162                    &auth_store,
10163                    &principal_id,
10164                    role,
10165                    tenant.as_deref(),
10166                    &username,
10167                    "alter",
10168                    "collection",
10169                    &q.path,
10170                    crate::auth::Role::Write,
10171                );
10172            }
10173            QueryExpr::CreateSlo(q) => {
10174                return self.check_ddl_object_privilege(
10175                    &auth_store,
10176                    &principal_id,
10177                    role,
10178                    tenant.as_deref(),
10179                    &username,
10180                    "create",
10181                    "collection",
10182                    &q.path,
10183                    crate::auth::Role::Write,
10184                );
10185            }
10186            QueryExpr::DropTimeSeries(q) => {
10187                return self.check_ddl_object_privilege(
10188                    &auth_store,
10189                    &principal_id,
10190                    role,
10191                    tenant.as_deref(),
10192                    &username,
10193                    "drop",
10194                    "collection",
10195                    &q.name,
10196                    crate::auth::Role::Write,
10197                );
10198            }
10199            QueryExpr::CreateQueue(q) => {
10200                return self.check_ddl_object_privilege(
10201                    &auth_store,
10202                    &principal_id,
10203                    role,
10204                    tenant.as_deref(),
10205                    &username,
10206                    "create",
10207                    "collection",
10208                    &q.name,
10209                    crate::auth::Role::Write,
10210                );
10211            }
10212            QueryExpr::AlterQueue(q) => {
10213                return self.check_ddl_object_privilege(
10214                    &auth_store,
10215                    &principal_id,
10216                    role,
10217                    tenant.as_deref(),
10218                    &username,
10219                    "alter",
10220                    "collection",
10221                    &q.name,
10222                    crate::auth::Role::Write,
10223                );
10224            }
10225            QueryExpr::DropQueue(q) => {
10226                return self.check_ddl_object_privilege(
10227                    &auth_store,
10228                    &principal_id,
10229                    role,
10230                    tenant.as_deref(),
10231                    &username,
10232                    "drop",
10233                    "collection",
10234                    &q.name,
10235                    crate::auth::Role::Write,
10236                );
10237            }
10238            QueryExpr::CreateTree(q) => {
10239                return self.check_ddl_object_privilege(
10240                    &auth_store,
10241                    &principal_id,
10242                    role,
10243                    tenant.as_deref(),
10244                    &username,
10245                    "create",
10246                    "collection",
10247                    &q.collection,
10248                    crate::auth::Role::Write,
10249                );
10250            }
10251            QueryExpr::DropTree(q) => {
10252                return self.check_ddl_object_privilege(
10253                    &auth_store,
10254                    &principal_id,
10255                    role,
10256                    tenant.as_deref(),
10257                    &username,
10258                    "drop",
10259                    "collection",
10260                    &q.collection,
10261                    crate::auth::Role::Write,
10262                );
10263            }
10264            // Migration DDL — CREATE MIGRATION is grouped DDL on the
10265            // schema namespace; uses the `schema:write` fallback verb
10266            // (no obvious per-collection target).
10267            QueryExpr::CreateMigration(q) => {
10268                return self.check_ddl_object_privilege(
10269                    &auth_store,
10270                    &principal_id,
10271                    role,
10272                    tenant.as_deref(),
10273                    &username,
10274                    "schema:write",
10275                    "schema",
10276                    &q.name,
10277                    crate::auth::Role::Write,
10278                );
10279            }
10280            // APPLY / ROLLBACK change data and schema — require Admin.
10281            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
10282                return if role == crate::auth::Role::Admin {
10283                    Ok(())
10284                } else {
10285                    Err(format!(
10286                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
10287                        username, role
10288                    ))
10289                };
10290            }
10291            // EXPLAIN MIGRATION is read-only — any authenticated principal.
10292            QueryExpr::ExplainMigration(_) => return Ok(()),
10293            // Everything else (SET, SHOW, transaction control, graph
10294            // commands, queue/tree commands, MaintenanceCommand …)
10295            // is allowed for any authenticated principal.
10296            _ => return Ok(()),
10297        };
10298
10299        if auth_store.iam_authorization_enabled() {
10300            let iam_action = legacy_action_to_iam(action);
10301            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
10302            let iam_ctx = runtime_iam_context(role, tenant.as_deref());
10303            if !auth_store.check_policy_authz_with_role(
10304                &principal_id,
10305                iam_action,
10306                &iam_resource,
10307                &iam_ctx,
10308                role,
10309            ) {
10310                return Err(format!(
10311                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10312                    username, iam_action, iam_resource.kind, iam_resource.name
10313                ));
10314            }
10315
10316            if let QueryExpr::Table(table) = expr {
10317                self.check_table_column_projection_privilege(
10318                    &auth_store,
10319                    &principal_id,
10320                    &iam_ctx,
10321                    table,
10322                )?;
10323            }
10324
10325            if let QueryExpr::Update(update) = expr {
10326                let columns = update_set_target_columns(update);
10327                if !columns.is_empty() {
10328                    let request = column_access_request_for_table_update(&update.table, columns);
10329                    let outcome =
10330                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
10331                    if let Some(denied) = outcome.first_denied_column() {
10332                        return Err(format!(
10333                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
10334                            username, iam_action, denied.resource.kind, denied.resource.name
10335                        ));
10336                    }
10337                    if !outcome.allowed() {
10338                        return Err(format!(
10339                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10340                            username,
10341                            iam_action,
10342                            outcome.table_resource.kind,
10343                            outcome.table_resource.name
10344                        ));
10345                    }
10346                }
10347
10348                if let Some(columns) = update_returning_columns_for_policy(self, update) {
10349                    let request = column_access_request_for_table_select(&update.table, columns);
10350                    let outcome =
10351                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
10352                    if let Some(denied) = outcome.first_denied_column() {
10353                        return Err(format!(
10354                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
10355                            username, denied.resource.kind, denied.resource.name
10356                        ));
10357                    }
10358                    if !outcome.allowed() {
10359                        return Err(format!(
10360                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10361                            username, outcome.table_resource.kind, outcome.table_resource.name
10362                        ));
10363                    }
10364                }
10365            }
10366
10367            Ok(())
10368        } else {
10369            auth_store
10370                .check_grant(&ctx, action, &resource)
10371                .map_err(|e| e.to_string())
10372        }
10373    }
10374
10375    fn check_table_column_projection_privilege(
10376        &self,
10377        auth_store: &Arc<crate::auth::store::AuthStore>,
10378        principal: &crate::auth::UserId,
10379        ctx: &crate::auth::policies::EvalContext,
10380        table: &crate::storage::query::ast::TableQuery,
10381    ) -> Result<(), String> {
10382        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
10383
10384        let columns = requested_table_columns_for_policy(table);
10385        if columns.is_empty() {
10386            return Ok(());
10387        }
10388
10389        let request = ColumnAccessRequest::select(table.table.clone(), columns);
10390        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
10391        if outcome.allowed() {
10392            return Ok(());
10393        }
10394
10395        if !matches!(
10396            outcome.table_decision,
10397            crate::auth::policies::Decision::Allow { .. }
10398                | crate::auth::policies::Decision::AdminBypass
10399        ) {
10400            return Err(format!(
10401                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10402                principal, outcome.table_resource.kind, outcome.table_resource.name
10403            ));
10404        }
10405
10406        let denied = outcome
10407            .first_denied_column()
10408            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
10409        match denied {
10410            Some(decision) => Err(format!(
10411                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10412                principal, decision.resource.kind, decision.resource.name
10413            )),
10414            None => Ok(()),
10415        }
10416    }
10417
10418    fn check_graph_property_projection_privilege(
10419        &self,
10420        auth_store: &Arc<crate::auth::store::AuthStore>,
10421        principal: &crate::auth::UserId,
10422        role: crate::auth::Role,
10423        tenant: Option<&str>,
10424        query: &crate::storage::query::ast::GraphQuery,
10425    ) -> Result<(), String> {
10426        let columns = explicit_graph_projection_properties(query);
10427        if columns.is_empty() {
10428            return Ok(());
10429        }
10430        self.check_table_like_column_projection_privilege(
10431            auth_store, principal, role, tenant, "graph", &columns,
10432        )
10433    }
10434
10435    fn check_table_like_column_projection_privilege(
10436        &self,
10437        auth_store: &Arc<crate::auth::store::AuthStore>,
10438        principal: &crate::auth::UserId,
10439        role: crate::auth::Role,
10440        tenant: Option<&str>,
10441        table: &str,
10442        columns: &[String],
10443    ) -> Result<(), String> {
10444        let iam_ctx = runtime_iam_context(role, tenant);
10445        let request =
10446            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
10447        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
10448        if outcome.allowed() {
10449            return Ok(());
10450        }
10451        let denied = outcome
10452            .first_denied_column()
10453            .map(|d| d.resource.name.clone())
10454            .unwrap_or_else(|| format!("{table}.<unknown>"));
10455        Err(format!(
10456            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
10457            principal, denied
10458        ))
10459    }
10460
10461    fn check_policy_management_privilege(
10462        &self,
10463        auth_store: &Arc<crate::auth::store::AuthStore>,
10464        principal: &crate::auth::UserId,
10465        role: crate::auth::Role,
10466        tenant: Option<&str>,
10467        action: &str,
10468        resource_kind: &str,
10469        resource_name: &str,
10470    ) -> Result<(), String> {
10471        let ctx = runtime_iam_context(role, tenant);
10472
10473        if !auth_store.iam_authorization_enabled() {
10474            return if role == crate::auth::Role::Admin {
10475                Ok(())
10476            } else {
10477                Err(format!(
10478                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
10479                    principal, role
10480                ))
10481            };
10482        }
10483
10484        if resource_kind == "policy"
10485            && matches!(
10486                action,
10487                "policy:put" | "policy:drop" | "policy:attach" | "policy:detach"
10488            )
10489            && self
10490                .inner
10491                .config_registry
10492                .get_active(resource_name)
10493                .map(|entry| entry.managed)
10494                .unwrap_or(false)
10495        {
10496            return Ok(());
10497        }
10498
10499        let mut resource = crate::auth::policies::ResourceRef::new(
10500            resource_kind.to_string(),
10501            resource_name.to_string(),
10502        );
10503        if let Some(t) = tenant {
10504            resource = resource.with_tenant(t.to_string());
10505        }
10506        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10507            Ok(())
10508        } else {
10509            Err(format!(
10510                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10511                principal, action, resource.kind, resource.name
10512            ))
10513        }
10514    }
10515
10516    fn check_managed_config_write_for_set_config(&self, key: &str) -> RedDBResult<()> {
10517        let Some(auth_store) = self.inner.auth_store.read().clone() else {
10518            return Ok(());
10519        };
10520        let (username, role) = current_auth_identity()
10521            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10522        let tenant = current_tenant();
10523        let principal = crate::auth::UserId::from_parts(tenant.as_deref(), &username);
10524        let ctx = runtime_iam_context(role, tenant.as_deref());
10525        let gate = crate::auth::managed_config::ManagedConfigGate::new(
10526            self.inner.config_registry.as_ref(),
10527        );
10528        match gate.check_write(&auth_store, &principal, &ctx, key) {
10529            crate::auth::managed_config::ManagedConfigDecision::PassThrough { .. }
10530            | crate::auth::managed_config::ManagedConfigDecision::Allow { .. } => Ok(()),
10531            crate::auth::managed_config::ManagedConfigDecision::Deny { reason, .. } => {
10532                Err(RedDBError::Query(format!(
10533                    "permission denied: managed config mutation blocked for `{key}`: {reason}"
10534                )))
10535            }
10536        }
10537    }
10538
10539    /// IAM privilege check for a granular queue operation (issue #755 /
10540    /// PRD #735).
10541    ///
10542    /// Each queue operation maps to a stable verb in
10543    /// [`crate::auth::action_catalog`] (`queue:enqueue`, `queue:read`,
10544    /// `queue:peek`, `queue:ack`, `queue:nack`, `queue:retry`,
10545    /// `queue:dlq:move`, `queue:purge`, `queue:presence:read`). The
10546    /// resource is `queue:<name>` scoped to the current tenant. In
10547    /// legacy mode (no IAM authorization configured) the check is a
10548    /// no-op — the role gates in `execute_queue_command` still apply
10549    /// and the legacy `select` / `write` grant table continues to
10550    /// govern queue access. In IAM-enabled mode a missing granular
10551    /// grant yields a structured, UI-safe error of the form
10552    /// `principal=… action=queue:… resource=queue:… denied by IAM
10553    /// policy` so Red UI can surface the failing toolbar action.
10554    fn check_queue_op_privilege(
10555        &self,
10556        auth_store: &Arc<crate::auth::store::AuthStore>,
10557        principal: &crate::auth::UserId,
10558        role: crate::auth::Role,
10559        tenant: Option<&str>,
10560        action: &str,
10561        queue: &str,
10562    ) -> Result<(), String> {
10563        if !auth_store.iam_authorization_enabled() {
10564            return Ok(());
10565        }
10566        let mut resource =
10567            crate::auth::policies::ResourceRef::new("queue".to_string(), queue.to_string());
10568        if let Some(t) = tenant {
10569            resource = resource.with_tenant(t.to_string());
10570        }
10571        let ctx = runtime_iam_context(role, tenant);
10572        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10573            Ok(())
10574        } else {
10575            Err(format!(
10576                "principal=`{}` action=`{}` resource=`queue:{}` denied by IAM policy",
10577                principal, action, queue
10578            ))
10579        }
10580    }
10581
10582    /// IAM privilege check for a graph operation (issue #757 / PRD
10583    /// #735).
10584    ///
10585    /// Each graph operation maps to a stable verb in
10586    /// [`crate::auth::action_catalog`] — `graph:read` for
10587    /// metadata/property lookups, `graph:traverse` for MATCH / PATH /
10588    /// NEIGHBORHOOD / TRAVERSE / SHORTEST_PATH, and
10589    /// `graph:algorithm:run` for analytics algorithms (centrality,
10590    /// community, components, cycles, clustering, topological sort).
10591    /// The resource is `graph:*` scoped to the current tenant — the
10592    /// runtime today operates on a singleton graph store so the name
10593    /// has no concrete identifier; policies grant the explorer
10594    /// surface by writing `graph:*` as the resource pattern.
10595    ///
10596    /// In legacy mode (no IAM authorization configured) the check is
10597    /// a no-op so the existing role-based defaults continue to
10598    /// govern. In IAM-enabled mode a missing grant produces the
10599    /// UI-safe envelope `principal=… action=graph:… resource=graph:*
10600    /// denied by IAM policy` Red UI keys on.
10601    fn check_graph_op_privilege(
10602        &self,
10603        auth_store: &Arc<crate::auth::store::AuthStore>,
10604        principal: &crate::auth::UserId,
10605        role: crate::auth::Role,
10606        tenant: Option<&str>,
10607        action: &str,
10608    ) -> Result<(), String> {
10609        if !auth_store.iam_authorization_enabled() {
10610            return Ok(());
10611        }
10612        let mut resource =
10613            crate::auth::policies::ResourceRef::new("graph".to_string(), "*".to_string());
10614        if let Some(t) = tenant {
10615            resource = resource.with_tenant(t.to_string());
10616        }
10617        let ctx = runtime_iam_context(role, tenant);
10618        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10619            Ok(())
10620        } else {
10621            Err(format!(
10622                "principal=`{}` action=`{}` resource=`graph:*` denied by IAM policy",
10623                principal, action
10624            ))
10625        }
10626    }
10627
10628    /// IAM privilege check for a granular vector operation (issue #756
10629    /// / PRD #735).
10630    ///
10631    /// Each vector operation maps to a stable verb in
10632    /// [`crate::auth::action_catalog`] (`vector:read`, `vector:search`,
10633    /// `vector:artifact:read`, `vector:artifact:rebuild`,
10634    /// `vector:admin`). The resource is `vector:<collection>` scoped to
10635    /// the current tenant. In legacy mode (no IAM authorization
10636    /// configured) the check is a no-op — the role gates and existing
10637    /// `select` / column-projection grants continue to govern access.
10638    /// In IAM-enabled mode a missing granular grant yields a
10639    /// structured, UI-safe error of the form `principal=…
10640    /// action=vector:… resource=vector:… denied by IAM policy` so Red
10641    /// UI can surface the failing toolbar action.
10642    fn check_vector_op_privilege(
10643        &self,
10644        auth_store: &Arc<crate::auth::store::AuthStore>,
10645        principal: &crate::auth::UserId,
10646        role: crate::auth::Role,
10647        tenant: Option<&str>,
10648        action: &str,
10649        collection: &str,
10650    ) -> Result<(), String> {
10651        if !auth_store.iam_authorization_enabled() {
10652            return Ok(());
10653        }
10654        let mut resource =
10655            crate::auth::policies::ResourceRef::new("vector".to_string(), collection.to_string());
10656        if let Some(t) = tenant {
10657            resource = resource.with_tenant(t.to_string());
10658        }
10659        let ctx = runtime_iam_context(role, tenant);
10660        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10661            Ok(())
10662        } else {
10663            Err(format!(
10664                "principal=`{}` action=`{}` resource=`vector:{}` denied by IAM policy",
10665                principal, action, collection
10666            ))
10667        }
10668    }
10669
10670    /// IAM privilege check for DROP / TRUNCATE on a named collection.
10671    ///
10672    /// Delegates to [`check_ddl_object_privilege`] with `resource_kind =
10673    /// "collection"`. Kept as a thin wrapper so the existing DROP/TRUNCATE
10674    /// callsites stay readable.
10675    fn check_ddl_collection_privilege(
10676        &self,
10677        auth_store: &Arc<crate::auth::store::AuthStore>,
10678        principal: &crate::auth::UserId,
10679        role: crate::auth::Role,
10680        tenant: Option<&str>,
10681        username: &str,
10682        action: &str,
10683        collection: &str,
10684    ) -> Result<(), String> {
10685        self.check_ddl_object_privilege(
10686            auth_store,
10687            principal,
10688            role,
10689            tenant,
10690            username,
10691            action,
10692            "collection",
10693            collection,
10694            crate::auth::Role::Write,
10695        )
10696    }
10697
10698    /// Generalised IAM privilege check for DDL on a named object.
10699    ///
10700    /// `action` is the stable verb advertised through the action catalog
10701    /// (`create`, `alter`, `drop`, `truncate`, `schema:write`,
10702    /// `schema:admin`). `resource_kind` / `resource_name` form the policy
10703    /// resource (`collection:<name>`, `schema:<name>`). `min_role` is the
10704    /// legacy gate when IAM is not yet enabled.
10705    ///
10706    /// Behaviour:
10707    /// * Role below `min_role` → structured "principal=… role=… cannot
10708    ///   issue DDL" denial, audit recorded.
10709    /// * IAM disabled → audit-record success and allow (legacy path).
10710    /// * IAM enabled → call `check_policy_authz_with_role`. Explicit Deny
10711    ///   and DefaultDeny in PolicyOnly mode both produce a UI-safe
10712    ///   "principal=… action=… resource=<kind>:<name> denied by IAM
10713    ///   policy" string. Explicit Allow and the LegacyRbac fallback
10714    ///   allow the action.
10715    #[allow(clippy::too_many_arguments)]
10716    fn check_ddl_object_privilege(
10717        &self,
10718        auth_store: &Arc<crate::auth::store::AuthStore>,
10719        principal: &crate::auth::UserId,
10720        role: crate::auth::Role,
10721        tenant: Option<&str>,
10722        username: &str,
10723        action: &str,
10724        resource_kind: &str,
10725        resource_name: &str,
10726        min_role: crate::auth::Role,
10727    ) -> Result<(), String> {
10728        if role < min_role {
10729            let msg = format!(
10730                "principal=`{}` role=`{:?}` cannot issue DDL action=`{}` resource=`{}:{}`",
10731                username, role, action, resource_kind, resource_name
10732            );
10733            self.inner.audit_log.record(
10734                action,
10735                username,
10736                resource_name,
10737                "denied",
10738                crate::json::Value::Null,
10739            );
10740            return Err(msg);
10741        }
10742
10743        if !auth_store.iam_authorization_enabled() {
10744            self.inner.audit_log.record(
10745                action,
10746                username,
10747                resource_name,
10748                "ok",
10749                crate::json::Value::Null,
10750            );
10751            return Ok(());
10752        }
10753
10754        let mut resource = crate::auth::policies::ResourceRef::new(
10755            resource_kind.to_string(),
10756            resource_name.to_string(),
10757        );
10758        if let Some(t) = tenant {
10759            resource = resource.with_tenant(t.to_string());
10760        }
10761        let ctx = runtime_iam_context(role, tenant);
10762        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10763            self.inner.audit_log.record(
10764                action,
10765                username,
10766                resource_name,
10767                "ok",
10768                crate::json::Value::Null,
10769            );
10770            Ok(())
10771        } else {
10772            self.inner.audit_log.record(
10773                action,
10774                username,
10775                resource_name,
10776                "denied",
10777                crate::json::Value::Null,
10778            );
10779            Err(format!(
10780                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10781                username, action, resource_kind, resource_name
10782            ))
10783        }
10784    }
10785
10786    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
10787    fn execute_grant_statement(
10788        &self,
10789        query: &str,
10790        stmt: &crate::storage::query::ast::GrantStmt,
10791    ) -> RedDBResult<RuntimeQueryResult> {
10792        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10793        use crate::auth::UserId;
10794        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10795
10796        let auth_store = self
10797            .inner
10798            .auth_store
10799            .read()
10800            .clone()
10801            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10802
10803        // Granter identity + role.
10804        let (gname, grole) = current_auth_identity().ok_or_else(|| {
10805            RedDBError::Query("GRANT requires an authenticated principal".to_string())
10806        })?;
10807        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
10808        let granter_role = grole;
10809
10810        // Build the action set.
10811        let mut actions: Vec<Action> = Vec::new();
10812        if stmt.all {
10813            actions.push(Action::All);
10814        } else {
10815            for kw in &stmt.actions {
10816                let a = Action::from_keyword(kw).ok_or_else(|| {
10817                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
10818                })?;
10819                actions.push(a);
10820            }
10821        }
10822
10823        // Audit emit (printed; structured emission is Agent #4's lane).
10824        let mut applied = 0usize;
10825        for obj in &stmt.objects {
10826            let resource = match stmt.object_kind {
10827                GrantObjectKind::Table => Resource::Table {
10828                    schema: obj.schema.clone(),
10829                    table: obj.name.clone(),
10830                },
10831                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10832                GrantObjectKind::Database => Resource::Database,
10833                GrantObjectKind::Function => Resource::Function {
10834                    schema: obj.schema.clone(),
10835                    name: obj.name.clone(),
10836                },
10837            };
10838            for principal in &stmt.principals {
10839                let p = match principal {
10840                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10841                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10842                    GrantPrincipalRef::User { tenant, name } => {
10843                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10844                    }
10845                };
10846                // Tenant of the grant follows the granter's tenant
10847                // (cross-tenant guard inside `AuthStore::grant`).
10848                let tenant = granter.tenant.clone();
10849                auth_store
10850                    .grant(
10851                        &granter,
10852                        granter_role,
10853                        p.clone(),
10854                        resource.clone(),
10855                        actions.clone(),
10856                        stmt.with_grant_option,
10857                        tenant.clone(),
10858                    )
10859                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10860
10861                // IAM policy translation: every GRANT also lands as a
10862                // synthetic `_grant_<id>` policy attached to the
10863                // principal so the new evaluator sees it.
10864                if let Some(policy) =
10865                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
10866                {
10867                    let pid = policy.id.clone();
10868                    auth_store
10869                        .put_policy_internal(policy)
10870                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10871                    let attachment = match &p {
10872                        GrantPrincipal::User(uid) => {
10873                            crate::auth::store::PrincipalRef::User(uid.clone())
10874                        }
10875                        GrantPrincipal::Group(group) => {
10876                            crate::auth::store::PrincipalRef::Group(group.clone())
10877                        }
10878                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
10879                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
10880                        ),
10881                    };
10882                    auth_store
10883                        .attach_policy(attachment, &pid)
10884                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10885                }
10886                applied += 1;
10887                tracing::info!(
10888                    target: "audit",
10889                    principal = %granter,
10890                    action = "grant",
10891                    "GRANT applied"
10892                );
10893            }
10894        }
10895
10896        self.invalidate_result_cache();
10897        Ok(RuntimeQueryResult::ok_message(
10898            query.to_string(),
10899            &format!("GRANT applied to {} target(s)", applied),
10900            "grant",
10901        ))
10902    }
10903
10904    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
10905    fn execute_revoke_statement(
10906        &self,
10907        query: &str,
10908        stmt: &crate::storage::query::ast::RevokeStmt,
10909    ) -> RedDBResult<RuntimeQueryResult> {
10910        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10911        use crate::auth::UserId;
10912        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10913
10914        let auth_store = self
10915            .inner
10916            .auth_store
10917            .read()
10918            .clone()
10919            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10920
10921        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10922            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
10923        })?;
10924        let granter_role = grole;
10925
10926        let actions: Vec<Action> = if stmt.all {
10927            vec![Action::All]
10928        } else {
10929            stmt.actions
10930                .iter()
10931                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
10932                .collect()
10933        };
10934
10935        let mut total_removed = 0usize;
10936        for obj in &stmt.objects {
10937            let resource = match stmt.object_kind {
10938                GrantObjectKind::Table => Resource::Table {
10939                    schema: obj.schema.clone(),
10940                    table: obj.name.clone(),
10941                },
10942                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10943                GrantObjectKind::Database => Resource::Database,
10944                GrantObjectKind::Function => Resource::Function {
10945                    schema: obj.schema.clone(),
10946                    name: obj.name.clone(),
10947                },
10948            };
10949            for principal in &stmt.principals {
10950                let p = match principal {
10951                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10952                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10953                    GrantPrincipalRef::User { tenant, name } => {
10954                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10955                    }
10956                };
10957                let removed = auth_store
10958                    .revoke(granter_role, &p, &resource, &actions)
10959                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10960                let _removed_policies =
10961                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
10962                total_removed += removed;
10963            }
10964        }
10965
10966        self.invalidate_result_cache();
10967        Ok(RuntimeQueryResult::ok_message(
10968            query.to_string(),
10969            &format!("REVOKE removed {} grant(s)", total_removed),
10970            "revoke",
10971        ))
10972    }
10973
10974    /// Translate the parsed [`CreateUserStmt`] into an AuthStore user.
10975    fn execute_create_user_statement(
10976        &self,
10977        query: &str,
10978        stmt: &crate::storage::query::ast::CreateUserStmt,
10979    ) -> RedDBResult<RuntimeQueryResult> {
10980        let auth_store = self
10981            .inner
10982            .auth_store
10983            .read()
10984            .clone()
10985            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10986
10987        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10988            RedDBError::Query("CREATE USER requires an authenticated principal".to_string())
10989        })?;
10990        if grole != crate::auth::Role::Admin {
10991            return Err(RedDBError::Query(
10992                "CREATE USER requires Admin role".to_string(),
10993            ));
10994        }
10995
10996        let role = crate::auth::Role::from_str(&stmt.role)
10997            .ok_or_else(|| RedDBError::Query(format!("invalid role `{}`", stmt.role)))?;
10998        let user = auth_store
10999            .create_user_in_tenant(stmt.tenant.as_deref(), &stmt.username, &stmt.password, role)
11000            .map_err(|e| RedDBError::Query(e.to_string()))?;
11001
11002        self.invalidate_result_cache();
11003        let target = crate::auth::UserId::from_parts(user.tenant_id.as_deref(), &user.username);
11004        tracing::info!(
11005            target: "audit",
11006            principal = %target,
11007            role = %role,
11008            action = "create_user",
11009            "CREATE USER applied"
11010        );
11011
11012        Ok(RuntimeQueryResult::ok_message(
11013            query.to_string(),
11014            &format!("CREATE USER {} applied", target),
11015            "create_user",
11016        ))
11017    }
11018
11019    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
11020    fn execute_alter_user_statement(
11021        &self,
11022        query: &str,
11023        stmt: &crate::storage::query::ast::AlterUserStmt,
11024    ) -> RedDBResult<RuntimeQueryResult> {
11025        use crate::auth::privileges::UserAttributes;
11026        use crate::auth::UserId;
11027        use crate::storage::query::ast::AlterUserAttribute;
11028
11029        let auth_store = self
11030            .inner
11031            .auth_store
11032            .read()
11033            .clone()
11034            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11035
11036        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
11037            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
11038        })?;
11039        if grole != crate::auth::Role::Admin {
11040            return Err(RedDBError::Query(
11041                "ALTER USER requires Admin role".to_string(),
11042            ));
11043        }
11044
11045        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
11046
11047        // Apply attributes incrementally — each one reads the current
11048        // record, mutates the relevant field, writes back.
11049        let mut attrs = auth_store.user_attributes(&target);
11050        let mut enable_change: Option<bool> = None;
11051
11052        for a in &stmt.attributes {
11053            match a {
11054                AlterUserAttribute::ValidUntil(ts) => {
11055                    // Parse ISO-ish timestamp → ms since epoch. Fall
11056                    // back to integer-ms parsing for callers that pass
11057                    // `'1234567890123'`.
11058                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
11059                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
11060                    })?;
11061                    attrs.valid_until = Some(ms);
11062                }
11063                AlterUserAttribute::ConnectionLimit(n) => {
11064                    if *n < 0 {
11065                        return Err(RedDBError::Query(
11066                            "CONNECTION LIMIT must be non-negative".to_string(),
11067                        ));
11068                    }
11069                    attrs.connection_limit = Some(*n as u32);
11070                }
11071                AlterUserAttribute::SetSearchPath(p) => {
11072                    attrs.search_path = Some(p.clone());
11073                }
11074                AlterUserAttribute::AddGroup(g) => {
11075                    if !attrs.groups.iter().any(|existing| existing == g) {
11076                        attrs.groups.push(g.clone());
11077                        attrs.groups.sort();
11078                    }
11079                }
11080                AlterUserAttribute::DropGroup(g) => {
11081                    attrs.groups.retain(|existing| existing != g);
11082                }
11083                AlterUserAttribute::Enable => enable_change = Some(true),
11084                AlterUserAttribute::Disable => enable_change = Some(false),
11085                AlterUserAttribute::Password(_) => {
11086                    // Out of scope — accept the AST but no-op so the
11087                    // parser stays compatible with future password
11088                    // rotation work.
11089                }
11090            }
11091        }
11092
11093        auth_store
11094            .set_user_attributes(&target, attrs)
11095            .map_err(|e| RedDBError::Query(e.to_string()))?;
11096        if let Some(en) = enable_change {
11097            auth_store
11098                .set_user_enabled(&target, en)
11099                .map_err(|e| RedDBError::Query(e.to_string()))?;
11100        }
11101        self.invalidate_result_cache();
11102        tracing::info!(
11103            target: "audit",
11104            principal = %target,
11105            action = "alter_user",
11106            "ALTER USER applied"
11107        );
11108
11109        Ok(RuntimeQueryResult::ok_message(
11110            query.to_string(),
11111            &format!("ALTER USER {} applied", target),
11112            "alter_user",
11113        ))
11114    }
11115
11116    // -----------------------------------------------------------------
11117    // IAM policy executors
11118    // -----------------------------------------------------------------
11119
11120    fn execute_create_iam_policy(
11121        &self,
11122        query: &str,
11123        id: &str,
11124        json: &str,
11125    ) -> RedDBResult<RuntimeQueryResult> {
11126        use crate::auth::policies::Policy;
11127
11128        let auth_store = self
11129            .inner
11130            .auth_store
11131            .read()
11132            .clone()
11133            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11134
11135        // Parse + validate. The kernel rejects oversize / bad shape /
11136        // bad action keywords. If the supplied id differs from the JSON
11137        // id, override it with the SQL-provided id (the JSON id is
11138        // optional context — the SQL DDL form is authoritative).
11139        let mut policy = Policy::from_json_str(json)
11140            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
11141        if policy.id != id {
11142            policy.id = id.to_string();
11143        }
11144        let pid = policy.id.clone();
11145        let tenant = current_tenant();
11146        let (actor_name, actor_role) = current_auth_identity()
11147            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11148        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11149        let eval_ctx = runtime_iam_context(actor_role, tenant.as_deref());
11150        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11151        let ledger = self.inner.control_event_ledger.read();
11152        let control = crate::auth::store::PolicyMutationControl {
11153            ctx: &event_ctx,
11154            ledger: ledger.as_ref(),
11155            config: self.inner.control_event_config,
11156            registry: Some(self.inner.config_registry.as_ref()),
11157            actor: &actor,
11158            eval_ctx: &eval_ctx,
11159        };
11160        auth_store
11161            .put_policy_with_control_events(policy, &control)
11162            .map_err(|e| RedDBError::Query(e.to_string()))?;
11163
11164        let principal = actor_name;
11165        tracing::info!(
11166            target: "audit",
11167            principal = %principal,
11168            action = "iam:policy.put",
11169            matched_policy_id = %pid,
11170            "CREATE POLICY applied"
11171        );
11172        self.inner.audit_log.record(
11173            "iam/policy.put",
11174            &principal,
11175            &pid,
11176            "ok",
11177            crate::json::Value::Null,
11178        );
11179
11180        self.invalidate_result_cache();
11181        Ok(RuntimeQueryResult::ok_message(
11182            query.to_string(),
11183            &format!("policy `{pid}` stored"),
11184            "create_iam_policy",
11185        ))
11186    }
11187
11188    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
11189        let auth_store = self
11190            .inner
11191            .auth_store
11192            .read()
11193            .clone()
11194            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11195        let tenant = current_tenant();
11196        let (actor_name, actor_role) = current_auth_identity()
11197            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11198        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11199        let eval_ctx = runtime_iam_context(actor_role, tenant.as_deref());
11200        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11201        let ledger = self.inner.control_event_ledger.read();
11202        let control = crate::auth::store::PolicyMutationControl {
11203            ctx: &event_ctx,
11204            ledger: ledger.as_ref(),
11205            config: self.inner.control_event_config,
11206            registry: Some(self.inner.config_registry.as_ref()),
11207            actor: &actor,
11208            eval_ctx: &eval_ctx,
11209        };
11210        auth_store
11211            .delete_policy_with_control_events(id, &control)
11212            .map_err(|e| RedDBError::Query(e.to_string()))?;
11213
11214        let principal = actor_name;
11215        tracing::info!(
11216            target: "audit",
11217            principal = %principal,
11218            action = "iam:policy.drop",
11219            matched_policy_id = %id,
11220            "DROP POLICY applied"
11221        );
11222        self.inner.audit_log.record(
11223            "iam/policy.drop",
11224            &principal,
11225            id,
11226            "ok",
11227            crate::json::Value::Null,
11228        );
11229
11230        self.invalidate_result_cache();
11231        Ok(RuntimeQueryResult::ok_message(
11232            query.to_string(),
11233            &format!("policy `{id}` dropped"),
11234            "drop_iam_policy",
11235        ))
11236    }
11237
11238    fn execute_attach_policy(
11239        &self,
11240        query: &str,
11241        policy_id: &str,
11242        principal: &crate::storage::query::ast::PolicyPrincipalRef,
11243    ) -> RedDBResult<RuntimeQueryResult> {
11244        use crate::auth::store::PrincipalRef;
11245        use crate::auth::UserId;
11246        use crate::storage::query::ast::PolicyPrincipalRef;
11247
11248        let auth_store = self
11249            .inner
11250            .auth_store
11251            .read()
11252            .clone()
11253            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11254        let p = match principal {
11255            PolicyPrincipalRef::User(u) => {
11256                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
11257            }
11258            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
11259        };
11260        let pretty_target = principal_label(principal);
11261        let tenant = current_tenant();
11262        let (actor_name, actor_role) = current_auth_identity()
11263            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11264        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11265        let eval_ctx = runtime_iam_context(actor_role, tenant.as_deref());
11266        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11267        let ledger = self.inner.control_event_ledger.read();
11268        let control = crate::auth::store::PolicyMutationControl {
11269            ctx: &event_ctx,
11270            ledger: ledger.as_ref(),
11271            config: self.inner.control_event_config,
11272            registry: Some(self.inner.config_registry.as_ref()),
11273            actor: &actor,
11274            eval_ctx: &eval_ctx,
11275        };
11276        auth_store
11277            .attach_policy_with_control_events(p, policy_id, &control)
11278            .map_err(|e| RedDBError::Query(e.to_string()))?;
11279
11280        let principal_str = actor_name;
11281        tracing::info!(
11282            target: "audit",
11283            principal = %principal_str,
11284            action = "iam:policy.attach",
11285            matched_policy_id = %policy_id,
11286            target = %pretty_target,
11287            "ATTACH POLICY applied"
11288        );
11289        self.inner.audit_log.record(
11290            "iam/policy.attach",
11291            &principal_str,
11292            &pretty_target,
11293            "ok",
11294            crate::json::Value::Null,
11295        );
11296
11297        self.invalidate_result_cache();
11298        Ok(RuntimeQueryResult::ok_message(
11299            query.to_string(),
11300            &format!("policy `{policy_id}` attached to {pretty_target}"),
11301            "attach_policy",
11302        ))
11303    }
11304
11305    fn execute_detach_policy(
11306        &self,
11307        query: &str,
11308        policy_id: &str,
11309        principal: &crate::storage::query::ast::PolicyPrincipalRef,
11310    ) -> RedDBResult<RuntimeQueryResult> {
11311        use crate::auth::store::PrincipalRef;
11312        use crate::auth::UserId;
11313        use crate::storage::query::ast::PolicyPrincipalRef;
11314
11315        let auth_store = self
11316            .inner
11317            .auth_store
11318            .read()
11319            .clone()
11320            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11321        let p = match principal {
11322            PolicyPrincipalRef::User(u) => {
11323                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
11324            }
11325            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
11326        };
11327        let pretty_target = principal_label(principal);
11328        let tenant = current_tenant();
11329        let (actor_name, actor_role) = current_auth_identity()
11330            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11331        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11332        let eval_ctx = runtime_iam_context(actor_role, tenant.as_deref());
11333        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11334        let ledger = self.inner.control_event_ledger.read();
11335        let control = crate::auth::store::PolicyMutationControl {
11336            ctx: &event_ctx,
11337            ledger: ledger.as_ref(),
11338            config: self.inner.control_event_config,
11339            registry: Some(self.inner.config_registry.as_ref()),
11340            actor: &actor,
11341            eval_ctx: &eval_ctx,
11342        };
11343        auth_store
11344            .detach_policy_with_control_events(p, policy_id, &control)
11345            .map_err(|e| RedDBError::Query(e.to_string()))?;
11346
11347        let principal_str = actor_name;
11348        tracing::info!(
11349            target: "audit",
11350            principal = %principal_str,
11351            action = "iam:policy.detach",
11352            matched_policy_id = %policy_id,
11353            target = %pretty_target,
11354            "DETACH POLICY applied"
11355        );
11356        self.inner.audit_log.record(
11357            "iam/policy.detach",
11358            &principal_str,
11359            &pretty_target,
11360            "ok",
11361            crate::json::Value::Null,
11362        );
11363
11364        self.invalidate_result_cache();
11365        Ok(RuntimeQueryResult::ok_message(
11366            query.to_string(),
11367            &format!("policy `{policy_id}` detached from {pretty_target}"),
11368            "detach_policy",
11369        ))
11370    }
11371
11372    fn execute_show_policies(
11373        &self,
11374        query: &str,
11375        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
11376    ) -> RedDBResult<RuntimeQueryResult> {
11377        use crate::auth::UserId;
11378        use crate::storage::query::ast::PolicyPrincipalRef;
11379        use crate::storage::query::unified::UnifiedRecord;
11380        use crate::storage::schema::Value as SchemaValue;
11381        use std::sync::Arc;
11382
11383        let auth_store = self
11384            .inner
11385            .auth_store
11386            .read()
11387            .clone()
11388            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11389
11390        let pols = match filter {
11391            None => auth_store.list_policies(),
11392            Some(PolicyPrincipalRef::User(u)) => {
11393                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
11394                auth_store.effective_policies(&id)
11395            }
11396            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
11397        };
11398
11399        let mut records = Vec::with_capacity(pols.len() + 1);
11400
11401        // Header row (#712 / S5A): synthetic record at index 0 that
11402        // reports the active PolicyEnforcementMode and the hard-cutover
11403        // version, so an operator running SHOW POLICIES can see the
11404        // current posture without a separate command.
11405        let mode = auth_store.enforcement_mode();
11406        let mut header = UnifiedRecord::default();
11407        header.set_arc(
11408            Arc::from("id"),
11409            SchemaValue::text("<enforcement_mode>".to_string()),
11410        );
11411        header.set_arc(Arc::from("statements"), SchemaValue::Integer(0));
11412        header.set_arc(Arc::from("tenant"), SchemaValue::Null);
11413        let header_json = format!(
11414            r#"{{"enforcement_mode":"{}","policy_only_hard_version":"{}"}}"#,
11415            mode.as_str(),
11416            crate::auth::enforcement_mode::POLICY_ONLY_HARD_VERSION
11417        );
11418        header.set_arc(Arc::from("json"), SchemaValue::text(header_json));
11419        records.push(header);
11420
11421        for p in pols.iter() {
11422            let mut rec = UnifiedRecord::default();
11423            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
11424            rec.set_arc(
11425                Arc::from("statements"),
11426                SchemaValue::Integer(p.statements.len() as i64),
11427            );
11428            rec.set_arc(
11429                Arc::from("tenant"),
11430                p.tenant
11431                    .as_deref()
11432                    .map(|t| SchemaValue::text(t.to_string()))
11433                    .unwrap_or(SchemaValue::Null),
11434            );
11435            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
11436            records.push(rec);
11437        }
11438        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11439        result.records = records;
11440        Ok(RuntimeQueryResult {
11441            query: query.to_string(),
11442            mode: crate::storage::query::modes::QueryMode::Sql,
11443            statement: "show_policies",
11444            engine: "iam-policies",
11445            result,
11446            affected_rows: 0,
11447            statement_type: "select",
11448            bookmark: None,
11449        })
11450    }
11451
11452    fn execute_show_effective_permissions(
11453        &self,
11454        query: &str,
11455        user: &crate::storage::query::ast::PolicyUserRef,
11456        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
11457    ) -> RedDBResult<RuntimeQueryResult> {
11458        use crate::auth::UserId;
11459        use crate::storage::query::unified::UnifiedRecord;
11460        use crate::storage::schema::Value as SchemaValue;
11461        use std::sync::Arc;
11462
11463        let auth_store = self
11464            .inner
11465            .auth_store
11466            .read()
11467            .clone()
11468            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11469        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
11470        let pols = auth_store.effective_policies(&id);
11471
11472        // Show one row per (policy, statement) tuple, plus any
11473        // resource-level filter passed by the caller.
11474        let mut records = Vec::new();
11475        for p in pols.iter() {
11476            for (idx, st) in p.statements.iter().enumerate() {
11477                if let Some(_r) = resource {
11478                    // Naive filter: render statement targets to strings
11479                    // and skip if no match. Conservative default = include
11480                    // (the simulator handles fine-grained matching).
11481                }
11482                let mut rec = UnifiedRecord::default();
11483                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
11484                rec.set_arc(
11485                    Arc::from("statement_index"),
11486                    SchemaValue::Integer(idx as i64),
11487                );
11488                rec.set_arc(
11489                    Arc::from("sid"),
11490                    st.sid
11491                        .as_deref()
11492                        .map(|s| SchemaValue::text(s.to_string()))
11493                        .unwrap_or(SchemaValue::Null),
11494                );
11495                rec.set_arc(
11496                    Arc::from("effect"),
11497                    SchemaValue::text(match st.effect {
11498                        crate::auth::policies::Effect::Allow => "allow",
11499                        crate::auth::policies::Effect::Deny => "deny",
11500                    }),
11501                );
11502                rec.set_arc(
11503                    Arc::from("actions"),
11504                    SchemaValue::Integer(st.actions.len() as i64),
11505                );
11506                rec.set_arc(
11507                    Arc::from("resources"),
11508                    SchemaValue::Integer(st.resources.len() as i64),
11509                );
11510                records.push(rec);
11511            }
11512        }
11513        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11514        result.records = records;
11515        Ok(RuntimeQueryResult {
11516            query: query.to_string(),
11517            mode: crate::storage::query::modes::QueryMode::Sql,
11518            statement: "show_effective_permissions",
11519            engine: "iam-policies",
11520            result,
11521            affected_rows: 0,
11522            statement_type: "select",
11523            bookmark: None,
11524        })
11525    }
11526
11527    fn execute_lint_policy(
11528        &self,
11529        query: &str,
11530        source: &crate::storage::query::ast::LintPolicySource,
11531    ) -> RedDBResult<RuntimeQueryResult> {
11532        use crate::auth::policy_linter::lint;
11533        use crate::storage::query::ast::LintPolicySource;
11534        use crate::storage::query::unified::UnifiedRecord;
11535        use crate::storage::schema::Value as SchemaValue;
11536        use std::sync::Arc;
11537
11538        // Resolve the policy text. `JSON` source lints the literal
11539        // verbatim; `Id` source fetches the stored document so
11540        // operators can lint a policy by name without rebuilding the
11541        // JSON from `SHOW POLICY`.
11542        let policy_text = match source {
11543            LintPolicySource::Json(text) => text.clone(),
11544            LintPolicySource::Id(id) => {
11545                let auth_store =
11546                    self.inner.auth_store.read().clone().ok_or_else(|| {
11547                        RedDBError::Query("auth store not configured".to_string())
11548                    })?;
11549                let policy = auth_store
11550                    .get_policy(id)
11551                    .ok_or_else(|| RedDBError::Query(format!("policy `{id}` not found")))?;
11552                policy.to_json_string()
11553            }
11554        };
11555        let diagnostics = lint(&policy_text);
11556
11557        let principal_str = current_auth_identity()
11558            .map(|(u, _)| u)
11559            .unwrap_or_else(|| "anonymous".into());
11560        tracing::info!(
11561            target: "audit",
11562            principal = %principal_str,
11563            action = "iam:policy.lint",
11564            diagnostic_count = diagnostics.len(),
11565            "LINT POLICY issued"
11566        );
11567        self.inner.audit_log.record(
11568            "iam/policy.lint",
11569            &principal_str,
11570            match source {
11571                LintPolicySource::Id(id) => id.as_str(),
11572                LintPolicySource::Json(_) => "<json>",
11573            },
11574            "ok",
11575            crate::json::Value::Null,
11576        );
11577
11578        // One row per diagnostic. Column order matches the HTTP
11579        // surface's JSON keys so the two contracts line up.
11580        const COLUMNS: [&str; 5] = ["severity", "code", "message", "suggested_fix", "location"];
11581        let schema = Arc::new(
11582            COLUMNS
11583                .iter()
11584                .map(|name| Arc::<str>::from(*name))
11585                .collect::<Vec<_>>(),
11586        );
11587        let records: Vec<UnifiedRecord> = diagnostics
11588            .iter()
11589            .map(|d| {
11590                UnifiedRecord::with_schema(
11591                    Arc::clone(&schema),
11592                    vec![
11593                        SchemaValue::text(d.severity.as_str()),
11594                        SchemaValue::text(d.code.as_str()),
11595                        SchemaValue::text(d.message.clone()),
11596                        d.suggested_fix
11597                            .as_deref()
11598                            .map(SchemaValue::text)
11599                            .unwrap_or(SchemaValue::Null),
11600                        d.location
11601                            .as_deref()
11602                            .map(SchemaValue::text)
11603                            .unwrap_or(SchemaValue::Null),
11604                    ],
11605                )
11606            })
11607            .collect();
11608        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
11609            COLUMNS.iter().map(|c| c.to_string()).collect(),
11610        );
11611        result.records = records;
11612        Ok(RuntimeQueryResult {
11613            query: query.to_string(),
11614            mode: crate::storage::query::modes::QueryMode::Sql,
11615            statement: "lint_policy",
11616            engine: "iam-policies",
11617            result,
11618            affected_rows: 0,
11619            statement_type: "select",
11620            bookmark: None,
11621        })
11622    }
11623
11624    /// `MIGRATE POLICY MODE TO '<target>' [DRY RUN]` — flip the install
11625    /// from `legacy_rbac` to `policy_only` after the pre-flight delta
11626    /// simulator confirms no non-admin principal would lose access.
11627    /// Issue #714.
11628    fn execute_migrate_policy_mode(
11629        &self,
11630        query: &str,
11631        target: &str,
11632        dry_run: bool,
11633    ) -> RedDBResult<RuntimeQueryResult> {
11634        use crate::auth::enforcement_mode::PolicyEnforcementMode;
11635        use crate::auth::migrate_policy_mode::{
11636            principal_label, simulate_migration_delta, MigratePolicyDelta,
11637        };
11638        use crate::auth::policies::ResourceRef;
11639        use crate::storage::query::unified::UnifiedRecord;
11640        use crate::storage::schema::Value as SchemaValue;
11641        use std::sync::Arc;
11642
11643        // Only `policy_only` is a meaningful destination for this
11644        // command — flipping back to `legacy_rbac` is supported via
11645        // direct config writes (it doesn't need a pre-flight). We
11646        // reject everything else with the same allowlist `parse` uses.
11647        let parsed = PolicyEnforcementMode::parse(target).ok_or_else(|| {
11648            RedDBError::Query(format!(
11649                "MIGRATE POLICY MODE: invalid target `{target}` (expected `policy_only`)"
11650            ))
11651        })?;
11652        if parsed != PolicyEnforcementMode::PolicyOnly {
11653            return Err(RedDBError::Query(format!(
11654                "MIGRATE POLICY MODE: target `{target}` is not supported — only `policy_only` may be migrated to via this command"
11655            )));
11656        }
11657
11658        let auth_store = self
11659            .inner
11660            .auth_store
11661            .read()
11662            .clone()
11663            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11664
11665        // Resource enumeration: every existing collection probed as
11666        // `table:<name>`. This is the realistic resource surface for
11667        // the legacy_rbac fallback (the role floors gate per-table
11668        // actions). Wildcard / column-scoped resources are still
11669        // covered by the policy evaluator because evaluate() resolves
11670        // resource patterns relative to the concrete resources we
11671        // probe here.
11672        let snapshot = self.inner.db.catalog_model_snapshot();
11673        let resources: Vec<ResourceRef> = snapshot
11674            .collections
11675            .iter()
11676            .map(|c| ResourceRef::new("table", c.name.clone()))
11677            .collect();
11678
11679        let now_ms = crate::utils::now_unix_millis() as u128;
11680        let deltas: Vec<MigratePolicyDelta> =
11681            simulate_migration_delta(auth_store.as_ref(), &resources, now_ms);
11682
11683        let principal_str = current_auth_identity()
11684            .map(|(u, _)| u)
11685            .unwrap_or_else(|| "anonymous".into());
11686
11687        // Audit every issuance. The outcome line differentiates
11688        // dry-run, refused, and applied — operators can grep for these
11689        // strings in the audit log.
11690        let outcome_str = if dry_run {
11691            "dry_run"
11692        } else if deltas.is_empty() {
11693            "applied"
11694        } else {
11695            "refused"
11696        };
11697        tracing::info!(
11698            target: "audit",
11699            principal = %principal_str,
11700            action = "iam:policy.migrate_mode",
11701            target = %target,
11702            dry_run,
11703            delta_count = deltas.len(),
11704            outcome = outcome_str,
11705            "MIGRATE POLICY MODE issued"
11706        );
11707        self.inner.audit_log.record(
11708            "iam/policy.migrate_mode",
11709            &principal_str,
11710            target,
11711            outcome_str,
11712            crate::json::Value::Null,
11713        );
11714
11715        // Refuse the non-dry-run path when any principal would lose
11716        // access. The error string carries a compact summary plus the
11717        // delta count so operators can re-run with DRY RUN to inspect.
11718        if !dry_run && !deltas.is_empty() {
11719            let summary = deltas
11720                .iter()
11721                .take(5)
11722                .map(|d| {
11723                    format!(
11724                        "{}:{}/{}:{}",
11725                        principal_label(&d.principal),
11726                        d.action,
11727                        d.resource_kind,
11728                        d.resource_name
11729                    )
11730                })
11731                .collect::<Vec<_>>()
11732                .join(", ");
11733            let more = if deltas.len() > 5 {
11734                format!(" (and {} more)", deltas.len() - 5)
11735            } else {
11736                String::new()
11737            };
11738            return Err(RedDBError::Query(format!(
11739                "MIGRATE POLICY MODE refused: {n} principal/action/resource pair(s) would lose access under `policy_only`. Run `MIGRATE POLICY MODE TO '{target}' DRY RUN` to inspect. Sample: {summary}{more}",
11740                n = deltas.len(),
11741            )));
11742        }
11743
11744        // Mutate the live enforcement mode only on the non-dry-run
11745        // path with an empty delta. `set_enforcement_mode` also
11746        // persists to vault_kv so the new mode survives restart.
11747        if !dry_run {
11748            auth_store.set_enforcement_mode(parsed);
11749        }
11750
11751        const COLUMNS: [&str; 5] = [
11752            "principal",
11753            "role",
11754            "action",
11755            "resource_kind",
11756            "resource_name",
11757        ];
11758        let schema = Arc::new(
11759            COLUMNS
11760                .iter()
11761                .map(|name| Arc::<str>::from(*name))
11762                .collect::<Vec<_>>(),
11763        );
11764        let records: Vec<UnifiedRecord> = deltas
11765            .iter()
11766            .map(|d| {
11767                UnifiedRecord::with_schema(
11768                    Arc::clone(&schema),
11769                    vec![
11770                        SchemaValue::text(principal_label(&d.principal)),
11771                        SchemaValue::text(d.role.as_str()),
11772                        SchemaValue::text(d.action.clone()),
11773                        SchemaValue::text(d.resource_kind.clone()),
11774                        SchemaValue::text(d.resource_name.clone()),
11775                    ],
11776                )
11777            })
11778            .collect();
11779        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
11780            COLUMNS.iter().map(|c| c.to_string()).collect(),
11781        );
11782        result.records = records;
11783        Ok(RuntimeQueryResult {
11784            query: query.to_string(),
11785            mode: crate::storage::query::modes::QueryMode::Sql,
11786            statement: "migrate_policy_mode",
11787            engine: "iam-policies",
11788            result,
11789            affected_rows: 0,
11790            statement_type: "select",
11791            bookmark: None,
11792        })
11793    }
11794
11795    fn execute_simulate_policy(
11796        &self,
11797        query: &str,
11798        user: &crate::storage::query::ast::PolicyUserRef,
11799        action: &str,
11800        resource: &crate::storage::query::ast::PolicyResourceRef,
11801    ) -> RedDBResult<RuntimeQueryResult> {
11802        use crate::auth::policies::ResourceRef;
11803        use crate::auth::store::SimCtx;
11804        use crate::auth::UserId;
11805        use crate::storage::query::unified::UnifiedRecord;
11806        use crate::storage::schema::Value as SchemaValue;
11807        use std::sync::Arc;
11808
11809        let auth_store = self
11810            .inner
11811            .auth_store
11812            .read()
11813            .clone()
11814            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11815        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
11816        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
11817        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
11818
11819        let principal_str = current_auth_identity()
11820            .map(|(u, _)| u)
11821            .unwrap_or_else(|| "anonymous".into());
11822        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
11823        tracing::info!(
11824            target: "audit",
11825            principal = %principal_str,
11826            action = "iam:policy.simulate",
11827            decision = %decision_str,
11828            matched_policy_id = ?matched_pid,
11829            matched_sid = ?matched_sid,
11830            "SIMULATE issued"
11831        );
11832        self.inner.audit_log.record(
11833            "iam/policy.simulate",
11834            &principal_str,
11835            &id.to_string(),
11836            "ok",
11837            crate::json::Value::Null,
11838        );
11839
11840        let mut rec = UnifiedRecord::default();
11841        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
11842        rec.set_arc(
11843            Arc::from("matched_policy_id"),
11844            matched_pid
11845                .map(SchemaValue::text)
11846                .unwrap_or(SchemaValue::Null),
11847        );
11848        rec.set_arc(
11849            Arc::from("matched_sid"),
11850            matched_sid
11851                .map(SchemaValue::text)
11852                .unwrap_or(SchemaValue::Null),
11853        );
11854        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
11855        rec.set_arc(
11856            Arc::from("trail_len"),
11857            SchemaValue::Integer(outcome.trail.len() as i64),
11858        );
11859        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11860        result.records = vec![rec];
11861        Ok(RuntimeQueryResult {
11862            query: query.to_string(),
11863            mode: crate::storage::query::modes::QueryMode::Sql,
11864            statement: "simulate_policy",
11865            engine: "iam-policies",
11866            result,
11867            affected_rows: 0,
11868            statement_type: "select",
11869            bookmark: None,
11870        })
11871    }
11872}
11873
11874/// Translate a parsed GRANT into a synthetic IAM policy whose id
11875/// starts with `_grant_<unique>`. PUBLIC is represented as an
11876/// implicit IAM group; legacy GROUP grants are still rejected by the
11877/// grant store and are not translated here.
11878fn grant_to_iam_policy(
11879    principal: &crate::auth::privileges::GrantPrincipal,
11880    resource: &crate::auth::privileges::Resource,
11881    actions: &[crate::auth::privileges::Action],
11882    tenant: Option<&str>,
11883) -> Option<crate::auth::policies::Policy> {
11884    use crate::auth::policies::{
11885        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
11886    };
11887    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
11888
11889    if matches!(principal, GrantPrincipal::Group(_)) {
11890        return None;
11891    }
11892
11893    let now = crate::auth::now_ms();
11894    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
11895
11896    let resource_str = match resource {
11897        Resource::Database => "table:*".to_string(),
11898        Resource::Schema(s) => format!("table:{s}.*"),
11899        Resource::Table { schema, table } => match schema {
11900            Some(s) => format!("table:{s}.{table}"),
11901            None => format!("table:{table}"),
11902        },
11903        Resource::Function { schema, name } => match schema {
11904            Some(s) => format!("function:{s}.{name}"),
11905            None => format!("function:{name}"),
11906        },
11907    };
11908
11909    // Compile actions — fall back to `*` only when the grant included
11910    // `Action::All`. Map every other action keyword to its lowercase
11911    // form so it lines up with the kernel's allowlist.
11912    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
11913        vec![ActionPattern::Wildcard]
11914    } else {
11915        actions
11916            .iter()
11917            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
11918            .collect()
11919    };
11920    if action_patterns.is_empty() {
11921        return None;
11922    }
11923
11924    // Inline resource compilation matching the kernel's `compile_resource`:
11925    //   * `*` → wildcard
11926    //   * contains `*` → glob
11927    //   * `kind:name` → exact
11928    let resource_patterns = if resource_str == "*" {
11929        vec![ResourcePattern::Wildcard]
11930    } else if resource_str.contains('*') {
11931        vec![ResourcePattern::Glob(resource_str.clone())]
11932    } else if let Some((kind, name)) = resource_str.split_once(':') {
11933        vec![ResourcePattern::Exact {
11934            kind: kind.to_string(),
11935            name: name.to_string(),
11936        }]
11937    } else {
11938        vec![ResourcePattern::Wildcard]
11939    };
11940
11941    let policy = Policy {
11942        id,
11943        version: 1,
11944        tenant: tenant.map(|t| t.to_string()),
11945        created_at: now,
11946        updated_at: now,
11947        statements: vec![Statement {
11948            sid: None,
11949            effect: Effect::Allow,
11950            actions: action_patterns,
11951            resources: resource_patterns,
11952            condition: None,
11953        }],
11954    };
11955    if policy.validate().is_err() {
11956        return None;
11957    }
11958    Some(policy)
11959}
11960
11961/// Coerce a `key => <number>` table-function named argument into a positive
11962/// iteration count for the centrality TVFs (issue #797). The parser lexes all
11963/// named values as `f64`, so an integral, finite, strictly-positive value is
11964/// required here; anything else (fractional, zero, negative, NaN/inf) is a
11965/// clear query error. `func` names the function for the message.
11966fn parse_positive_iterations(func: &str, value: &f64) -> RedDBResult<usize> {
11967    if !value.is_finite() || *value < 1.0 || value.fract() != 0.0 {
11968        return Err(RedDBError::Query(format!(
11969            "table function '{func}' max_iterations must be a positive integer, got {value}"
11970        )));
11971    }
11972    Ok(*value as usize)
11973}
11974
11975fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
11976    use crate::auth::privileges::Action;
11977    match action {
11978        Action::Select => "select",
11979        Action::Insert => "insert",
11980        Action::Update => "update",
11981        Action::Delete => "delete",
11982        Action::Truncate => "truncate",
11983        Action::References => "references",
11984        Action::Execute => "execute",
11985        Action::Usage => "usage",
11986        Action::All => "*",
11987    }
11988}
11989
11990fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
11991    let mut columns = Vec::new();
11992    for (column, _) in &query.assignment_exprs {
11993        if !columns.iter().any(|seen| seen == column) {
11994            columns.push(column.clone());
11995        }
11996    }
11997    columns
11998}
11999
12000fn column_access_request_for_table_update(
12001    table_name: &str,
12002    columns: Vec<String>,
12003) -> crate::auth::ColumnAccessRequest {
12004    match table_name.split_once('.') {
12005        Some((schema, table)) => {
12006            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
12007                .with_schema(schema.to_string())
12008        }
12009        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
12010    }
12011}
12012
12013fn column_access_request_for_table_select(
12014    table_name: &str,
12015    columns: Vec<String>,
12016) -> crate::auth::ColumnAccessRequest {
12017    match table_name.split_once('.') {
12018        Some((schema, table)) => {
12019            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
12020                .with_schema(schema.to_string())
12021        }
12022        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
12023    }
12024}
12025
12026fn update_returning_columns_for_policy(
12027    runtime: &RedDBRuntime,
12028    query: &crate::storage::query::ast::UpdateQuery,
12029) -> Option<Vec<String>> {
12030    let items = query.returning.as_ref()?;
12031    let mut columns = Vec::new();
12032    let project_all = items
12033        .iter()
12034        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
12035    if project_all {
12036        collect_returning_star_columns(runtime, query, &mut columns);
12037    } else {
12038        for item in items {
12039            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
12040                continue;
12041            };
12042            push_returning_policy_column(&mut columns, column);
12043        }
12044    }
12045    (!columns.is_empty()).then_some(columns)
12046}
12047
12048fn collect_returning_star_columns(
12049    runtime: &RedDBRuntime,
12050    query: &crate::storage::query::ast::UpdateQuery,
12051    columns: &mut Vec<String>,
12052) {
12053    let store = runtime.db().store();
12054    let Some(manager) = store.get_collection(&query.table) else {
12055        return;
12056    };
12057    if let Some(schema) = manager.column_schema() {
12058        for column in schema.iter() {
12059            push_returning_policy_column(columns, column);
12060        }
12061    }
12062    for entity in manager.query_all(|_| true) {
12063        if !returning_entity_matches_update_target(&entity, query.target) {
12064            continue;
12065        }
12066        match &entity.data {
12067            crate::storage::EntityData::Row(row) => {
12068                for (column, _) in row.iter_fields() {
12069                    push_returning_policy_column(columns, column);
12070                }
12071            }
12072            crate::storage::EntityData::Node(node) => {
12073                push_returning_policy_column(columns, "label");
12074                push_returning_policy_column(columns, "node_type");
12075                for column in node.properties.keys() {
12076                    push_returning_policy_column(columns, column);
12077                }
12078            }
12079            crate::storage::EntityData::Edge(edge) => {
12080                push_returning_policy_column(columns, "label");
12081                push_returning_policy_column(columns, "from_rid");
12082                push_returning_policy_column(columns, "to_rid");
12083                push_returning_policy_column(columns, "weight");
12084                for column in edge.properties.keys() {
12085                    push_returning_policy_column(columns, column);
12086                }
12087            }
12088            _ => {}
12089        }
12090    }
12091}
12092
12093fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
12094    if returning_public_envelope_column(column) {
12095        return;
12096    }
12097    if !columns.iter().any(|seen| seen == column) {
12098        columns.push(column.to_string());
12099    }
12100}
12101
/// Report whether `column` names one of the envelope columns (`rid`,
/// `collection`, `kind`, `tenant`, `created_at`, `updated_at`),
/// ignoring ASCII case.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 6] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|reserved| column.eq_ignore_ascii_case(reserved))
}
12108
12109fn returning_entity_matches_update_target(
12110    entity: &crate::storage::UnifiedEntity,
12111    target: crate::storage::query::ast::UpdateTarget,
12112) -> bool {
12113    use crate::storage::query::ast::UpdateTarget;
12114    match target {
12115        UpdateTarget::Rows => {
12116            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
12117        }
12118        UpdateTarget::Documents => {
12119            matches!(
12120                returning_row_item_kind(entity),
12121                Some(ReturningRowKind::Document)
12122            )
12123        }
12124        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
12125        UpdateTarget::Nodes => matches!(
12126            (&entity.kind, &entity.data),
12127            (
12128                crate::storage::EntityKind::GraphNode(_),
12129                crate::storage::EntityData::Node(_)
12130            )
12131        ),
12132        UpdateTarget::Edges => matches!(
12133            (&entity.kind, &entity.data),
12134            (
12135                crate::storage::EntityKind::GraphEdge(_),
12136                crate::storage::EntityData::Edge(_)
12137            )
12138        ),
12139    }
12140}
12141
/// Classification of a row-shaped entity as inferred by
/// `returning_row_item_kind` from the entity's fields.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ReturningRowKind {
    /// Row with at least one field not named `key`/`value` and no
    /// JSON-valued field.
    Row,
    /// Row with at least one JSON-valued field.
    Document,
    /// Row whose fields are all named `key` or `value` (ASCII
    /// case-insensitive).
    Kv,
}
12148
12149fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
12150    let row = entity.data.as_row()?;
12151    let is_kv = row.iter_fields().all(|(column, _)| {
12152        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
12153    });
12154    if is_kv {
12155        return Some(ReturningRowKind::Kv);
12156    }
12157    let is_document = row
12158        .iter_fields()
12159        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
12160    if is_document {
12161        Some(ReturningRowKind::Document)
12162    } else {
12163        Some(ReturningRowKind::Row)
12164    }
12165}
12166
12167fn requested_table_columns_for_policy(
12168    table: &crate::storage::query::ast::TableQuery,
12169) -> Vec<String> {
12170    use crate::storage::query::sql_lowering::{
12171        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
12172        effective_table_projections,
12173    };
12174
12175    let table_name = table.table.as_str();
12176    let table_alias = table.alias.as_deref();
12177    let mut columns = std::collections::BTreeSet::new();
12178
12179    for projection in effective_table_projections(table) {
12180        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
12181    }
12182    if let Some(filter) = effective_table_filter(table) {
12183        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
12184    }
12185    for expr in effective_table_group_by_exprs(table) {
12186        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
12187    }
12188    if let Some(filter) = effective_table_having_filter(table) {
12189        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
12190    }
12191    for order in &table.order_by {
12192        if let Some(expr) = order.expr.as_ref() {
12193            collect_expr_columns(expr, table_name, table_alias, &mut columns);
12194        } else {
12195            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
12196        }
12197    }
12198
12199    columns.into_iter().collect()
12200}
12201
12202fn collect_projection_columns(
12203    projection: &crate::storage::query::ast::Projection,
12204    table_name: &str,
12205    table_alias: Option<&str>,
12206    columns: &mut std::collections::BTreeSet<String>,
12207) {
12208    use crate::storage::query::ast::Projection;
12209    match projection {
12210        Projection::All => {
12211            columns.insert("*".to_string());
12212        }
12213        Projection::Column(column) | Projection::Alias(column, _) => {
12214            if column != "*" {
12215                columns.insert(column.clone());
12216            }
12217        }
12218        Projection::Function(_, args) => {
12219            for arg in args {
12220                collect_projection_columns(arg, table_name, table_alias, columns);
12221            }
12222        }
12223        Projection::Expression(filter, _) => {
12224            collect_filter_columns(filter, table_name, table_alias, columns);
12225        }
12226        Projection::Field(field, _) => {
12227            collect_field_ref_column(field, table_name, table_alias, columns);
12228        }
12229        // Slice 7a (#589): no runtime support yet; recurse into args so
12230        // any column references are still tracked in case a future
12231        // executor needs the column set.
12232        Projection::Window { args, .. } => {
12233            for arg in args {
12234                collect_projection_columns(arg, table_name, table_alias, columns);
12235            }
12236        }
12237    }
12238}
12239
12240fn collect_filter_columns(
12241    filter: &crate::storage::query::ast::Filter,
12242    table_name: &str,
12243    table_alias: Option<&str>,
12244    columns: &mut std::collections::BTreeSet<String>,
12245) {
12246    use crate::storage::query::ast::Filter;
12247    match filter {
12248        Filter::Compare { field, .. }
12249        | Filter::IsNull(field)
12250        | Filter::IsNotNull(field)
12251        | Filter::In { field, .. }
12252        | Filter::Between { field, .. }
12253        | Filter::Like { field, .. }
12254        | Filter::StartsWith { field, .. }
12255        | Filter::EndsWith { field, .. }
12256        | Filter::Contains { field, .. } => {
12257            collect_field_ref_column(field, table_name, table_alias, columns);
12258        }
12259        Filter::CompareFields { left, right, .. } => {
12260            collect_field_ref_column(left, table_name, table_alias, columns);
12261            collect_field_ref_column(right, table_name, table_alias, columns);
12262        }
12263        Filter::CompareExpr { lhs, rhs, .. } => {
12264            collect_expr_columns(lhs, table_name, table_alias, columns);
12265            collect_expr_columns(rhs, table_name, table_alias, columns);
12266        }
12267        Filter::And(left, right) | Filter::Or(left, right) => {
12268            collect_filter_columns(left, table_name, table_alias, columns);
12269            collect_filter_columns(right, table_name, table_alias, columns);
12270        }
12271        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
12272    }
12273}
12274
12275fn collect_expr_columns(
12276    expr: &crate::storage::query::ast::Expr,
12277    table_name: &str,
12278    table_alias: Option<&str>,
12279    columns: &mut std::collections::BTreeSet<String>,
12280) {
12281    use crate::storage::query::ast::Expr;
12282    match expr {
12283        Expr::Column { field, .. } => {
12284            collect_field_ref_column(field, table_name, table_alias, columns);
12285        }
12286        Expr::Literal { .. } | Expr::Parameter { .. } => {}
12287        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
12288            collect_expr_columns(operand, table_name, table_alias, columns);
12289        }
12290        Expr::BinaryOp { lhs, rhs, .. } => {
12291            collect_expr_columns(lhs, table_name, table_alias, columns);
12292            collect_expr_columns(rhs, table_name, table_alias, columns);
12293        }
12294        Expr::FunctionCall { args, .. } => {
12295            for arg in args {
12296                collect_expr_columns(arg, table_name, table_alias, columns);
12297            }
12298        }
12299        Expr::Case {
12300            branches, else_, ..
12301        } => {
12302            for (condition, value) in branches {
12303                collect_expr_columns(condition, table_name, table_alias, columns);
12304                collect_expr_columns(value, table_name, table_alias, columns);
12305            }
12306            if let Some(value) = else_ {
12307                collect_expr_columns(value, table_name, table_alias, columns);
12308            }
12309        }
12310        Expr::IsNull { operand, .. } => {
12311            collect_expr_columns(operand, table_name, table_alias, columns);
12312        }
12313        Expr::InList { target, values, .. } => {
12314            collect_expr_columns(target, table_name, table_alias, columns);
12315            for value in values {
12316                collect_expr_columns(value, table_name, table_alias, columns);
12317            }
12318        }
12319        Expr::Between {
12320            target, low, high, ..
12321        } => {
12322            collect_expr_columns(target, table_name, table_alias, columns);
12323            collect_expr_columns(low, table_name, table_alias, columns);
12324            collect_expr_columns(high, table_name, table_alias, columns);
12325        }
12326        Expr::Subquery { .. } => {}
12327        Expr::WindowFunctionCall { args, window, .. } => {
12328            for arg in args {
12329                collect_expr_columns(arg, table_name, table_alias, columns);
12330            }
12331            for e in &window.partition_by {
12332                collect_expr_columns(e, table_name, table_alias, columns);
12333            }
12334            for o in &window.order_by {
12335                collect_expr_columns(&o.expr, table_name, table_alias, columns);
12336            }
12337        }
12338    }
12339}
12340
12341fn collect_field_ref_column(
12342    field: &crate::storage::query::ast::FieldRef,
12343    table_name: &str,
12344    table_alias: Option<&str>,
12345    columns: &mut std::collections::BTreeSet<String>,
12346) {
12347    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
12348        if column != "*" {
12349            columns.insert(column);
12350        }
12351    }
12352}
12353
12354fn policy_column_name_from_field_ref(
12355    field: &crate::storage::query::ast::FieldRef,
12356    table_name: &str,
12357    table_alias: Option<&str>,
12358) -> Option<String> {
12359    match field {
12360        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
12361            if column == "*" {
12362                return Some("*".to_string());
12363            }
12364            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
12365                Some(column.clone())
12366            } else {
12367                Some(format!("{table}.{column}"))
12368            }
12369        }
12370        _ => None,
12371    }
12372}
12373
12374fn legacy_resource_to_iam(
12375    resource: &crate::auth::privileges::Resource,
12376    tenant: Option<&str>,
12377) -> crate::auth::policies::ResourceRef {
12378    use crate::auth::privileges::Resource;
12379
12380    let (kind, name) = match resource {
12381        Resource::Database => ("database".to_string(), "*".to_string()),
12382        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
12383        Resource::Table { schema, table } => (
12384            "table".to_string(),
12385            match schema {
12386                Some(s) => format!("{s}.{table}"),
12387                None => table.clone(),
12388            },
12389        ),
12390        Resource::Function { schema, name } => (
12391            "function".to_string(),
12392            match schema {
12393                Some(s) => format!("{s}.{name}"),
12394                None => name.clone(),
12395            },
12396        ),
12397    };
12398
12399    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
12400    if let Some(t) = tenant {
12401        out = out.with_tenant(t.to_string());
12402    }
12403    out
12404}
12405
/// Name and alias of a plain-table operand, as produced by
/// `table_side_context` for one side of a join.
#[derive(Debug)]
struct JoinTableSide {
    /// Table name as written in the query.
    table: String,
    /// Alias used to qualify columns; `table_side_context` falls back
    /// to the table name when the query gives no alias.
    alias: String,
}
12411
12412fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
12413    match expr {
12414        QueryExpr::Table(table) => Some(JoinTableSide {
12415            table: table.table.clone(),
12416            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
12417        }),
12418        _ => None,
12419    }
12420}
12421
12422fn collect_projection_columns_for_table(
12423    projection: &Projection,
12424    table: &str,
12425    alias: Option<&str>,
12426    out: &mut BTreeSet<String>,
12427) {
12428    match projection {
12429        Projection::Column(column) | Projection::Alias(column, _) => {
12430            match split_qualified_column(column) {
12431                Some((qualifier, column))
12432                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
12433                {
12434                    push_policy_column(column, out);
12435                }
12436                Some(_) => {}
12437                None => push_policy_column(column, out),
12438            }
12439        }
12440        Projection::Field(
12441            FieldRef::TableColumn {
12442                table: qualifier,
12443                column,
12444            },
12445            _,
12446        ) => {
12447            if qualifier.is_empty()
12448                || qualifier == table
12449                || alias.is_some_and(|alias| qualifier == alias)
12450            {
12451                push_policy_column(column, out);
12452            }
12453        }
12454        Projection::Field(
12455            FieldRef::NodeProperty {
12456                alias: qualifier,
12457                property,
12458            },
12459            _,
12460        )
12461        | Projection::Field(
12462            FieldRef::EdgeProperty {
12463                alias: qualifier,
12464                property,
12465            },
12466            _,
12467        ) => {
12468            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
12469                push_policy_column(property, out);
12470            }
12471        }
12472        Projection::Function(_, args) => {
12473            for arg in args {
12474                collect_projection_columns_for_table(arg, table, alias, out);
12475            }
12476        }
12477        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
12478        Projection::Window { args, .. } => {
12479            for arg in args {
12480                collect_projection_columns_for_table(arg, table, alias, out);
12481            }
12482        }
12483    }
12484}
12485
12486fn collect_projection_columns_for_join_side(
12487    projection: &Projection,
12488    left: Option<&JoinTableSide>,
12489    right: Option<&JoinTableSide>,
12490    out: &mut HashMap<String, BTreeSet<String>>,
12491) -> RedDBResult<()> {
12492    match projection {
12493        Projection::Column(column) | Projection::Alias(column, _) => {
12494            if let Some((qualifier, column)) = split_qualified_column(column) {
12495                push_qualified_join_column(qualifier, column, left, right, out);
12496            } else {
12497                push_unqualified_join_column(column, left, right, out);
12498            }
12499        }
12500        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
12501            if table.is_empty() {
12502                push_unqualified_join_column(column, left, right, out);
12503            } else if let Some(side) = [left, right]
12504                .into_iter()
12505                .flatten()
12506                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
12507            {
12508                push_join_column(&side.table, column, out);
12509            }
12510        }
12511        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
12512        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
12513            push_qualified_join_column(alias, property, left, right, out);
12514        }
12515        Projection::Function(_, args) => {
12516            for arg in args {
12517                collect_projection_columns_for_join_side(arg, left, right, out)?;
12518            }
12519        }
12520        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
12521        Projection::Window { args, .. } => {
12522            for arg in args {
12523                collect_projection_columns_for_join_side(arg, left, right, out)?;
12524            }
12525        }
12526    }
12527    Ok(())
12528}
12529
/// Split `qualifier.column` at its first dot.
///
/// Yields `None` when there is no dot, when either half is empty, or when
/// the part after the first dot contains another dot.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    match column.split_once('.') {
        Some((qualifier, name))
            if !qualifier.is_empty() && !name.is_empty() && !name.contains('.') =>
        {
            Some((qualifier, name))
        }
        _ => None,
    }
}
12537
12538fn push_qualified_join_column(
12539    qualifier: &str,
12540    column: &str,
12541    left: Option<&JoinTableSide>,
12542    right: Option<&JoinTableSide>,
12543    out: &mut HashMap<String, BTreeSet<String>>,
12544) {
12545    if let Some(side) = [left, right]
12546        .into_iter()
12547        .flatten()
12548        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
12549    {
12550        push_join_column(&side.table, column, out);
12551    }
12552}
12553
12554fn push_unqualified_join_column(
12555    column: &str,
12556    left: Option<&JoinTableSide>,
12557    right: Option<&JoinTableSide>,
12558    out: &mut HashMap<String, BTreeSet<String>>,
12559) {
12560    for side in [left, right].into_iter().flatten() {
12561        push_join_column(&side.table, column, out);
12562    }
12563}
12564
12565fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
12566    if is_policy_column_name(column) {
12567        out.entry(table.to_string())
12568            .or_default()
12569            .insert(column.to_string());
12570    }
12571}
12572
12573fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
12574    if is_policy_column_name(column) {
12575        out.insert(column.to_string());
12576    }
12577}
12578
/// Whether `column` is a name that column tracking should record.
///
/// Rejects the empty string, the bare `*` wildcard, and any text starting
/// with the `LIT:` or `TYPE:` prefixes.
fn is_policy_column_name(column: &str) -> bool {
    if column.is_empty() || column == "*" {
        return false;
    }
    !["LIT:", "TYPE:"]
        .iter()
        .any(|prefix| column.starts_with(*prefix))
}
12585
12586fn runtime_iam_context(
12587    role: crate::auth::Role,
12588    tenant: Option<&str>,
12589) -> crate::auth::policies::EvalContext {
12590    crate::auth::policies::EvalContext {
12591        principal_tenant: tenant.map(|t| t.to_string()),
12592        current_tenant: tenant.map(|t| t.to_string()),
12593        peer_ip: None,
12594        mfa_present: false,
12595        now_ms: crate::auth::now_ms(),
12596        principal_is_admin_role: role == crate::auth::Role::Admin,
12597        principal_is_platform_scoped: tenant.is_none(),
12598    }
12599}
12600
12601fn explicit_table_projection_columns(
12602    query: &crate::storage::query::ast::TableQuery,
12603) -> Vec<String> {
12604    use crate::storage::query::ast::{FieldRef, Projection};
12605
12606    let mut columns = Vec::new();
12607    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
12608        match projection {
12609            Projection::Column(column) | Projection::Alias(column, _) => {
12610                push_unique(&mut columns, column)
12611            }
12612            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
12613                push_unique(&mut columns, column)
12614            }
12615            // SELECT * and expression/function projections need the
12616            // executor-wide column-policy context mapped in
12617            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
12618            _ => {}
12619        }
12620    }
12621    columns
12622}
12623
12624fn explicit_graph_projection_properties(
12625    query: &crate::storage::query::ast::GraphQuery,
12626) -> Vec<String> {
12627    use crate::storage::query::ast::{FieldRef, Projection};
12628
12629    let mut columns = Vec::new();
12630    for projection in &query.return_ {
12631        match projection {
12632            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
12633            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
12634                push_unique(&mut columns, property.clone())
12635            }
12636            _ => {}
12637        }
12638    }
12639    columns
12640}
12641
/// Append `column` to `columns` unless an equal entry is already present,
/// preserving insertion order.
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
12647
12648fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
12649    use crate::storage::query::ast::PolicyPrincipalRef;
12650    match p {
12651        PolicyPrincipalRef::User(u) => match &u.tenant {
12652            Some(t) => format!("user:{t}/{}", u.username),
12653            None => format!("user:{}", u.username),
12654        },
12655        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
12656    }
12657}
12658
12659/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
12660/// shape used by every audit emit + the simulator response.
12661pub(crate) fn decision_to_strings(
12662    d: &crate::auth::policies::Decision,
12663) -> (String, Option<String>, Option<String>) {
12664    use crate::auth::policies::Decision;
12665    match d {
12666        Decision::Allow {
12667            matched_policy_id,
12668            matched_sid,
12669        } => (
12670            "allow".into(),
12671            Some(matched_policy_id.clone()),
12672            matched_sid.clone(),
12673        ),
12674        Decision::Deny {
12675            matched_policy_id,
12676            matched_sid,
12677        } => (
12678            "deny".into(),
12679            Some(matched_policy_id.clone()),
12680            matched_sid.clone(),
12681        ),
12682        Decision::DefaultDeny => ("default_deny".into(), None, None),
12683        Decision::AdminBypass => ("admin_bypass".into(), None, None),
12684    }
12685}
12686
12687fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
12688    let mut scopes = Vec::new();
12689    collect_relation_scopes(query, &mut scopes);
12690    scopes.sort();
12691    scopes.dedup();
12692    scopes
12693}
12694
12695fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
12696    match query {
12697        QueryExpr::Table(table) => {
12698            if !table.table.is_empty() {
12699                scopes.push(table.table.clone());
12700            }
12701            if let Some(alias) = &table.alias {
12702                scopes.push(alias.clone());
12703            }
12704        }
12705        QueryExpr::Join(join) => {
12706            collect_relation_scopes(&join.left, scopes);
12707            collect_relation_scopes(&join.right, scopes);
12708        }
12709        _ => {}
12710    }
12711}
12712
12713fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
12714    let inner_scopes = relation_scopes_for_query(query);
12715    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
12716}
12717
/// Walk a table or join query and report whether any expression or filter
/// inside it references an outer scope, as decided by
/// `expr_references_outer_scope` / `filter_references_outer_scope`.
///
/// For a table query the select items, `where_expr`, `filter`,
/// `having_expr`, `having`, `group_by_exprs` and `order_by` expressions are
/// inspected; for a join, both sides, the join filter and the return items.
/// Wildcard select items never count. Every other query kind yields `false`.
/// Evaluation short-circuits on the first hit.
fn query_expr_references_outer_scope(
    query: &QueryExpr,
    outer_scopes: &[String],
    inner_scopes: &[String],
) -> bool {
    match query {
        QueryExpr::Table(table) => {
            table.select_items.iter().any(|item| match item {
                crate::storage::query::ast::SelectItem::Wildcard => false,
                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                }
            }) || table
                .where_expr
                .as_ref()
                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
                || table.filter.as_ref().is_some_and(|filter| {
                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
                })
                || table.having_expr.as_ref().is_some_and(|expr| {
                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                })
                || table.having.as_ref().is_some_and(|filter| {
                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
                })
                || table
                    .group_by_exprs
                    .iter()
                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
                // ORDER BY clauses without an expression are skipped.
                || table.order_by.iter().any(|clause| {
                    clause.expr.as_ref().is_some_and(|expr| {
                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                    })
                })
        }
        QueryExpr::Join(join) => {
            // Both sides are checked against the same scope lists as the join itself.
            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
                || join.filter.as_ref().is_some_and(|filter| {
                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
                })
                || join.return_items.iter().any(|item| match item {
                    crate::storage::query::ast::SelectItem::Wildcard => false,
                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
                    }
                })
        }
        _ => false,
    }
}
12769
/// Report whether any field or expression inside `filter` references an
/// outer scope.
///
/// Single-field predicates test their `field`; field-to-field and
/// expression comparisons test both operands; `And`/`Or`/`Not` recurse into
/// their children. Only the field/expression operands are inspected — the
/// other payload of each variant (elided with `..`) is ignored here.
fn filter_references_outer_scope(
    filter: &crate::storage::query::ast::Filter,
    outer_scopes: &[String],
    inner_scopes: &[String],
) -> bool {
    use crate::storage::query::ast::Filter;
    match filter {
        Filter::Compare { field, .. }
        | Filter::IsNull(field)
        | Filter::IsNotNull(field)
        | Filter::In { field, .. }
        | Filter::Between { field, .. }
        | Filter::Like { field, .. }
        | Filter::StartsWith { field, .. }
        | Filter::EndsWith { field, .. }
        | Filter::Contains { field, .. } => {
            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
        }
        Filter::CompareFields { left, right, .. } => {
            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
        }
        Filter::CompareExpr { lhs, rhs, .. } => {
            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
        }
        Filter::And(left, right) | Filter::Or(left, right) => {
            filter_references_outer_scope(left, outer_scopes, inner_scopes)
                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
        }
        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
    }
}
12803
/// Report whether `expr` contains a column reference that resolves to an
/// outer scope (see `field_ref_references_outer_scope`).
///
/// The expression tree is walked recursively through operators, casts,
/// function and window-function arguments (including the window's
/// `partition_by` / `order_by`), `CASE` branches, `IN` lists and `BETWEEN`
/// bounds. Literals and parameters never reference a scope.
fn expr_references_outer_scope(
    expr: &crate::storage::query::ast::Expr,
    outer_scopes: &[String],
    inner_scopes: &[String],
) -> bool {
    use crate::storage::query::ast::Expr;
    match expr {
        Expr::Column { field, .. } => {
            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
        }
        Expr::BinaryOp { lhs, rhs, .. } => {
            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
        }
        Expr::UnaryOp { operand, .. }
        | Expr::Cast { inner: operand, .. }
        | Expr::IsNull { operand, .. } => {
            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
        }
        Expr::FunctionCall { args, .. } => args
            .iter()
            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
        Expr::Case {
            branches, else_, ..
        } => {
            branches.iter().any(|(cond, value)| {
                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
            }) || else_
                .as_ref()
                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
        }
        Expr::InList { target, values, .. } => {
            expr_references_outer_scope(target, outer_scopes, inner_scopes)
                || values
                    .iter()
                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
        }
        Expr::Between {
            target, low, high, ..
        } => {
            expr_references_outer_scope(target, outer_scopes, inner_scopes)
                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
        }
        // A nested subquery is checked with this level's `inner_scopes` as
        // its outer scopes; its own scopes are recomputed inside
        // `query_references_outer_scope`.
        // NOTE(review): `outer_scopes` is not forwarded, so a nested subquery
        // that reaches past this level is not reported here — confirm this
        // is intended.
        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
        Expr::Literal { .. } | Expr::Parameter { .. } => false,
        Expr::WindowFunctionCall { args, window, .. } => {
            args.iter()
                .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes))
                || window
                    .partition_by
                    .iter()
                    .any(|e| expr_references_outer_scope(e, outer_scopes, inner_scopes))
                || window
                    .order_by
                    .iter()
                    .any(|o| expr_references_outer_scope(&o.expr, outer_scopes, inner_scopes))
        }
    }
}
12865
12866fn field_ref_references_outer_scope(
12867    field: &crate::storage::query::ast::FieldRef,
12868    outer_scopes: &[String],
12869    inner_scopes: &[String],
12870) -> bool {
12871    match field {
12872        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
12873            outer_scopes.iter().any(|scope| scope == table)
12874                && !inner_scopes.iter().any(|scope| scope == table)
12875        }
12876        _ => false,
12877    }
12878}
12879
12880fn first_column_values(
12881    result: crate::storage::query::unified::UnifiedResult,
12882) -> RedDBResult<Vec<Value>> {
12883    if result.columns.len() > 1 {
12884        return Err(RedDBError::Query(
12885            "expression subquery must return exactly one column".to_string(),
12886        ));
12887    }
12888    let fallback_column = result
12889        .records
12890        .first()
12891        .and_then(|record| record.column_names().into_iter().next())
12892        .map(|name| name.to_string());
12893    let column = result.columns.first().cloned().or(fallback_column);
12894    let Some(column) = column else {
12895        return Ok(Vec::new());
12896    };
12897    Ok(result
12898        .records
12899        .iter()
12900        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
12901        .collect())
12902}
12903
/// Parse a timestamp literal into milliseconds since the Unix epoch.
///
/// Accepted forms:
/// * a bare non-negative integer, taken verbatim as milliseconds;
/// * a `YYYY-MM-DD` calendar date, optionally followed by whitespace and a
///   time-of-day that is ignored — the result is midnight UTC of that day.
///   Full RFC 3339 parsing is not implemented.
///
/// Returns `None` when the text matches neither form, when the month or day
/// is not a real calendar date (e.g. `2030-13-01`, `2030-02-30`,
/// `2030-03-00`), or when the date lies before 1970-01-01 and therefore
/// cannot be expressed as an unsigned millisecond count.
fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
    const MS_PER_DAY: u128 = 86_400_000;
    // Largest year for which the i64 day count in `days_from_civil`
    // (roughly 365.2425 days per year) cannot overflow.
    const MAX_YEAR: i64 = i64::MAX / 366;

    // Length of `month` (1-12) in the proleptic Gregorian calendar.
    fn days_in_month(year: i64, month: u32) -> u32 {
        match month {
            4 | 6 | 9 | 11 => 30,
            2 if year % 4 == 0 && (year % 100 != 0 || year % 400 == 0) => 29,
            2 => 28,
            _ => 31,
        }
    }

    // Bare integer ms.
    if let Ok(ms) = s.parse::<u128>() {
        return Some(ms);
    }

    // Date portion only: everything after the first whitespace is ignored.
    let date = s.split_whitespace().next()?;
    let mut parts = date.split('-');
    let (y, m, d) = (parts.next()?, parts.next()?, parts.next()?);
    if parts.next().is_some() {
        return None;
    }
    let y = y.parse::<i64>().ok()?;
    let m = m.parse::<u32>().ok()?;
    let d = d.parse::<u32>().ok()?;

    // Reject pre-epoch and absurdly large years before doing arithmetic, and
    // reject month/day values that do not name a real calendar day.
    if !(1970..=MAX_YEAR).contains(&y) || !(1..=12).contains(&m) {
        return None;
    }
    if d == 0 || d > days_in_month(y, m) {
        return None;
    }

    // Non-negative by construction; `try_from` guards the conversion anyway.
    let days = u128::try_from(days_from_civil(y, m, d)).ok()?;
    Some(days * MS_PER_DAY)
}

/// Days from the Unix epoch (1970-01-01) to the given proleptic-Gregorian
/// date, using H. Hinnant's days-from-civil algorithm. Negative for dates
/// before the epoch. Used by `parse_timestamp_to_ms`.
///
/// Expects `m` in `1..=12` and `d` to be a valid day of that month; the
/// arithmetic is signed throughout, so out-of-range days do not underflow.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    // Treat January and February as months 11 and 12 of the previous year so
    // the leap day falls at the end of the shifted year.
    let y = if m <= 2 { y - 1 } else { y };
    let era = y.div_euclid(400); // 400-year cycle index (floor division)
    let yoe = y.rem_euclid(400); // year of era, [0, 399]
    let mp = i64::from(if m > 2 { m - 3 } else { m + 9 }); // March = 0
    let doy = (153 * mp + 2) / 5 + i64::from(d) - 1; // day of shifted year
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // day of era
    era * 146_097 + doe - 719_468
}
12938
12939fn walk_plan_node(
12940    node: &crate::storage::query::planner::CanonicalLogicalNode,
12941    depth: usize,
12942    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
12943) {
12944    use std::sync::Arc;
12945    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
12946    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
12947    rec.set_arc(
12948        Arc::from("source"),
12949        node.source.clone().map(Value::text).unwrap_or(Value::Null),
12950    );
12951    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
12952    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
12953    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
12954    out.push(rec);
12955    for child in &node.children {
12956        walk_plan_node(child, depth + 1, out);
12957    }
12958}
12959
/// Tests for the result-cache scopes of graph table-valued functions and
/// for `abstract_degree_centrality`.
#[cfg(test)]
mod inline_graph_tvf_tests {
    use super::*;

    /// Parse `sql` and return the result-cache scopes derived from the
    /// parsed query; panics if the SQL does not parse.
    fn scopes_for(sql: &str) -> HashSet<String> {
        let expr = crate::storage::query::parser::parse(sql)
            .expect("parse")
            .query;
        query_expr_result_cache_scopes(&expr)
    }

    #[test]
    fn inline_tvf_cache_scopes_include_source_collections() {
        // The result-cache key for the inline form must derive from the
        // `nodes`/`edges` source collections so a write to either invalidates
        // the cached result (issue #799).
        let scopes = scopes_for(
            "SELECT * FROM components(nodes => (SELECT id FROM hosts), edges => (SELECT src, dst FROM links))",
        );
        assert!(scopes.contains("hosts"), "nodes source scoped: {scopes:?}");
        assert!(scopes.contains("links"), "edges source scoped: {scopes:?}");
    }

    #[test]
    fn graph_collection_tvf_cache_scope_is_graph_argument() {
        // The graph-collection form still materializes the active graph, but
        // result-cache invalidation is scoped to the named graph argument so
        // INSERT INTO g NODE/EDGE invalidates cached TVF rows.
        let scopes = scopes_for("SELECT * FROM components(g)");
        assert!(scopes.contains("g"), "collection form scoped: {scopes:?}");
    }

    #[test]
    fn abstract_degree_centrality_counts_undirected_endpoints() {
        // Path graph a - b - c: the middle node touches two edges, the
        // endpoints one each.
        let nodes = vec!["a".to_string(), "b".to_string(), "c".to_string()];
        let edges = vec![
            ("a".to_string(), "b".to_string(), 1.0_f32),
            ("b".to_string(), "c".to_string(), 1.0_f32),
        ];
        let degrees = abstract_degree_centrality(&nodes, &edges);
        assert_eq!(
            degrees,
            vec![
                ("a".to_string(), 1),
                ("b".to_string(), 2),
                ("c".to_string(), 1),
            ]
        );
    }
}